% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record for "Identifying the Root Causes of Wait States in
% Large-Scale Parallel Applications" (ACM Transactions on Parallel Computing,
% vol. 3, no. 2, 2016).
% - The citation key is left unchanged so existing citations keep resolving.
% - The title is stored in plain Title Case: it contains no proper nouns or
%   acronyms, so no brace protection is needed and the bibliography style is
%   free to apply its own casing.
% - "pages" holds a single value (11), not a range; presumably the article
%   number within the issue -- confirm against the DOI landing page.
% - reportid, cin, cid, pnm, pid and typ are not standard BibTeX fields
%   (presumably repository metadata from the exporting system); unknown
%   fields are ignored by standard styles.
@ARTICLE{Bhme:811713,
      author       = {Böhme, David and Geimer, Markus and Arnold, Lukas and
                      Voigtlaender, Felix and Wolf, Felix},
      title        = {Identifying the Root Causes of Wait States in
                      Large-Scale Parallel Applications},
      journal      = {ACM Transactions on Parallel Computing},
      volume       = {3},
      number       = {2},
      issn         = {2374-0353},
      address      = {New York, NY},
      publisher    = {Association for Computing Machinery},
      reportid     = {FZJ-2016-04097},
      pages        = {11},
      year         = {2016},
      abstract     = {Driven by growing application requirements and accelerated
                      by current trends in microprocessor design, the number of
                      processor cores on modern supercomputers is increasing from
                      generation to generation. However, load or communication
                      imbalance prevents many codes from taking advantage of the
                      available parallelism, as delays of single processes may
                      spread wait states across the entire machine. Moreover, when
                      employing complex point-to-point communication patterns,
                      wait states may propagate along far-reaching cause-effect
                      chains that are hard to track manually and that complicate
                      an assessment of the actual costs of an imbalance. Building
                      on earlier work by Meira Jr. et al., we present a scalable
                      approach that identifies program wait states and attributes
                      their costs in terms of resource waste to their original
                      cause. By replaying event traces in parallel both forward
                      and backward, we can identify the processes and call paths
                      responsible for the most severe imbalances even for runs
                      with hundreds of thousands of processes.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF3-511 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.1145/2934661},
      url          = {https://juser.fz-juelich.de/record/811713},
}