% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article: Geimer, Wolf, Wylie, Mohr -- Parallel Computing 35 (2009).
% The standard bibliographic fields are author, title, journal, volume, pages,
% year, publisher, address, issn, doi, url and note. The remaining fields
% (reportid, cin, ddc, cid, pnm, pid, shelfmark, typ, UT, keywords, abstract)
% are not part of the standard BibTeX data model and are ignored by the
% standard styles; they are kept exactly as exported.
% NOTE(review): "note" holds a funding acknowledgement, which most styles print
% verbatim in the reference list -- confirm that this is intended.
@article{Geimer:6609,
      author       = {Geimer, M. and Wolf, F. and Wylie, B. and Mohr, B.},
      title        = {A scalable tool architecture for diagnosing wait states
                      in massively parallel applications},
      journal      = {Parallel Computing},
      volume       = {35},
      issn         = {0167-8191},
      address      = {Amsterdam [u.a.]},
      publisher    = {North-Holland, Elsevier Science},
      reportid     = {PreJuSER-6609},
      pages        = {375--388},
      year         = {2009},
      note         = {This work was supported by the Helmholtz Association under
                      Grants No. VH-NG-118 and No. VH-VI-228. Also, we would like
                      to thank Marek Behr, Mike Nicolai, and Markus Probst from
                      the Chair for Computational Analysis of Technical Systems at
                      RWTH Aachen University for giving us access to their code.},
      abstract     = {When scaling message-passing applications to thousands of
                      processors, their performance is often affected by wait
                      states that occur when processes fail to reach
                      synchronization points simultaneously. As a first step in
                      reducing the performance impact, we have shown in our
                      earlier work that wait states can be diagnosed by searching
                      event traces for characteristic patterns. However, our
                      initial sequential search method did not scale beyond
                      several hundred processes. Here, we present a scalable
                      approach, based on a parallel replay of the target
                      application's communication behavior, that can efficiently
                      identify wait states at the previously inaccessible scale of
                      65,536 processes and that has potential for even larger
                      configurations. We explain how our new approach has been
                      integrated into a comprehensive parallel tool architecture,
                      which we use to demonstrate that wait states may consume a
                      major fraction of the execution time at larger scales. (C)
                      2009 Elsevier B.V. All rights reserved.},
      keywords     = {J (WoSType)},
      cin          = {JSC / JARA-HPC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406 / $I:(DE-82)080012_20140620$},
      pnm          = {Scientific Computing / ATMLPP - ATML Parallel Performance
                      (ATMLPP)},
      pid          = {G:(DE-Juel1)FUEK411 / G:(DE-Juel-1)ATMLPP},
      shelfmark    = {Computer Science, Theory $\&$ Methods},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000268438000001},
      doi          = {10.1016/j.parco.2009.02.003},
      url          = {https://juser.fz-juelich.de/record/6609},
}