% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article: Geimer, Wolf, Wylie, Mohr (2009), Parallel Computing 35,
% pp. 375--388. Citation key is the repository record id (see `url`).
% The fields reportid, cin, ddc, cid, pnm, pid, shelfmark, typ and UT are not
% standard BibTeX fields; standard styles ignore unknown fields, so their
% values are kept verbatim from the export (including the $...$ escaping).
% NOTE(review): `note` carries a funding acknowledgement, which most styles
% print in the bibliography -- confirm that this is intended.
@article{Geimer:6609,
  author    = {Geimer, M. and Wolf, F. and Wylie, B. and Mohr, B.},
  title     = {{A} scalable tool architecture for diagnosing wait states
               in massively parallel applications},
  journal   = {Parallel Computing},
  volume    = {35},
  pages     = {375--388},
  year      = {2009},
  issn      = {0167-8191},
  address   = {Amsterdam [u.a.]},
  publisher = {North-Holland, Elsevier Science},
  doi       = {10.1016/j.parco.2009.02.003},
  url       = {https://juser.fz-juelich.de/record/6609},
  note      = {This work was supported by the Helmholtz Association under
               Grants No. VH-NG-118 and No. VH-VI-228. Also, we would like
               to thank Marek Behr, Mike Nicolai, and Markus Probst from
               the Chair for Computational Analysis of Technical Systems at
               RWTH Aachen University for giving us access to their code.},
  abstract  = {When scaling message-passing applications to thousands of
               processors, their performance is often affected by wait
               states that occur when processes fail to reach
               synchronization points simultaneously. As a first step in
               reducing the performance impact, we have shown in our
               earlier work that wait states can be diagnosed by searching
               event traces for characteristic patterns. However, our
               initial sequential search method did not scale beyond
               several hundred processes. Here, we present a scalable
               approach, based on a parallel replay of the target
               application's communication behavior, that can efficiently
               identify wait states at the previously inaccessible scale of
               65,536 processes and that has potential for even larger
               configurations. We explain how our new approach has been
               integrated into a comprehensive parallel tool architecture,
               which we use to demonstrate that wait states may consume a
               major fraction of the execution time at larger scales. (C)
               2009 Elsevier B.V. All rights reserved.},
  keywords  = {J (WoSType)},
  reportid  = {PreJuSER-6609},
  cin       = {JSC / JARA-HPC},
  ddc       = {004},
  cid       = {I:(DE-Juel1)JSC-20090406 / $I:(DE-82)080012_20140620$},
  pnm       = {Scientific Computing / ATMLPP - ATML Parallel Performance
               (ATMLPP)},
  pid       = {G:(DE-Juel1)FUEK411 / G:(DE-Juel-1)ATMLPP},
  shelfmark = {Computer Science, Theory $\&$ Methods},
  typ       = {PUB:(DE-HGF)16},
  UT        = {WOS:000268438000001},
}