% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Doctoral thesis record (repository record 151145, see the url field).
% - The citation key is kept exactly as exported (umlaut dropped from the
%   surname) so that existing citations of this key keep resolving.
% - The title is stored in plain Title Case with no brace protection: it has
%   no proper nouns or acronyms, so the bibliography style decides the casing.
% - reportid, cin, cid, pnm, pid, typ and urn are repository-specific fields;
%   standard styles ignore field names they do not know.
@phdthesis{Bhme:151145,
  author    = {Böhme, David},
  title     = {Characterizing Load and Communication Imbalance in
               Parallel Applications},
  volume    = {23},
  school    = {RWTH Aachen},
  type      = {Dr.},
  address   = {Jülich},
  publisher = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
  reportid  = {FZJ-2014-01145},
  isbn      = {978-3-89336-940-9},
  series    = {Schriften des Forschungszentrums Jülich. IAS Series},
  pages     = {xv, 111 S.},
  year      = {2014},
  note      = {RWTH Aachen, Diss., 2013},
  abstract  = {The amount of parallelism in modern supercomputers
               currently grows from generation to generation. Further
               application performance improvements therefore depend on
               software-managed parallelism: the software must organize
               data exchange between processing elements efficiently and
               optimally distribute the workload between them. Performance
               analysis tools help developers of parallel applications to
               evaluate and optimize the parallel efficiency of their
               programs. This dissertation presents two novel methods to
               automatically detect imbalance-related performance problems
               in MPI programs and intuitively guide the performance
               analyst to inefficiencies whose optimization promise the
               highest benefit. The first method, the delay analysis,
               identifies the root causes of wait states. A delay occurs
               when a program activity needs more time on one process than
               on another, which leads to the formation of wait states at a
               subsequent synchronization point. Wait states are the
               primary symptom of load imbalance in parallel programs.
               While wait states themselves are easy to detect, the
               potentially large temporal and spatial distance between wait
               states and the delays causing them complicates the
               identification of wait-state root causes. The delay analysis
               closes this gap, accounting for both short-term and
               long-term effects. The second method is based on the
               detection of the critical path, which determines the effect
               of imbalance on program runtime. The critical path is the
               longest execution path in a parallel program without wait
               states: optimizing an activity on the critical path will
               reduce the program’s runtime. Comparing the duration of
               activities on the critical path with their duration on each
               process yields a set of novel, compact performance
               indicators. These indicators allow users to evaluate load
               balance, identify performance bottlenecks, and determine the
               performance impact of load imbalance at first glance by
               providing an intuitive understanding of complex performance
               phenomena. Both analysis methods leverage the scalable
               event-trace analysis technique employed by the Scalasca
               toolset: by replaying event traces in parallel, the
               bottleneck search algorithms can harness the distributed
               memory and computational resources of the target system for
               the analysis, allowing them to process even large-scale
               program runs. The scalability and performance insight that
               the novel analysis approaches provide are demonstrated by
               evaluating a variety of real-world HPC codes in
               configurations with up to 262,144 processor cores.},
  keywords  = {Dissertation (GND)},
  cin       = {JSC},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {411 - Computational Science and Mathematical Methods
               (POF2-411) / ATMLPP - ATML Parallel Performance (ATMLPP)},
  pid       = {G:(DE-HGF)POF2-411 / G:(DE-Juel-1)ATMLPP},
  typ       = {PUB:(DE-HGF)11},
  urn       = {urn:nbn:de:0001-2014012708},
  url       = {https://juser.fz-juelich.de/record/151145},
}