% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Doctoral thesis (school: RWTH Aachen, year 2012), issued as volume 12 of the
% series named in the series field; the url field points at the source record.
% reportid, cin, ddc, cid, pnm, pid, typ and urn are repository metadata that
% bibliography styles without matching field support silently ignore.
% NOTE(review): the exported record carried type "Dr. (FH)", which overrode the
% style's thesis label and looked like an export artifact (the note field says
% "RWTH Aachen, Diss., 2012"). It was dropped so the style supplies its default
% label -- confirm against the repository record.
% NOTE(review): the exported page field "XXI, 192 S." is a total page count,
% so it is stored in pagetotal without the hard-coded "S." unit.
@phdthesis{Szebenyi:21706,
  author    = {Szebenyi, Zoltán Péter},
  title     = {Capturing Parallel Performance Dynamics},
  volume    = {12},
  school    = {RWTH Aachen},
  address   = {Jülich},
  publisher = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
  reportid  = {PreJuSER-21706},
  isbn      = {978-3-89336-798-6},
  series    = {Schriften des Forschungszentrums Jülich. IAS Series},
  pagetotal = {XXI, 192},
  year      = {2012},
  note      = {Record converted from JUWEL: 18.07.2013; RWTH Aachen,
               Diss., 2012},
  abstract  = {Supercomputers play a key role in countless areas of
               science and engineering, enabling the development of new
               insights and technological advances never possible before.
               The strategic importance and ever-growing complexity of the
               efficient usage of supercomputing resources makes
               application performance analysis invaluable for the
               development of parallel codes. Runtime call-path profiling
               is a conventional, well-known method used for collecting
               summary statistics of an execution such as the time spent in
               different call paths of the code. However, these kinds of
               measurements only give the user a summary overview of the
               entire execution, without regard to changes in performance
               behavior over time. The possible causes of temporal changes
               are quite numerous, ranging from adaptive workload balancing
               through periodically executed extra work or distinct
               computational phases to system noise. As present day
               scientific applications tend to be run for extended periods
               of time, understanding the patterns and trends in the
               performance data along the time axis becomes crucial. A
               straightforward approach is profiling every iteration of the
               main loop separately. As shown by our analysis of a
               representative set of scientific codes, such measurements
               provide a wealth of new data that often leads to invaluable
               new insights. However, the introduction of the time
               dimension makes the amount of data collected proportional to
               the number of iterations, and memory usage and file sizes
               grow considerably. To counter this problem, a low-overhead
               online compression algorithm was developed that requires
               only a fraction of the memory and file sizes needed for an
               uncompressed measurement. By exploiting similarities between
               different iterations, the lossy compression algorithm allows
               all the relevant temporal patterns of the performance
               behavior to be reconstructed. While standard, direct
               instrumentation, which is assumed by the initial version of
               the compression algorithm, results in fairly low overhead
               with many scientific codes, in some cases the high frequency
               of events (e.g., tiny C++ member function calls) makes such
               measurements impractical. To overcome this problem, a
               sampling-based methodology could be used instead, where the
               amount of measurement overhead becomes a function of the
               sampling frequency, independent of the function-call
               frequency. However, sampling alone is insufficient for our
               purposes, as it does not provide access to the communication
               metrics the compression algorithm heavily depends on.
               Therefore, a hybrid solution was developed that seamlessly
               integrates both types of measurement techniques in a single
               unified measurement, using direct instrumentation for
               message passing constructs, while sampling the rest of the
               code. Finally, the compression algorithm was adapted to the
               hybrid profiling approach, avoiding the overhead of pure
               direct instrumentation. Evaluation of the above
               methodologies shows that our semantics-based compression
               algorithm provides a very good approximation of the original
               data with very little measurement dilation, while the hybrid
               combination of sampling and direct instrumentation fulfills
               its purpose by showing the expected reduction of measurement
               dilation in cases unsuitable for direct instrumentation.
               Beyond testing with standardized benchmark suites, the
               usefulness of these techniques was demonstrated by their key
               role in gaining important new insights into the performance
               characteristics of real-world applications.},
  cin       = {JSC},
  ddc       = {500},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {Scientific Computing (FUEK411) / 411 - Computational
               Science and Mathematical Methods (POF2-411) / ATMLPP - ATML
               Parallel Performance (ATMLPP)},
  pid       = {G:(DE-Juel1)FUEK411 / G:(DE-HGF)POF2-411 /
               G:(DE-Juel-1)ATMLPP},
  typ       = {PUB:(DE-HGF)11 / PUB:(DE-HGF)3},
  urn       = {urn:nbn:de:0001-2012062204},
  url       = {https://juser.fz-juelich.de/record/21706},
}