% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% PhD thesis by Daniel Becker (RWTH Aachen, 2010), published as volume 4 of
% the IAS Series of Forschungszentrum Jülich; record exported from JuSER.
% Title words are brace-protected as whole words so that sentence-casing
% styles keep their capitals. Non-standard fields (reportid, cin, cid, pnm,
% pid, typ) are repository metadata and are ignored by standard styles.
@PHDTHESIS{Becker:10841,
  author    = {Becker, Daniel},
  title     = {{Timestamp} {Synchronization} of {Concurrent} {Events}},
  volume    = {4},
  school    = {RWTH Aachen},
  type      = {Dr. (FH)},
  address   = {Jülich},
  publisher = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
  reportid  = {PreJuSER-10841},
  isbn      = {978-3-89336-625-5},
  series    = {Schriften des Forschungszentrums Jülich : IAS Series},
  pages     = {XVIII, 116 S.},
  year      = {2010},
  note      = {Record converted from VDB: 12.11.2012; RWTH Aachen, Diss.,
               2010},
  abstract  = {Supercomputing is a key technological pillar of modern
               science and engineering, indispensable for solving critical
               problems of high complexity. However, to effectively utilize
               the enormously complex large-scale computer systems
               available today, scientists and engineers need powerful and
               robust software development tools. One technique widely used
               by such tools is event tracing with a broad spectrum of
               applications ranging from performance analysis, performance
               prediction and modeling to debugging. In particular, event
               traces are helpful in understanding the performance behavior
               of parallel programs since they allow the in-depth analysis
               of communication and synchronization patterns. The accuracy
               of such analyses depends on the comparability of timestamps
               taken on different processors and may be adversely affected
               by non-synchronized clocks leading to inaccurate relative
               event timings. Such inaccuracies may cause a given interval
               to appear shorter or longer than it actually was, or
               introduce violations of the logical event order, which
               requires a message to be received only after it has been
               sent. Inconsistent trace data may not only lead to false
               conclusions, for instance, when the impact of communication
               patterns is quantified, but may also confuse the user of
               trace-visualization tools by causing message arrows to point
               backward in time-line views. Even more strikingly,
               trace-analysis tools may also cease to work in a
               satisfactory manner if they rely on the correct order to
               function properly. Although linear offset interpolation can
               restore the consistency of the trace data to some degree,
               time-dependent drifts and other inaccuracies may still
               disarrange the original sequence of events, as shown in a
               study conducted as a part of this Ph.D. thesis. The already
               familiar controlled logical clock algorithm accounts for
               such violations in point-to-point communication by shifting
               message events in time as much as needed while trying to
               preserve the length of local intervals. This algorithm is,
               however, not suitable for realistic applications because (i)
               it ignores collective and shared-memory operations and (ii)
               as a serial algorithm it offers only limited scalability.
               This thesis addresses these shortcomings by extending the
               algorithm to restore event semantics related to collective
               and shared-memory operations and by parallelizing the
               extended version to make it suitable for large-scale systems
               including computational grids. The basic idea behind the
               semantic extension is to consider collective and
               shared-memory operations as being composed of multiple
               point-to-point messages, taking the semantics of the
               different flavors of these operations into account. In order
               to accomplish the correction in a scalable way, both
               distributed memory and parallel processing capabilities are
               exploited by processing separate local trace files in
               parallel and replaying the original communication on as many
               CPUs as were used to execute the target application itself.
               To employ the replay mechanism in computational grids, this
               work also defines the necessary infrastructure to accurately
               measure clock offsets in distributed environments with
               hierarchical networks. The methodology was evaluated in
               practice by integrating the extended and parallelized
               algorithm into the Scalasca trace-analysis framework and
               applied to traces of realistic applications taken on single
               cluster systems and computational grids. The thesis shows
               that the algorithm eliminates inconsistent timings of
               concurrent events while only marginally changing the length
               of intervals between local events – even if wide-area
               communication is involved. Scalability is demonstrated with
               up to 4,096 application processes.},
  cin       = {JSC},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {Scientific Computing (FUEK411) / 411 - Computational
               Science and Mathematical Methods (POF2-411) / ATMLPP - ATML
               Parallel Performance (ATMLPP)},
  pid       = {G:(DE-Juel1)FUEK411 / G:(DE-HGF)POF2-411 /
               G:(DE-Juel-1)ATMLPP},
  typ       = {PUB:(DE-HGF)11 / PUB:(DE-HGF)3},
  url       = {https://juser.fz-juelich.de/record/10841},
}