% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% PhD thesis record. The trailing fields (reportid, cin, cid, pnm, pid, typ,
% urn) are non-standard -- presumably repository export metadata -- and are
% ignored by bibliography styles that do not know them.
% pagetotal holds the page count (front matter, body); the style adds the unit.
@phdthesis{Hermanns:844062,
  author    = {Hermanns, Marc-André},
  title     = {Understanding the formation of wait states in one-sided
               communication},
  volume    = {35},
  school    = {RWTH Aachen},
  type      = {Dr.},
  address   = {Jülich},
  publisher = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
  reportid  = {FZJ-2018-01571},
  isbn      = {978-3-95806-297-9},
  series    = {Schriften des Forschungszentrums Jülich. Reihe IAS},
  pagetotal = {xiv, 144},
  year      = {2018},
  note      = {RWTH Aachen, Diss., 2017},
  abstract  = {Due to the available concurrency in modern-day
               supercomputers, the complexity of developing efficient
               parallel applications for these platforms has grown rapidly
               in the last years. Many applications use message passing for
               parallelization, offering three main communication
               paradigms: point-to-point, collective and one-sided
               communication. Each paradigm fits certain domains of
               algorithms and communication patterns best. The one-sided
               paradigm decouples communication and synchronization and
               allows a single process to define a complete communication.
               These are important features for runtime systems of new
               programming paradigms and state-of-the-art dynamic
               load-balancing strategies. In any process interaction, wait
               states can occur, where a process is waiting for another -
               idling - before it proceeds with its local computation. To
               eliminate such wait states, runtime and application
               developers alike need support in detecting and quantifying
               them and their root causes. However, tool support for
               identifying complex wait states in one-sided communication
               is scarce. This thesis contributes novel methods for the
               scalable detection and quantification of wait states in
               one-sided communication, the automatic identification of
               their root causes, and the assessment of optimization
               potential. The methods for wait-state detection and
               quantification, as introduced by Böhme et al. and extended
               by this thesis, build upon a parallel post-mortem traversal
               of process-local event traces, modeling an application's
               runtime behavior. Performance-relevant data is exchanged
               just in time on the recorded communication paths. Through
               the nature of one-sided communication, information on such
               communication paths is not available on all processes
               involved, impeding the use of this original approach for
               one-sided communication. The use of a novel high-level
               messaging framework enables the exchange of messages on the
               implicit communication paths of one-sided communication,
               while retaining the scalability of the original approach.
               This enables the identification of previously unstudied
               types of wait states unique to one-sided communication: lack
               of remote progress and resource contention. Beyond simple
               accounting of waiting time, other contributed methods allow
               pinpointing root causes of such wait states and identifying
               optimization potential in one-sided applications.
               Furthermore, they distinguish two fundamentally different
               classes of wait-state root causes: delays for direct process
               synchronization (similar to point-to-point and collective
               communication) and contention in case of lock-based process
               synchronization, whose resolution strategies are
               diametrically opposed to each other. Finally, the
               contributed methods enable the identification of the longest
               wait-state-free execution path (i.e., critical path) in
               parallel applications using one-sided communication. As only
               optimization of functions on the critical path will yield
               performance improvements, its identification is key to
               choosing promising optimization targets. All of these
               methods are integrated into the Scalasca performance
               toolset. Their scalability and effectiveness are
               demonstrated by evaluating a variety of applications using
               one-sided communication interfaces running in configurations
               with up to 65,536 processes.},
  cin       = {JSC},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {511 - Computational Science and Mathematical Methods
               (POF3-511) / ATMLPP - ATML Parallel Performance (ATMLPP)},
  pid       = {G:(DE-HGF)POF3-511 / G:(DE-Juel-1)ATMLPP},
  typ       = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
  urn       = {urn:nbn:de:0001-2018012504},
  url       = {https://juser.fz-juelich.de/record/844062},
}