% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article: Hermanns, Krishnamoorthy and Wolf (2013),
% Parallel Computing 39(3), pp. 132--145.
% The fields reportid, cin, ddc, cid, pnm, pid, typ and UT are not standard
% BibTeX fields; they look like repository bookkeeping data (TODO confirm) and
% are silently ignored by standard bibliography styles.
% The author name contains a UTF-8 character; process with bibtex8 or biber.
@article{Hermanns:128163,
  author    = {Hermanns, Marc-André and Krishnamoorthy, Sriram and Wolf, Felix},
  title     = {A scalable infrastructure for the performance analysis of
               passive target synchronization},
  journal   = {Parallel Computing},
  volume    = {39},
  number    = {3},
  pages     = {132--145},
  year      = {2013},
  publisher = {North-Holland, Elsevier Science},
  address   = {Amsterdam},
  abstract  = {Partitioned global address space (PGAS) languages combine
               the convenient abstraction of shared memory with the notion
               of affinity, extending multi-threaded programming to
               large-scale systems with physically distributed memory.
               However, in spite of their obvious advantages, PGAS
               languages still lack appropriate tool support for
               performance analysis, one of the reasons why their adoption
               is still in its infancy. Some of the performance problems
               for which tool support is needed occur at the level of the
               underlying one-sided communication substrate, such as the
               Aggregate Remote Memory Copy Interface (ARMCI). One such
               example is the waiting time in situations where asynchronous
               data transfers cannot be completed without software
               intervention at the target side. This is not uncommon on
               systems with reduced operating-system kernels such as IBM
               Blue Gene/P where the use of progress threads would double
               the number of cores necessary to run an application. In this
               paper, we present an extension of the Scalasca
               trace-analysis infrastructure aimed at the identification
               and quantification of progress-related waiting times at
               larger scales. We demonstrate its utility and scalability
               using a benchmark running with up to 32,768 processes.},
  reportid  = {FZJ-2012-01058},
  cin       = {JSC},
  ddc       = {004},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {411 - Computational Science and Mathematical Methods
               (POF2-411) / ATMLPP - ATML Parallel Performance (ATMLPP)},
  pid       = {G:(DE-HGF)POF2-411 / G:(DE-Juel-1)ATMLPP},
  typ       = {PUB:(DE-HGF)16},
  UT        = {WOS:000317371900004},
  doi       = {10.1016/j.parco.2012.09.002},
  url       = {https://juser.fz-juelich.de/record/128163},
}