% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Hermanns:128163,
      author       = {Hermanns, Marc-André and Krishnamoorthy, Sriram and Wolf,
                      Felix},
      title        = {{A} scalable infrastructure for the performance analysis of
                      passive target synchronization},
      journal      = {Parallel Computing},
      volume       = {39},
      number       = {3},
      address      = {Amsterdam [et al.]},
      publisher    = {North-Holland, Elsevier Science},
      reportid     = {FZJ-2012-01058},
      pages        = {132-145},
      year         = {2013},
      abstract     = {Partitioned global address space (PGAS) languages combine
                      the convenient abstraction of shared memory with the notion
                      of affinity, extending multi-threaded programming to
                      large-scale systems with physically distributed memory.
                      However, in spite of their obvious advantages, PGAS
                      languages still lack appropriate tool support for
                      performance analysis, one of the reasons why their adoption
                      is still in its infancy. Some of the performance problems
                      for which tool support is needed occur at the level of the
                      underlying one-sided communication substrate, such as the
                      Aggregate Remote Memory Copy Interface (ARMCI). One such
                      example is the waiting time in situations where asynchronous
                      data transfers cannot be completed without software
                      intervention at the target side. This is not uncommon on
                      systems with reduced operating-system kernels such as IBM
                      Blue Gene/P where the use of progress threads would double
                      the number of cores necessary to run an application. In this
                      paper, we present an extension of the Scalasca
                      trace-analysis infrastructure aimed at the identification
                      and quantification of progress-related waiting times at
                      larger scales. We demonstrate its utility and scalability
                      using a benchmark running with up to 32,768 processes.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {411 - Computational Science and Mathematical Methods
                      (POF2-411) / ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF2-411 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000317371900004},
      doi          = {10.1016/j.parco.2012.09.002},
      url          = {https://juser.fz-juelich.de/record/128163},
}