% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
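%
% For reference, a minimal LaTeX preamble that loads this file via biblatex with
% the biber backend (which handles UTF-8 natively) might look as follows; the
% file name "references.bib" is only a placeholder for however this file is saved:
%
%   \documentclass{article}
%   \usepackage[utf8]{inputenc}   % default on current LaTeX kernels, harmless otherwise
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   Scalasca/Score-P and Paraver/Extrae training is reviewed in \cite{Wylie:1030735}.
%   \printbibliography
%   \end{document}
%
% Typical compile sequence: pdflatex, then biber, then pdflatex (twice).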

@ARTICLE{Wylie:1030735,
      author       = {Wylie, Brian J. N. and Feld, Christian and Geimer, Markus
                      and Llort, Germán and Mendez, Sandra and Mercadal,
                      Estanislao and Visser, Anke and García-Gasulla, Marta},
      collaboration = {Giménez, Judit},
      title        = {15+ years of joint parallel application performance
                      analysis/tools training with {S}calasca/{S}core-{P} and
                      {P}araver/{E}xtrae toolsets},
      journal      = {Future Generation Computer Systems},
      volume       = {162},
      issn         = {0167-739X},
      address      = {Amsterdam [et al.]},
      publisher    = {Elsevier Science},
      reportid     = {FZJ-2024-05442},
      pages        = {107472},
      year         = {2025},
      note         = {Keywords: Hybrid parallel programming; MPI message-passing;
                      OpenMP multithreading; OpenACC device offload acceleration;
                      HPC application execution performance measurement \&
                      analysis; Performance assessment \& optimisation
                      methodology \& tools; Hands-on training \& coaching},
      abstract     = {The diverse landscape of distributed heterogeneous computer
                      systems currently available and being created to address
                      computational challenges with the highest performance
                      requirements presents daunting complexity for application
                      developers. They must effectively decompose and distribute
                      their application functionality and data, efficiently
                      orchestrating the associated communication and
                      synchronisation, on multi/manycore CPU processors with
                      multiple attached acceleration devices structured within
                      compute nodes with interconnection networks of various
                      topologies. Sophisticated compilers, runtime systems and
                      libraries are (loosely) matched with debugging, performance
                      measurement and analysis tools, with proprietary versions by
                      integrators/vendors provided exclusively for their systems
                      complemented by portable (primarily) open-source equivalents
                      developed and supported by the international research
                      community over many years. The Scalasca and Paraver toolsets
                      are two widely employed examples of the latter, installed on
                      personal notebook computers through to the largest
                      leadership HPC systems. Over more than fifteen years their
                      developers have worked closely together in numerous
                      collaborative projects culminating in the creation of a
                      universal parallel performance assessment and optimisation
                      methodology focused on application execution efficiency and
                      scalability, and the associated training and coaching of
                      application developers (often in teams) in its productive
                      use, reviewed in this article with lessons learnt
                      therefrom.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / JLESC - Joint Laboratory
                      for Extreme Scale Computing (JLESC-20150708) / POP -
                      Performance Optimisation and Productivity (676553) / POP2 -
                      Performance Optimisation and Productivity 2 (824080) / POP3
                      - Performance Optimisation and Productivity 3 (101143931) /
                      ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF4-5112 / G:(DE-Juel1)JLESC-20150708 /
                      G:(EU-Grant)676553 / G:(EU-Grant)824080 /
                      G:(EU-Grant)101143931 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:001294686400001},
      doi          = {10.1016/j.future.2024.07.050},
      url          = {https://juser.fz-juelich.de/record/1030735},
}