% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use a Unicode-aware BibTeX implementation such as
% “bibtexu” or “biber” (note that “bibtex8” only supports single-byte 8-bit
% encodings such as Latin-1, not UTF-8).
@article{Wylie:1030735,
  author        = {Wylie, Brian J. N. and Feld, Christian and Geimer, Markus
                   and Llort, Germán and Mendez, Sandra and Mercadal,
                   Estanislao and Visser, Anke and García-Gasulla, Marta},
  collaboration = {Giménez, Judit},
  title         = {15+ years of joint parallel application performance
                   analysis/tools training with {Scalasca}/{Score-P} and
                   {Paraver}/{Extrae} toolsets},
  journal       = {Future Generation Computer Systems},
  volume        = {162},
  issn          = {0167-739X},
  address       = {Amsterdam},
  publisher     = {Elsevier Science},
  reportid      = {FZJ-2024-05442},
  pages         = {107472},
  year          = {2025},
  note          = {Keywords: Hybrid parallel programming; MPI message-passing;
                   OpenMP multithreading; OpenACC device offload acceleration;
                   HPC application execution performance measurement \&
                   analysis; Performance assessment \& optimisation
                   methodology \& tools; Hands-on training \& coaching},
  abstract      = {The diverse landscape of distributed heterogeneous computer
                   systems currently available and being created to address
                   computational challenges with the highest performance
                   requirements presents daunting complexity for application
                   developers. They must effectively decompose and distribute
                   their application functionality and data, efficiently
                   orchestrating the associated communication and
                   synchronisation, on multi/manycore CPU processors with
                   multiple attached acceleration devices structured within
                   compute nodes with interconnection networks of various
                   topologies. Sophisticated compilers, runtime systems and
                   libraries are (loosely) matched with debugging, performance
                   measurement and analysis tools, with proprietary versions by
                   integrators/vendors provided exclusively for their systems
                   complemented by portable (primarily) open-source equivalents
                   developed and supported by the international research
                   community over many years. The Scalasca and Paraver toolsets
                   are two widely employed examples of the latter, installed on
                   personal notebook computers through to the largest
                   leadership HPC systems. Over more than fifteen years their
                   developers have worked closely together in numerous
                   collaborative projects culminating in the creation of a
                   universal parallel performance assessment and optimisation
                   methodology focused on application execution efficiency and
                   scalability, and the associated training and coaching of
                   application developers (often in teams) in its productive
                   use, reviewed in this article with lessons learnt
                   therefrom.},
  cin           = {JSC},
  ddc           = {004},
  cid           = {I:(DE-Juel1)JSC-20090406},
  pnm           = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                   and Research Groups (POF4-511) / JLESC - Joint Laboratory
                   for Extreme Scale Computing (JLESC-20150708) / POP -
                   Performance Optimisation and Productivity (676553) / POP2 -
                   Performance Optimisation and Productivity 2 (824080) / POP3
                   - Performance Optimisation and Productivity 3 (101143931) /
                   ATMLPP - ATML Parallel Performance (ATMLPP)},
  pid           = {G:(DE-HGF)POF4-5112 / G:(DE-Juel1)JLESC-20150708 /
                   G:(EU-Grant)676553 / G:(EU-Grant)824080 /
                   G:(EU-Grant)101143931 / G:(DE-Juel-1)ATMLPP},
  typ           = {PUB:(DE-HGF)16},
  UT            = {WOS:001294686400001},
  doi           = {10.1016/j.future.2024.07.050},
  url           = {https://juser.fz-juelich.de/record/1030735},
}