% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article entry. Fields that standard styles do not recognise
% (reportid, cin, ddc, cid, pnm, pid, typ) are kept verbatim; unknown fields
% are ignored when the bibliography is formatted.
@article{Falquez:1049545,
  author    = {Falquez, Carlos and Long, Shiting and Ho, Nam and Suarez,
               Estela and Pleiter, Dirk},
  title     = {Processor simulation as a tool for performance
               engineering},
  journal   = {Frontiers in High Performance Computing},
  volume    = {3},
  issn      = {2813-7337},
  address   = {Beijing},
  publisher = {Frontiers Media SA},
  reportid  = {FZJ-2025-05349},
  pages     = {1669101},
  year      = {2025},
  abstract  = {The diversity of processor architectures used for
               High-Performance Computing (HPC) applications has increased
               significantly over the last few years. This trend is
               expected to continue for different reasons, including the
               emergence of various instruction set extensions. Examples
               are the renewed interest in vector instructions like Arm's
               Scalable Vector Extension (SVE) or RISC-V's RVV. For
               application developers, research software developers, and
               performance engineers, the increased diversity and
               complexity of architectures have led to the following
               challenges: Limited access to these different processor
               architectures and more difficult root cause analysis in case
               of performance issues. To address these challenges, we
               propose leveraging the much-improved capabilities of
               processor simulators such as gem5. We enhanced this
               simulator with a performance analysis framework. We extend
               available performance counters and introduce new analysis
               capabilities to track the temporal behaviour of running
               applications. An algorithm has been implemented to link
               these statistics to specific regions. The resulting
               performance profiles allow for the identification of code
               regions with the potential for optimization. The focus is on
               observables to monitor quantities that are usually not
               directly accessible on real hardware. Different algorithms
               have been implemented to identify potential performance
               bottlenecks. The framework is evaluated for different types
               of HPC applications like the molecular-dynamics application
               GROMACS, Ligra, which implements the breadth-first search
               (BFS) algorithm, and a kernel from the Lattice QCD solver
               DD-αAMG.},
  cin       = {JSC},
  ddc       = {004},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {5122 - Future Computing \& Big Data Systems (POF4-512) /
               EPI SGA2 (16ME0507K) / The European PILOT - Pilot using
               Independent Local \& Open Technologies (101034126) /
               AQTIVATE - Advanced computing, quantum algorithms, and
               data-driven approaches for science, technology and
               engineering (101072344)},
  pid       = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K /
               G:(EU-Grant)101034126 / G:(EU-Grant)101072344},
  typ       = {PUB:(DE-HGF)16},
  doi       = {10.3389/fhpcp.2025.1669101},
  url       = {https://juser.fz-juelich.de/record/1049545},
}