% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article: Falquez et al., 2025, Frontiers in High Performance Computing.
% The url field points at the institutional repository record for this paper.
% The fields reportid, cin, ddc, cid, pnm, pid and typ are not standard BibTeX
% fields; they look like repository bookkeeping metadata and are ignored by
% standard bibliography styles.
% NOTE(review): the address value may not match the publisher's actual
% location; confirm before relying on it in a style that prints it.
@article{Falquez:1049545,
      author       = {Falquez, Carlos and Long, Shiting and Ho, Nam and Suarez,
                      Estela and Pleiter, Dirk},
      title        = {Processor simulation as a tool for performance
                      engineering},
      journal      = {Frontiers in High Performance Computing},
      volume       = {3},
      issn         = {2813-7337},
      address      = {Beijing},
      publisher    = {Frontiers Media SA},
      reportid     = {FZJ-2025-05349},
      pages        = {1669101},
      year         = {2025},
      abstract     = {The diversity of processor architectures used for
                      High-Performance Computing (HPC) applications has increased
                      significantly over the last few years. This trend is
                      expected to continue for different reasons, including the
                      emergence of various instruction set extensions. Examples
                      are the renewed interest in vector instructions like Arm's
                      Scalable Vector Extension (SVE) or RISC-V's RVV. For
                      application developers, research software developers, and
                      performance engineers, the increased diversity and
                      complexity of architectures have led to the following
                      challenges: Limited access to these different processor
                      architectures and more difficult root cause analysis in case
                      of performance issues. To address these challenges, we
                      propose leveraging the much-improved capabilities of
                      processor simulators such as gem5. We enhanced this
                      simulator with a performance analysis framework. We extend
                      available performance counters and introduce new analysis
                      capabilities to track the temporal behaviour of running
                      applications. An algorithm has been implemented to link
                      these statistics to specific regions. The resulting
                      performance profiles allow for the identification of code
                      regions with the potential for optimization. The focus is on
                      observables to monitor quantities that are usually not
                      directly accessible on real hardware. Different algorithms
                      have been implemented to identify potential performance
                      bottlenecks. The framework is evaluated for different types
                      of HPC applications like the molecular-dynamics application
                      GROMACS, Ligra, which implements the breadth-first search
                      (BFS) algorithm, and a kernel from the Lattice QCD solver
                      DD-αAMG.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5122 - Future Computing \& Big Data Systems (POF4-512) /
                      EPI SGA2 (16ME0507K) / The European PILOT - Pilot using
                      Independent Local \& Open Technologies (101034126) /
                      AQTIVATE - Advanced computing, quantum algorithms, and
                      data-driven approaches for science, technology and
                      engineering (101072344)},
      pid          = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K /
                      G:(EU-Grant)101034126 / G:(EU-Grant)101072344},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.3389/fhpcp.2025.1669101},
      url          = {https://juser.fz-juelich.de/record/1049545},
}