% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record (IEEE Access, vol. 13, 2025).
% Besides the standard article fields, this entry carries extra fields
% (reportid, cin, ddc, cid, pnm, pid, typ, UT) that standard bibliography
% styles do not define and therefore ignore; they look like repository
% bookkeeping metadata -- TODO confirm before relying on them.
% In the title only {HPC} and {Arm} are brace-protected, so styles that apply
% sentence casing can lowercase the remaining words while keeping those intact.
@article{Ho:1042334,
      author       = {Ho, Nam and Falquez, Carlos and Portero, Antoni and Suarez,
                      Estela and Pleiter, Dirk},
      title        = {Memory Prefetching Evaluation of Scientific
                      Applications on a Modern {HPC} {Arm}-based
                      Processor},
      journal      = {{IEEE} Access},
      volume       = {13},
      issn         = {2169-3536},
      address      = {New York, NY},
      publisher    = {IEEE},
      reportid     = {FZJ-2025-02537},
      pages        = {85898--85926},
      year         = {2025},
      abstract     = {Memory prefetching is a well-known technique for mitigating
                      the negative impact of memory access latencies on memory
                      bandwidth. This problem has become more pressing as
                      improvements in memory bandwidth have not kept pace with
                      increases in computational power. While much existing work
                      has been devoted to finding appropriate prefetching
                      techniques for specific workloads, few provide insight into
                      the behavior of scientific applications to better understand
                      the impact of prefetchers. This paper investigates the
                      impact of hardware prefetchers on the latest Arm-based
                      high-end processor architectures. In this work, we
                      investigate memory access patterns by analyzing locality
                      properties and visualizing delta and repetitive address
                      patterns. A deeper understanding of memory access patterns
                      allows the use of the appropriate prefetcher and reaching a
                      better correlation between access pattern properties and
                      prefetcher performance. This can guide future co-design
                      efforts. We evaluated traditional and innovative prefetchers
                      using a gem5-based model of Arm Neoverse V1 cores. The model
                      features a 16-core architecture, using Amazon’s Graviton 3
                      processor as a hardware reference, but substituting DDR5 by
                      high bandwidth memory (HBM2). We performed a detailed
                      prefetching evaluation focusing on stencil, sparse
                      matrix-vector multiplication, and Breadth-First Search
                      kernels. These kernels represent a broad range of the
                      applications running on today’s High-Performance Computing
                      (HPC) systems, which are sensitive to memory performance.},
      cin          = {JSC},
      ddc          = {621.3},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5122 - Future Computing \& Big Data Systems (POF4-512) /
                      EPI SGA2 (16ME0507K) / EPI SGA1 - SGA1 (Specific Grant
                      Agreement 1) OF THE EUROPEAN PROCESSOR INITIATIVE (EPI)
                      (826647)},
      pid          = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K /
                      G:(EU-Grant)826647},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:001492121500023},
      doi          = {10.1109/ACCESS.2025.3569533},
      url          = {https://juser.fz-juelich.de/record/1042334},
}