% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper (symposium contribution), hence INPROCEEDINGS + booktitle.
% The title is stored in Title Case without forced-case braces so that the
% bibliography style decides the final casing.
% reportid, cin, cid, pnm, pid and typ are repository bookkeeping fields;
% standard bibliography styles do not know them and skip them.
@INPROCEEDINGS{Saglam:1031162,
      author       = {Saglam, Berk and Ho, Nam and Falquez, Carlos and Portero,
                      Antonio and Schätzle, Fabian and Suarez, Estela and
                      Pleiter, Dirk},
      title        = {Data Prefetching on Processors with Heterogeneous Memory},
      booktitle    = {10th International Symposium on Memory Systems
                      ({MEMSYS24})},
      reportid     = {FZJ-2024-05566},
      year         = {2024},
      abstract     = {Heterogeneous memory architectures, such as a mix of High
                      Bandwidth Memory (HBM) and Double Data Rate (DDR), offer
                      flexible performance optimization by leveraging the high
                      bandwidth of HBM along with the high capacity of DDR.
                      However, these architectures present challenges in balancing
                      bandwidth and capacity to maximize overall system
                      performance and complicate hardware design. In a flat memory
                      organization mixing HBM and DDR, prefetchers must carefully
                      reduce prefetch requests on DDR when transitioning from HBM
                      to avoid performance degradation due to potential bandwidth
                      saturation. Traditional hardware prefetchers, which
                      typically assume a homogeneous memory, are unaware of this
                      circumstance, so they may not be effective in heterogeneous
                      memory architectures. The paper enhances the aggressiveness
                      of prefetchers in this kind of architecture. Our technique
                      enables a prefetcher to dynamically determine the optimal
                      prefetch degree and distance based on memory type. It
                      balances prefetch aggressiveness and timeliness through an
                      adaptive strategy informed by bandwidth utilization and
                      prefetch metrics learned for each memory type. We evaluated
                      the technique within the Stride and Stream Prefetchers at L2
                      in a gem5 model of a 20-core Arm Neoverse V1-like
                      architecture, a mix of HBM2 and DDR5. The simulation
                      results, focusing on scientific benchmarks, showed that the
                      technique effectively guides prefetchers to near-optimal
                      static configurations. On HBM2, the adaptation strategy
                      detects bandwidth availability and prefetches more
                      aggressively to boost performance, achieving speedups of
                      $1.3\times$ to $2.3\times$. On DDR5, when faced with
                      saturated bandwidth contention, the adaptation strategy
                      switches to conservative prefetching mode to mitigate
                      performance degradation.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5122 - Future Computing \& Big Data Systems (POF4-512) /
                      EPI SGA2 (16ME0507K)},
      pid          = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K},
      typ          = {PUB:(DE-HGF)25},
      doi          = {10.1145/3695794.3695800},
      url          = {https://juser.fz-juelich.de/record/1031162},
}