% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Conference paper: the venue is a symposium, so the entry is typed as
% inproceedings and the venue name lives in booktitle. The citation key is
% unchanged, so existing citations keep working.
% NOTE(review): the venue acronym is reproduced as exported ("MEMSY24"); it
% may be a truncation of "MEMSYS" -- confirm against the publisher record.
% The fields reportid, cin, cid, pnm, pid and typ are not standard BibTeX
% fields and are ignored by standard styles; presumably they are repository
% bookkeeping from the record given in url -- verify before relying on them.
@inproceedings{Saglam:1031162,
  author    = {Saglam, Berk and Ho, Nam and Falquez, Carlos and Portero,
               Antonio and Schätzle, Fabian and Suarez, Estela and
               Pleiter, Dirk},
  title     = {Data Prefetching on Processors with Heterogeneous Memory},
  booktitle = {10th International Symposium on Memory Systems (MEMSY24)},
  year      = {2024},
  doi       = {10.1145/3695794.3695800},
  url       = {https://juser.fz-juelich.de/record/1031162},
  abstract  = {Heterogeneous memory architectures, such as a mix of High
               Bandwidth Memory (HBM) and Double Data Rate (DDR), offer
               flexible performance optimization by leveraging the high
               bandwidth of HBM along with the high capacity of DDR.
               However, these architectures present challenges in balancing
               bandwidth and capacity to maximize overall system
               performance and complicate hardware design. In a flat memory
               organization mixing HBM and DDR, prefetchers must carefully
               reduce prefetch requests on DDR when transitioning from HBM
               to avoid performance degradation due to potential bandwidth
               saturation. Traditional hardware prefetchers, which
               typically assume a homogeneous memory, are unaware of this
               circumstance, so they may not be effective in heterogeneous
               memory architectures. The paper enhances the aggressiveness
               of prefetchers in this kind of architecture. Our technique
               enables a prefetcher to dynamically determine the optimal
               prefetch degree and distance based on memory type. It
               balances prefetch aggressiveness and timeliness through an
               adaptive strategy informed by bandwidth utilization and
               prefetch metrics learned for each memory type. We evaluated
               the technique within the Stride and Stream Prefetchers at L2
               in a gem5 model of a 20-core Arm Neoverse V1-like
               architecture, a mix of HBM2 and DDR5. The simulation
               results, focusing on scientific benchmarks, showed that the
               technique effectively guides prefetchers to near-optimal
               static configurations. On HBM2, the adaptation strategy
               detects bandwidth availability and prefetches more
               aggressively to boost performance, achieving speedups of
               $1.3\times$ to $2.3\times$. On DDR5, when faced with
               saturated bandwidth contention, the adaptation strategy
               switches to conservative prefetching mode to mitigate
               performance degradation.},
  reportid  = {FZJ-2024-05566},
  cin       = {JSC},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {5122 - Future Computing \& Big Data Systems (POF4-512) /
               EPI SGA2 (16ME0507K)},
  pid       = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K},
  typ       = {PUB:(DE-HGF)25},
}