% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
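%
% A minimal usage sketch for this record (assumptions: the file is saved as
% "references.bib" and biblatex with the biber backend is used; the file name
% and package options are illustrative, not prescribed by this record):
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   Heterogeneous NUMA case studies~\cite{Zaourar:1026292}.
%   \printbibliography
%   \end{document}
%
% Typical compile sequence: pdflatex, then biber, then pdflatex again.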
@ARTICLE{Zaourar:1026292,
author = {Zaourar, Lilia and Benazouz, Mohamed and Mouhagir, Ayoub
and Falquez, Carlos and Portero, Antoni and Ho, Nam and
Suarez, Estela and Petrakis, Polydoros and Marazakis,
Manolis and Sgherzi, Francesco and Fernandez, Ivan and
Dolbeau, Romain and Pleiter, Dirk},
title = {{Case Studies on the Impact and Challenges of
         Heterogeneous NUMA Architectures for HPC}},
reportid = {FZJ-2024-03363},
year = {2024},
abstract = {The memory systems of High-Performance Computing (HPC)
systems commonly feature non-uniform data paths to memory,
i.e., they are non-uniform memory access (NUMA) architectures.
Memory is divided into multiple regions, with each
processing unit having its own local memory. Therefore, for
each processing unit access to local memory regions is
faster compared to accessing memory at non-local regions.
Architectures with hybrid memory technologies result in
further non-uniformity. This paper presents case studies of
the performance potential and data placement implications of
non-uniform and heterogeneous memory in HPC systems. Using
the gem5 and VPSim simulation platforms, we model NUMA
systems with processors based on the ARMv8 Neoverse V1
Reference Design. The gem5 simulator provides a
cycle-accurate view, while VPSim offers greater simulation
speed, with a high-level view of the simulated system. We
highlight the performance impact of design trade-offs
regarding NUMA node organization and System Level Cache
(SLC) group assignment, as well as Network-on-Chip (NoC)
configuration. Our case studies provide essential input to a
co-design process involving HPC processor architects and
system integrators. A comparison of system configurations
for different NoC bandwidths shows reduced NoC latency and
a substantial memory bandwidth improvement when NUMA control is
enabled. Furthermore, a configuration with HBM2 memory
organized as four NUMA nodes highlights the memory bandwidth
performance gap and NoC queuing latency impact when
comparing local vs. remote memory accesses. On the other
hand, NUMA can result in an unbalanced distribution of
memory accesses and reduced SLC hit ratios, as shown with
DDR4 memory organized as four NUMA nodes.},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5122 - Future Computing \& Big Data Systems (POF4-512) /
EPI SGA2 (16ME0507K)},
pid = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K},
typ = {PUB:(DE-HGF)25},
doi = {10.34734/FZJ-2024-03363},
url = {https://juser.fz-juelich.de/record/1026292},
}