% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@article{Zaourar:1026292,
      author       = {Zaourar, Lilia and Benazouz, Mohamed and Mouhagir, Ayoub
                      and Falquez, Carlos and Portero, Antoni and Ho, Nam and
                      Suarez, Estela and Petrakis, Polydoros and Marazakis,
                      Manolis and Sgherzi, Francesco and Fernandez, Ivan and
                      Dolbeau, Romain and Pleiter, Dirk},
      title        = {Case Studies on the Impact and Challenges of
                      Heterogeneous {NUMA} Architectures for {HPC}},
      reportid     = {FZJ-2024-03363},
      year         = {2024},
      abstract     = {The memory systems of High-Performance Computing (HPC)
                      systems commonly feature non-uniform data paths to memory,
                      i.e. are non-uniform memory access (NUMA) architectures.
                      Memory is divided into multiple regions, with each
                      processing unit having its own local memory. Therefore, for
                      each processing unit access to local memory regions is
                      faster compared to accessing memory at non-local regions.
                      Architectures with hybrid memory technologies result in
                      further non-uniformity. This paper presents case studies of
                      the performance potential and data placement implications of
                      non-uniform and heterogeneous memory in HPC systems. Using
                      the gem5 and VPSim simulation platforms, we model NUMA
                      systems with processors based on the ARMv8 Neoverse V1
                      Reference Design. The gem5 simulator provides a
                      cycle-accurate view, while VPSim offers greater simulation
                      speed, with a high-level view of the simulated system. We
                      highlight the performance impact of design trade-offs
                      regarding NUMA node organization and System Level Cache
                      (SLC) group assignment, as well as Network-on-Chip (NoC)
                      configuration. Our case studies provide essential input to a
                      co-design process involving HPC processor architects and
                      system integrators. A comparison of system configurations
                      for different NoC bandwidths shows reduced NoC latency and
                      high memory bandwidth improvement when NUMA control is
                      enabled. Furthermore, a configuration with HBM2 memory
                      organized as four NUMA nodes highlights the memory bandwidth
                      performance gap and NoC queuing latency impact when
                      comparing local vs. remote memory accesses. On the other
                      hand, NUMA can result in an unbalanced distribution of
                      memory accesses and reduced SLC hit ratios, as shown with
                      DDR4 memory organized as four NUMA nodes.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5122 - Future Computing \& Big Data Systems (POF4-512) /
                      EPI SGA2 (16ME0507K)},
      pid          = {G:(DE-HGF)POF4-5122 / G:(BMBF)16ME0507K},
      typ          = {PUB:(DE-HGF)25},
      doi          = {10.34734/FZJ-2024-03363},
      url          = {https://juser.fz-juelich.de/record/1026292},
      internal-note = {NOTE(review): @article normally requires a journal
                      field; this repository export has none. Confirm the
                      actual venue and add journal (or switch the entry type,
                      e.g. @techreport/@inproceedings) accordingly.},
}