Understanding Data Movement in AMD Multi-GPU Systems with Infinity Fabric

Schieffer, Gabin; Shi, Ruimin; Peng, Ivy; Faj, Jennifer; Herten, Andreas; Markidis, Stefano
doi:10.1109/SCW63240.2024.00079
% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Workshop paper in the SC24-W proceedings (JuSER record 1037595, report id
% FZJ-2025-00766).  The fields reportid, cin, cid, pnm, pid, typ and UT are
% repository-specific and are ignored by standard bibliography styles.
@inproceedings{Schieffer:1037595,
      author       = {Schieffer, Gabin and Shi, Ruimin and Markidis, Stefano and
                      Herten, Andreas and Faj, Jennifer and Peng, Ivy},
      title        = {Understanding Data Movement in {AMD} Multi-{GPU} Systems
                      with {Infinity Fabric}},
      booktitle    = {{SC24-W}: Workshops of the International Conference for
                      High Performance Computing, Networking, Storage and
                      Analysis},
      venue        = {Atlanta, GA, USA},
      eventdate    = {2024-11-17/2024-11-22},
      publisher    = {IEEE},
      reportid     = {FZJ-2025-00766},
      pages        = {567--576},
      year         = {2024},
      month        = nov,
      date         = {2024-11-17},
      abstract     = {Modern GPU systems are constantly evolving to meet the
                      needs of computing-intensive applications in scientific
                      and machine learning domains. However, there is typically
                      a gap between the hardware capacity and the achievable
                      application performance. This work aims to provide a
                      better understanding of the Infinity Fabric interconnects
                      on AMD GPUs and CPUs. We propose a test and evaluation
                      methodology for characterizing the performance of data
                      movements on multi-GPU systems, stressing different
                      communication options on AMD MI250X GPUs, including
                      point-to-point and collective communication, and memory
                      allocation strategies between GPUs, as well as the host
                      CPU. In a single-node setup with four GPUs, we show that
                      direct peer-to-peer memory accesses between GPUs and
                      utilization of the RCCL library outperform MPI-based
                      solutions in terms of memory/communication latency and
                      bandwidth. Our test and evaluation method serves as a
                      base for validating memory and communication strategies
                      on a system and improving applications on AMD multi-GPU
                      computing systems.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / ATML-X-DEV - ATML
                      Accelerating Devices (ATML-X-DEV)},
      pid          = {G:(DE-HGF)POF4-5112 / G:(DE-Juel-1)ATML-X-DEV},
      typ          = {PUB:(DE-HGF)8},
      UT           = {WOS:001451792300060},
      doi          = {10.1109/SCW63240.2024.00079},
      url          = {https://juser.fz-juelich.de/record/1037595},
}
guest :: login JuSER
		Search		Submit		Personalize Your alerts Your baskets Your searches		Help