% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Entry GarciadeGonzalo:916372 -- tutorial material "Efficient Distributed GPU
% Programming for Exascale", given at ISC High Performance 2022 in Hamburg.
% Repository record: juser.fz-juelich.de/record/916372 (report id FZJ-2022-06173).
%
% Field notes:
%   * title        -- only the acronym GPU is brace-protected; the bibliography
%                     style decides the casing of the remaining words.
%   * howpublished -- carries the event description, since standard styles print
%                     this field for misc entries.
%   * year, month  -- publication date in the classic form (month uses the
%                     predefined three-letter macro, unquoted).
%   * eventdate    -- ISO day of the event; read by biblatex, ignored by
%                     classic styles.
%   * reportid, subtyp, cin, cid, pnm, pid, typ -- non-standard fields that
%                     standard styles ignore; presumably bookkeeping data
%                     exported by the repository (confirm before relying on it).
@misc{GarciadeGonzalo:916372,
      author       = {Garcia de Gonzalo, Simon and Oden, Lena and Herten, Andreas
                      and Hrywniak, Markus and Kraus, Jiri},
      title        = {Efficient Distributed {GPU} Programming for Exascale},
      howpublished = {ISC High Performance 2022, Hamburg, Germany},
      month        = may,
      year         = {2022},
      eventdate    = {2022-05-29},
      doi          = {10.5281/ZENODO.6603470},
      url          = {https://juser.fz-juelich.de/record/916372},
      abstract     = {Over the past years, GPUs became ubiquitous in HPC
                      installations around the world. Today, they provide the
                      majority of performance of some of the largest
                      supercomputers (e.g. Summit, Sierra, JUWELS Booster). This
                      trend continues in the pre-exascale and exascale systems
                      (LUMI, Leonardo; Perlmutter, Frontier): GPUs are chosen as
                      the core computing devices to enter this next era of HPC. To
                      take advantage of future GPU-accelerated systems with tens
                      of thousands of devices, application developers need to have
                      the proper skills and tools to understand, manage, and
                      optimize distributed GPU applications. In this tutorial,
                      participants will learn techniques to efficiently program
                      large-scale multi-GPU systems. While programming multiple
                      GPUs with MPI is explained in detail, advanced tuning
                      techniques and complementary programming models like NCCL
                      and NVSHMEM are presented as well. Tools for analysis are
                      shown and used to motivate and implement performance
                      optimizations. The tutorial is a combination of lectures and
                      hands-on exercises, using Europe's fastest supercomputer,
                      JUWELS Booster with NVIDIA GPUs, for interactive learning
                      and discovery.},
      reportid     = {FZJ-2022-06173},
      subtyp       = {After Call},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5122 - Future Computing \& Big Data Systems (POF4-512) /
                      5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / 5111 - Domain-Specific
                      Simulation \& Data Life Cycle Labs (SDLs) and Research
                      Groups (POF4-511) / ATML-X-DEV - ATML Accelerating Devices
                      (ATML-X-DEV)},
      pid          = {G:(DE-HGF)POF4-5122 / G:(DE-HGF)POF4-5112 /
                      G:(DE-HGF)POF4-5111 / G:(DE-Juel-1)ATML-X-DEV},
      typ          = {PUB:(DE-HGF)17},
}