% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Tutorial held at SC21; repository record 903614 on juser.fz-juelich.de.
% Fields such as reportid, subtyp, cin, cid, pnm, pid and typ are not standard
% BibTeX fields; standard styles ignore unknown fields.
@MISC{GarciadeGonzalo:903614,
      author       = {Garcia de Gonzalo, Simon and Hrywniak, Markus and Kraus,
                      Jiri and Oden, Lena and Herten, Andreas},
      title        = {{Efficient} {Distributed} {GPU} {Programming} for
                      {Exascale}},
      reportid     = {FZJ-2021-05268},
      year         = {2021},
      note         = {Tutorial at SC21 Conference, consisting of lectures and
                      hands-on exercises.},
      abstract     = {Over the past years, GPUs became ubiquitous in HPC
                      installations around the world. Today, they provide the
                      majority of performance of some of the largest
                      supercomputers (e.g. Summit, Sierra, JUWELS Booster). This
                      trend continues in upcoming pre-exascale and exascale
                      systems (LUMI, Leonardo; Frontier): GPUs are chosen as the
                      core computing devices to enter this next era of HPC. To
                      take advantage of future GPU-accelerated systems with tens
                      of thousands of devices, application developers need to
                      have the proper skills and tools to understand, manage, and
                      optimize distributed GPU applications. In this
                      tutorial, participants will learn techniques to efficiently
                      program large-scale multi-GPU systems. While programming
                      multiple GPUs with MPI is explained in detail, also advanced
                      techniques and models (NCCL, NVSHMEM, \ldots{}) are
                      presented. Tools for analysis are used to motivate
                      implementation of performance optimizations. The tutorial
                      combines lectures and hands-on exercises, using Europe's
                      fastest supercomputer, JUWELS Booster with NVIDIA A100
                      GPUs.},
      month        = nov,
      date         = {2021-11-14},
      organization = {Supercomputing Conference 2021,
                      online, 14 Nov 2021 - 14 Nov 2021},
      subtyp       = {After Call},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5121 - Supercomputing \& Big Data Facilities (POF4-512) /
                      5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / ATML-X-DEV - ATML
                      Accelerating Devices (ATML-X-DEV)},
      pid          = {G:(DE-HGF)POF4-5121 / G:(DE-HGF)POF4-5111 /
                      G:(DE-Juel-1)ATML-X-DEV},
      typ          = {PUB:(DE-HGF)17},
      doi          = {10.5281/ZENODO.5745505},
      url          = {https://juser.fz-juelich.de/record/903614},
}