% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Keynote talk (see subtyp) on early user experiences with JUWELS Booster.
% The event description lives in booktitle, the field inproceedings requires.
% reportid, subtyp, cin, cid, pnm, pid, typ and UT are not standard
% BibTeX/biblatex fields; standard styles ignore unknown fields. They look
% like repository bookkeeping (see url) -- keep them verbatim.
@inproceedings{Herten:893756,
      author       = {Herten, Andreas},
      title        = {{JUWELS} {Booster} -- Early User Experiences},
      reportid     = {FZJ-2021-02811},
      year         = {2021},
      abstract     = {Over the last few years, GPUs became ubiquitous in HPC
                      installations around the world. Today, they provide the main
                      source of performance in a number of Top500 machines - for
                      example Summit, Sierra, and JUWELS Booster. Also for the
                      upcoming Exascale era, GPUs are selected as key enablers and
                      will be installed numerously. While individual GPU devices
                      already offer plenty of performance (O(10) TFLOP/s FP64),
                      current and next-generation super-computers employ them in
                      the thousands. Using these machines to the fullest extent
                      means not only utilizing individual devices efficiently, but
                      using the entire interconnected system of devices
                      thoroughly. JUWELS Booster is a recently installed Tier-0/1
                      system at Jülich Supercomputing Centre (JSC), currently the
                      7th-fastest supercomputer in the world, and the fastest in
                      Europe. JUWELS Booster features 936 nodes, each equipped
                      with 4 NVIDIA A100 Tensor Core GPUs and 4 Mellanox HDR200
                      InfiniBand HCAs. The peak performance of all GPUs together
                      sums up to 73 PFLOP/s and it features a DragonFly+ network
                      topology with 800 Gbit/s network injection bandwidth per
                      node. During installation of JUWELS Booster, a selected set
                      of applications were given access to the system as part of
                      the JUWELS Booster Early Access Program. To prepare for
                      their first compute time allocation, scientific users were
                      able to gain first experiences on the machine. They gave
                      direct feedback to the system operations team during
                      installation and beyond. Close collaboration was facilitated
                      with the application support staff of JSC, giving unique
                      insights into the individual processes of utilizing a
                      brand-new large-scale system for a first time. Likewise,
                      performance profiles of applications could be studied and
                      collaboratively analyzed, employing available tools and
                      methods. Performance limiters of the specific application on
                      the platform were identified and proposals for improvement
                      developed. This talk will present first experiences with
                      JUWELS Booster and the applications utilizing the system
                      during its first months. Applied methods for onboarding,
                      analysis, and optimization will be shown and assessed.
                      Highlights of the state of the art of performance analysis
                      and modeling for GPUs will be presented with concrete
                      examples from the JUWELS Booster Early Access Program.},
      month        = jun,
      date         = {2021-06-21},
      booktitle    = {The 30th International Symposium on
                      High-Performance Parallel and
                      Distributed Computing, {PERMAVOST}
                      Workshop, Virtual (Sweden), 21 Jun 2021
                      -- 25 Jun 2021},
      subtyp       = {Plenary/Keynote},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5121 - Supercomputing \& Big Data Facilities (POF4-512) /
                      5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / ATML-X-DEV - ATML
                      Accelerating Devices (ATML-X-DEV)},
      pid          = {G:(DE-HGF)POF4-5121 / G:(DE-HGF)POF4-5112 /
                      G:(DE-Juel-1)ATML-X-DEV},
      typ          = {PUB:(DE-HGF)6},
      UT           = {WOS:001322551200001},
      doi          = {10.1145/3452412.3462752},
      url          = {https://juser.fz-juelich.de/record/893756},
}