% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@INPROCEEDINGS{Schlepphorst:1018549,
      author       = {Schlepphorst, Simon and Krieg, Stefan},
      title        = {{B}enchmarking a portable lattice quantum chromodynamics
                      kernel written in {K}okkos and {MPI}},
      publisher    = {ACM},
      address      = {New York, NY, USA},
      reportid     = {FZJ-2023-04877},
      pages        = {1027–1037},
      year         = {2023},
      booktitle    = {Proceedings of the SC '23 Workshops of The International
                      Conference on High Performance Computing, Network, Storage,
                      and Analysis},
      isbn         = {9798400707858},
      abstract     = {Simulations of Lattice Quantum Chromodynamics (LQCD) are an
                      important application (a double-digit percentage of cycles)
                      on major High Performance Computing (HPC) installations,
                      including systems at and near the top of the TOP500 list. In
                      the rapidly changing hardware landscape of HPC, tying up
                      manpower to optimize simulation software for every
                      architecture becomes a sustainability issue. In this work we
                      explore the feasibility of using performance-portable
                      parallel code for an important LQCD kernel. Fusing the
                      Kokkos C++ Performance Portability EcoSystem with MPI allows
                      applications to scale on massively parallel machines while
                      still being able to target a plenitude of different
                      architectures with the same simple code. We report
                      benchmarking results for a range of currently deployed and
                      recently introduced systems, including AMD EPYC 7742, AMD
                      MI250, Fujitsu A64FX, Nvidia A100, and Nvidia H100
                      components, with mostly encouraging results.},
      month        = {Nov},
      date         = {2023-11-12},
      organization = {SC-W 2023: Workshops of The International Conference on
                      High Performance Computing, Network, Storage, and Analysis,
                      Denver, CO, USA, 12 Nov 2023 - 17 Nov 2023},
      cin          = {JSC / CASA},
      cid          = {I:(DE-Juel1)JSC-20090406 / I:(DE-Juel1)CASA-20230315},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5111},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      doi          = {10.1145/3624062.3624179},
      url          = {https://juser.fz-juelich.de/record/1018549},
}
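
% The abstract above describes fusing the Kokkos C++ Performance Portability
% EcoSystem with MPI so that one source code targets many architectures while
% scaling across nodes. As a rough, hedged illustration only (this is NOT code
% from the cited paper; the view name "x", the kernel, and the problem size are
% assumptions), a minimal Kokkos + MPI reduction following that pattern might
% look like this:
%
%   #include <Kokkos_Core.hpp>
%   #include <mpi.h>
%   #include <cstdio>
%
%   int main(int argc, char* argv[]) {
%     MPI_Init(&argc, &argv);            // MPI handles inter-node parallelism
%     Kokkos::initialize(argc, argv);    // Kokkos picks the node-local backend
%     {                                  // (CUDA, HIP, OpenMP, ...) at compile time
%       const int n = 1 << 20;           // illustrative problem size (assumption)
%       Kokkos::View<double*> x("x", n); // device-resident data
%
%       // Same source runs on every supported architecture.
%       Kokkos::parallel_for("fill", n, KOKKOS_LAMBDA(const int i) { x(i) = 1.0; });
%
%       double local = 0.0;              // per-rank reduction on the local device
%       Kokkos::parallel_reduce("sum", n,
%           KOKKOS_LAMBDA(const int i, double& acc) { acc += x(i); }, local);
%
%       double global = 0.0;             // combine per-rank results across nodes
%       MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
%
%       int rank = 0;
%       MPI_Comm_rank(MPI_COMM_WORLD, &rank);
%       if (rank == 0) std::printf("global sum = %f\n", global);
%     }
%     Kokkos::finalize();
%     MPI_Finalize();
%     return 0;
%   }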