% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference contribution in the Proceedings of Science series.
% `year` is the publication year; the conference dates are kept separately in
% `eventdate` so that date-aware styles do not mistake them for the publication
% date.  Fields such as reportid, cin, cid, pnm, pid and typ are not known to
% standard styles and are ignored by them.
@inproceedings{Altherr:1049804,
      author       = {Altherr, Anian and Campos, Isabel and Coles, Jonathan and
                      Cotellucci, Alessandro and Fernández De la Garza, Juan
                      Antonio and Gruber, Roman and Harris, Tim and Komijani,
                      Javad and Lücke, Jens and Maier, Stephanie and Marinkovic,
                      Marina and Parato, Letizia and Patella, Agostino and Rosso,
                      Sara and Tavella, Paola and Vogt, Hannes},
      title        = {{O}(a)-improved {QCD}+{QED} {Wilson} {Dirac} operator on
                      {GPU}s},
      reportid     = {FZJ-2025-05588},
      series       = {Proceedings of Science},
      pages        = {28},
      year         = {2025},
      comment      = {Proceedings of The 41st International Symposium on Lattice
                      Field Theory},
      booktitle    = {Proceedings of The 41st International
                      Symposium on Lattice Field Theory},
      abstract     = {Markov Chain Monte Carlo simulations of lattice Quantum
                      Chromodynamics (QCD) are the only known tool to investigate
                      non-perturbatively the theory of the strong interaction and
                      are required to perform precision tests of the Standard
                      Model of Particle Physics. As the Markov Chain is a serial
                      process, the sole option for improving the sampling rate is
                      accelerating each individual update step. Heterogeneous
                      clusters of GPU-accelerated nodes offer large total memory
                      bandwidth which can be used to speed-up our application,
                      openQxD-1.1, which is dominated by inversions of the Dirac
                      operator, a large sparse matrix. In this work we investigate
                      offloading the inversion to GPU using the lattice-QCD
                      library QUDA, and our early results demonstrate a
                      significant potential speed-up in the time-to-solution for
                      state-of-the-art problem sizes. Minimal extensions to the
                      existing QUDA library are required for our specific physics
                      programme while greatly enhancing the performance
                      portability of our code and retaining the reliability and
                      robustness of existing applications in openQxD-1.1. Our new
                      interface will enable us to utilize pre-exascale
                      infrastructure and reduce the systematic uncertainty in our
                      physics predictions by incorporating the effects of quantum
                      electromagnetism (QED) in our simulations.},
      month        = jul,
      eventdate    = {2024-07-28/2024-08-03},
      organization = {The 41st International Symposium on
                      Lattice Field Theory, Liverpool (UK),
                      28 Jul 2024 - 3 Aug 2024},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5111},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      doi          = {10.22323/1.466.0280},
      url          = {https://juser.fz-juelich.de/record/1049804},
}