% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record (Computer Physics Communications, vol. 282, 2023).
% `pages` holds the article number, not a page range.  Fields such as
% reportid, cin, ddc, cid, pnm, pid, typ and UT are repository metadata that
% standard bibliography styles ignore.
@article{Durr:1019542,
      author       = {Durr, Stephan},
      title        = {Portable {CPU} implementation of {Wilson}, {Brillouin}
                      and {Susskind} fermions in lattice {QCD}},
      journal      = {Computer Physics Communications},
      volume       = {282},
      issn         = {0010-4655},
      address      = {Amsterdam},
      publisher    = {North Holland Publ. Co.},
      reportid     = {FZJ-2023-05489},
      pages        = {108555},
      year         = {2023},
      abstract     = {A modern Fortran implementation of three Dirac operators
                      (Wilson, Brillouin, Susskind) in lattice QCD is presented,
                      based on OpenMP shared-memory parallelization and SIMD
                      pragmas. The main idea is to apply a Dirac operator to
                      $N_v$ vectors simultaneously, to ease the memory bandwidth
                      bottleneck. All index computations are left to the
                      compiler and maximum weight is given to portability and
                      flexibility. The lattice volume, $N_x N_y N_z N_t$, the
                      number of colors, $N_c$, and the number of right-hand
                      sides, $N_v$, are parameters defined at compile time.
                      Several memory layout options are compared. The code
                      performs well on modern many-core architectures
                      (480\,Gflop/s, 880\,Gflop/s, and 780\,Gflop/s with
                      $N_v=12$ for the three operators in single precision on a
                      72-core KNL processor, a $2 \times 24$-core Skylake node
                      yields similar results). Explicit run-time tests with
                      CG/BiCGstab inverters confirm that the memory layout is
                      relevant for the KNL, but less so for the Skylake
                      architecture. The ancillary code distribution contains all
                      routines, including the single, double, and mixed
                      precision Krylov space solvers, to render it
                      self-contained and ready-to-use.},
      cin          = {JSC},
      ddc          = {530},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / DFG project
                      448374536 - Fortschritte bei einer präzisen ab initio
                      Bestimmung der Partonen-Struktur von Hadronen (448374536)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(GEPRIS)448374536},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000876219500004},
      doi          = {10.1016/j.cpc.2022.108555},
      url          = {https://juser.fz-juelich.de/record/1019542},
}