% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record for the OpenACC LQCD Monte Carlo code paper.
% Notes for maintainers:
%   - The pages field holds the article number only; this record has no page range.
%   - Title words that must keep their capitalisation under sentence-casing
%     styles are protected as whole words, not as single letters.
%   - reportid, cin, ddc, cid, pnm, pid, typ and UT are not standard BibTeX
%     fields and are silently ignored by standard styles; they look like
%     repository export metadata (TODO confirm) and are kept verbatim.
@article{Bonati:830143,
      author       = {Bonati, Claudio and Coscetti, Simone and D'Elia, Massimo
                      and Mesiti, Michele and Negro, Francesco and Calore, Enrico
                      and Schifano, Sebastiano Fabio and Silvi, Giorgio and
                      Tripiccione, Raffaele},
      title        = {Design and optimization of a portable {LQCD} {Monte
                      Carlo} code using {OpenACC}},
      journal      = {International Journal of Modern Physics C},
      volume       = {28},
      number       = {5},
      issn         = {1793-6586},
      address      = {Singapore},
      publisher    = {World Scientific},
      reportid     = {FZJ-2017-03720},
      pages        = {1750063},
      year         = {2017},
      abstract     = {The present panorama of HPC architectures is extremely
                      heterogeneous, ranging from traditional multi-core CPU
                      processors, supporting a wide class of applications but
                      delivering moderate computing performance, to many-core
                      Graphics Processor Units (GPUs), exploiting aggressive
                      data-parallelism and delivering higher performances for
                      streaming computing applications. In this scenario, code
                      portability (and performance portability) become necessary
                      for easy maintainability of applications; this is very
                      relevant in scientific computing where code changes are very
                      frequent, making it tedious and prone to error to keep
                      different code versions aligned. In this work, we present
                      the design and optimization of a state-of-the-art
                      production-level LQCD Monte Carlo application, using the
                      directive-based OpenACC programming model. OpenACC abstracts
                      parallel programming to a descriptive level, relieving
                      programmers from specifying how codes should be mapped onto
                      the target architecture. We describe the implementation of a
                      code fully written in OpenACC, and show that we are able to
                      target several different architectures, including
                      state-of-the-art traditional CPUs and GPUs, with the same
                      code. We also measure performance, evaluating the computing
                      efficiency of our OpenACC code on several architectures,
                      comparing with GPU-specific implementations and showing that
                      a good level of performance-portability can be reached.},
      cin          = {JSC},
      ddc          = {530},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511)},
      pid          = {G:(DE-HGF)POF3-511},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000401622900007},
      doi          = {10.1142/S0129183117500632},
      url          = {https://juser.fz-juelich.de/record/830143},
}