% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article (2018): portable multi-node LQCD Monte Carlo code built on
% OpenACC.  "pages" carries what appears to be the article number (it matches
% the DOI suffix); no page range is recorded.  The fields reportid, cin, ddc,
% cid, pnm, pid, typ and UT are repository bookkeeping that standard
% bibliography styles do not know about and therefore ignore.
@ARTICLE{Bonati:845928,
      author       = {Bonati, Claudio and Calore, Enrico and D’Elia, Massimo
                      and Mesiti, Michele and Negro, Francesco and Sanfilippo,
                      Francesco and Schifano, Sebastiano Fabio and Silvi, Giorgio
                      and Tripiccione, Raffaele},
      title        = {Portable multi-node {LQCD} {Monte} {Carlo} simulations
                      using {OpenACC}},
      journal      = {International Journal of Modern Physics C},
      volume       = {29},
      number       = {1},
      issn         = {1793-6586},
      address      = {Singapore},
      publisher    = {World Scientific},
      reportid     = {FZJ-2018-03126},
      pages        = {1850010},
      year         = {2018},
      abstract     = {This paper describes a state-of-the-art parallel Lattice
                      QCD Monte Carlo code for staggered fermions, purposely
                      designed to be portable across different computer
                      architectures, including GPUs and commodity CPUs.
                      Portability is achieved using the OpenACC parallel
                      programming model, used to develop a code that can be
                      compiled for several processor architectures. The paper
                      focuses on parallelization on multiple computing nodes using
                      OpenACC to manage parallelism within the node, and OpenMPI
                      to manage parallelism among the nodes. We first discuss the
                      available strategies to be adopted to maximize performances,
                      we then describe selected relevant details of the code, and
                      finally measure the level of performance and
                      scaling-performance that we are able to achieve. The work
                      focuses mainly on GPUs, which offer a significantly high
                      level of performances for this application, but also
                      compares with results measured on other processors.},
      cin          = {JSC},
      ddc          = {530},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / PhD no Grant - Doktorand ohne besondere
                      Förderung (PHD-NO-GRANT-20170405)},
      pid          = {G:(DE-HGF)POF3-511 / G:(DE-Juel1)PHD-NO-GRANT-20170405},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000426590000010},
      doi          = {10.1142/S0129183118500109},
      url          = {https://juser.fz-juelich.de/record/845928},
}