% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article record for the OpenACC-based LQCD Monte Carlo code paper.
% Standard fields follow the usual article conventions: the pages field holds
% the article number (this record carries no page range), and title words
% that must keep their case are protected with whole-word brace groups.
% The trailing fields (reportid, cin, ddc, cid, pnm, pid, typ, UT) are not
% standard BibTeX fields and are ignored by standard styles; they look like
% repository-export metadata (TODO confirm) and are kept verbatim.
@article{Bonati:830143,
  author    = {Bonati, Claudio and Coscetti, Simone and D'Elia, Massimo
               and Mesiti, Michele and Negro, Francesco and Calore, Enrico
               and Schifano, Sebastiano Fabio and Silvi, Giorgio and
               Tripiccione, Raffaele},
  title     = {Design and optimization of a portable {LQCD} {Monte Carlo}
               code using {OpenACC}},
  journal   = {International Journal of Modern Physics C},
  volume    = {28},
  number    = {5},
  pages     = {1750063},
  year      = {2017},
  publisher = {World Scientific},
  address   = {Singapore},
  issn      = {1793-6586},
  doi       = {10.1142/S0129183117500632},
  url       = {https://juser.fz-juelich.de/record/830143},
  abstract  = {The present panorama of HPC architectures is extremely
               heterogeneous, ranging from traditional multi-core CPU
               processors, supporting a wide class of applications but
               delivering moderate computing performance, to many-core
               Graphics Processor Units (GPUs), exploiting aggressive
               data-parallelism and delivering higher performances for
               streaming computing applications. In this scenario, code
               portability (and performance portability) become necessary
               for easy maintainability of applications; this is very
               relevant in scientific computing where code changes are very
               frequent, making it tedious and prone to error to keep
               different code versions aligned. In this work, we present
               the design and optimization of a state-of-the-art
               production-level LQCD Monte Carlo application, using the
               directive-based OpenACC programming model. OpenACC abstracts
               parallel programming to a descriptive level, relieving
               programmers from specifying how codes should be mapped onto
               the target architecture. We describe the implementation of a
               code fully written in OpenACC, and show that we are able to
               target several different architectures, including
               state-of-the-art traditional CPUs and GPUs, with the same
               code. We also measure performance, evaluating the computing
               efficiency of our OpenACC code on several architectures,
               comparing with GPU-specific implementations and showing that
               a good level of performance-portability can be reached.},
  reportid  = {FZJ-2017-03720},
  cin       = {JSC},
  ddc       = {530},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {511 - Computational Science and Mathematical Methods
               (POF3-511)},
  pid       = {G:(DE-HGF)POF3-511},
  typ       = {PUB:(DE-HGF)16},
  UT        = {WOS:000401622900007},
}