% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Conference paper: Kraus, Schlottke, Adinets, Pleiter -- WACCPD 2014 (IEEE).
%
% Notes for maintainers:
%   * reportid, comment, cin, cid, pnm, pid and typ are not standard BibTeX
%     fields; unknown fields are ignored by bibliography styles, and they are
%     kept verbatim here as repository metadata.
%   * Both year/month and date are supplied on purpose: classic styles
%     (bibtex8) read year/month, biblatex/biber reads date.
%   * The event name, place and day live in eventtitle/venue/eventdate
%     (biblatex fields). They were previously stored in organization, which
%     styles print as the sponsoring organisation.
%   * In the title only the words that must keep their case under
%     sentence-casing styles are braced, each as a whole word.
%   * The abstract typesets the identifiers host_data and use_device in
%     \texttt with escaped underscores; in math mode the underscore would be
%     read as a subscript.
%   * NOTE(review): "Todays" in the abstract is left untouched in case it is
%     the publisher's verbatim wording -- confirm before correcting.
@inproceedings{Kraus:281260,
  author       = {Kraus, Jiri and Schlottke, Michael and Adinets, Andrey and
                  Pleiter, Dirk},
  title        = {Accelerating a {C++} {CFD} Code with {OpenACC}},
  booktitle    = {2014 First Workshop on Accelerator Programming using
                  Directives},
  eventtitle   = {2014 First Workshop on Accelerator Programming using
                  Directives ({WACCPD})},
  venue        = {New Orleans (LA)},
  eventdate    = {2014-11-17},
  publisher    = {IEEE},
  pages        = {47--54},
  year         = {2014},
  month        = nov,
  date         = {2014-11-17},
  isbn         = {978-1-4673-6753-0},
  doi          = {10.1109/WACCPD.2014.11},
  url          = {https://juser.fz-juelich.de/record/281260},
  abstract     = {Todays HPC systems are increasingly utilizing accelerators
                  to lower time to solution for their users and reduce power
                  consumption. To utilize the higher performance and energy
                  efficiency of these accelerators, application developers
                  need to rewrite at least parts of their codes. Taking the
                  C++ flow solver ZFS as an example, we show that the
                  directive-based programming model allows one to achieve good
                  performance with reasonable effort, even for mature codes
                  with many lines of code. Using OpenACC directives permitted
                  us to incrementally accelerate ZFS, focusing on the parts of
                  the program that are relevant for the problem at hand. The
                  two new OpenACC 2.0 features, unstructured data regions and
                  atomics, are required for this. OpenACC's interoperability
                  with existing GPU libraries via the \texttt{host\_data}
                  \texttt{use\_device} construct allowed to use CUDA-aware MPI
                  to achieve multi-GPU scalability comparable to the CPU
                  version of ZFS. Like many other codes, the data structures
                  of ZFS have been designed with traditional CPUs and their
                  relatively large private caches in mind. This leads to
                  suboptimal memory access patterns on accelerators, such as
                  GPUs. We show how the texture cache on NVIDIA GPUs can be
                  used to minimize the performance impact of these suboptimal
                  patterns without writing platform specific code. For the
                  kernel most affected by the memory access pattern, we
                  compare the initial array of structures memory layout with a
                  structure of arrays layout.},
  reportid     = {FZJ-2016-00959},
  comment      = {2014 First Workshop on Accelerator Programming using
                  Directives : [Proceedings] - ISBN 978-1-4673-6753-0 -},
  cin          = {JSC},
  cid          = {I:(DE-Juel1)JSC-20090406},
  pnm          = {513 - Supercomputer Facility (POF3-513) / 41G -
                  Supercomputer Facility (POF2-41G21)},
  pid          = {G:(DE-HGF)POF3-513 / G:(DE-HGF)POF2-41G21},
  typ          = {PUB:(DE-HGF)8},
}