% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@INPROCEEDINGS{Kraus:281260,
      author       = {Kraus, Jiri and Schlottke, Michael and Adinets, Andrey and
                      Pleiter, Dirk},
      title        = {{A}ccelerating a {C}++ {CFD} {C}ode with {O}pen{ACC}},
      publisher    = {IEEE},
      reportid     = {FZJ-2016-00959},
      pages        = {47--54},
      year         = {2014},
      booktitle    = {2014 First Workshop on Accelerator Programming
                      using Directives (WACCPD): Proceedings},
      isbn         = {978-1-4673-6753-0},
      abstract     = {Today's HPC systems are increasingly utilizing
                      accelerators to lower time to solution for their users
                      and reduce power consumption. To utilize the higher
                      performance and energy efficiency of these
                      accelerators, application developers need to rewrite
                      at least parts of their codes. Taking the C++ flow
                      solver ZFS as an example, we show that the
                      directive-based programming model allows one to
                      achieve good performance with reasonable effort, even
                      for mature codes with many lines of code. Using
                      OpenACC directives permitted us to incrementally
                      accelerate ZFS, focusing on the parts of the program
                      that are relevant for the problem at hand. Two new
                      OpenACC 2.0 features, unstructured data regions and
                      atomics, are required for this. OpenACC's
                      interoperability with existing GPU libraries via the
                      host\_data use\_device construct allowed us to use
                      CUDA-aware MPI to achieve multi-GPU scalability
                      comparable to the CPU version of ZFS. As in many other
                      codes, the data structures of ZFS have been designed
                      with traditional CPUs and their relatively large
                      private caches in mind. This leads to suboptimal
                      memory access patterns on accelerators such as GPUs.
                      We show how the texture cache on NVIDIA GPUs can be
                      used to minimize the performance impact of these
                      suboptimal patterns without writing platform-specific
                      code. For the kernel most affected by the memory
                      access pattern, we compare the initial
                      array-of-structures memory layout with a
                      structure-of-arrays layout.},
      month        = {Nov},
      date         = {2014-11-17},
      organization = {2014 First Workshop on Accelerator Programming
                      using Directives (WACCPD), New Orleans (LA), 17
                      Nov 2014},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {513 - Supercomputer Facility (POF3-513) / 41G -
                      Supercomputer Facility (POF2-41G21)},
      pid          = {G:(DE-HGF)POF3-513 / G:(DE-HGF)POF2-41G21},
      typ          = {PUB:(DE-HGF)8},
      doi          = {10.1109/WACCPD.2014.11},
      url          = {https://juser.fz-juelich.de/record/281260},
}
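
% The abstract above names OpenACC's host_data use_device construct as the
% bridge to CUDA-aware MPI. Below is a minimal C++/OpenACC sketch of that
% pattern; it is not code from ZFS, and the function name halo_exchange,
% the buffers, and the rank/tag parameters are illustrative assumptions.
%
% // Sketch: hand device pointers to a CUDA-aware MPI library from an
% // OpenACC program. Not the paper's code; names are illustrative.
% #include <mpi.h>
%
% void halo_exchange(double* send, double* recv, int n,
%                    int neighbor, MPI_Comm comm) {
%   // Assumes the buffers were already placed on the device via an
%   // unstructured data region (the other OpenACC 2.0 feature the
%   // abstract mentions), e.g.:
%   //   #pragma acc enter data create(send[0:n], recv[0:n])
%   #pragma acc host_data use_device(send, recv)
%   {
%     // Inside this region, send/recv resolve to device addresses, so a
%     // CUDA-aware MPI implementation can move GPU memory directly,
%     // without staging through the host.
%     MPI_Sendrecv(send, n, MPI_DOUBLE, neighbor, 0,
%                  recv, n, MPI_DOUBLE, neighbor, 0,
%                  comm, MPI_STATUS_IGNORE);
%   }
% }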
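
% The abstract also contrasts the initial array-of-structures layout with a
% structure-of-arrays layout and notes that the NVIDIA texture cache can be
% used without platform-specific code. A hedged sketch follows, assuming a
% simple flow-variable struct (not the actual ZFS data structures):
% qualifying read-only kernel inputs with const and __restrict__ is the
% usual portable hint that lets the compiler route their loads through the
% GPU's read-only (texture) cache path.
%
% // AoS vs. SoA, as compared in the paper for the kernel most affected by
% // the access pattern. The 'Cell' fields here are an assumption.
% struct CellAoS { double rho, u, v, w, e; };  // AoS: one struct per cell
%
% struct CellsSoA {                            // SoA: one array per field
%   double *rho, *u, *v, *w, *e;
% };
%
% // Portable texture-cache hint: const + __restrict__ on read-only inputs
% // allows the OpenACC compiler to issue read-only-cache loads on NVIDIA
% // GPUs. scale_density is a hypothetical kernel, not one from ZFS.
% void scale_density(const double* __restrict__ rho_in,
%                    double* __restrict__ rho_out,
%                    double factor, int n) {
%   #pragma acc parallel loop present(rho_in[0:n], rho_out[0:n])
%   for (int i = 0; i < n; ++i)
%     rho_out[i] = factor * rho_in[i];
% }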