% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
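%
% For reference, a minimal LaTeX preamble that processes this UTF-8 file with
% biber; the file name "references.bib" is a placeholder, not part of this
% record:
%
%   \documentclass{article}
%   \usepackage[utf8]{inputenc}  % redundant on LaTeX releases after 2018
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   \cite{Schnurpfeil:187427}
%   \printbibliography
%   \end{document}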

@TECHREPORT{Schnurpfeil:187427,
      author       = {Schnurpfeil, Alexander and Janetzko, Florian and Janetzko,
                      Stefanie and Thust, Kay and Emran, M. S. and Schumacher, J.},
      title        = {{P}erformance {A}nalysis and {E}nabling of the {R}ay{B}en
                      {C}ode for the {I}ntel® {MIC} {A}rchitecture},
      number       = {PRACE WP 129},
      publisher    = {PRACE Consortium Partners},
      reportid     = {FZJ-2015-01092, PRACE WP 129},
      pages        = {9 p.},
      year         = {2014},
      abstract     = {The subject of this project is the analysis and enabling of
                      the RayBen code, which implements a finite difference scheme
                      for the simulation of turbulent Rayleigh-Bénard convection
                      in a closed cylindrical cell, for the Intel® Xeon Phi
                      coprocessor architecture. After a brief introduction to the
                      physical background of the code, the integration of RayBen
                      into the benchmarking environment JuBE is discussed. The
                      structure of the code is analysed through its call graph,
                      and the most performance-critical routines are identified. A
                      detailed analysis of the OpenMP parallelization revealed
                      several race conditions, which were eliminated. The code
                      was ported to the JUROPA cluster at the Jülich
                      Supercomputing Centre as well as to the EURORA cluster at
                      CINECA. The performance
                      of the code is discussed using the results of pure MPI and
                      hybrid MPI/OpenMP benchmarks. It is shown that RayBen is a
                      memory-intensive application that benefits greatly from
                      the MPI parallelization. The offloading mechanism for the
                      Intel® MIC architecture considerably lowers the
                      performance, whereas binaries that run exclusively on the
                      coprocessor show satisfactory performance and a
                      scalability comparable to that of the CPU.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {41G - Supercomputer Facility (POF2-41G21) / PRACE-1IP -
                      PRACE - First Implementation Phase Project (261557)},
      pid          = {G:(DE-HGF)POF2-41G21 / G:(EU-Grant)261557},
      typ          = {PUB:(DE-HGF)29},
      url          = {https://juser.fz-juelich.de/record/187427},
}