% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper on profiling HMPP applications with Score-P, published 2014
% as part of volume 25 of "Advances in Parallel Computing" (IOS Press).
%
% - The citation key is kept verbatim so existing \cite commands keep resolving.
% - `year` is the publication year of the proceedings volume.  The conference
%   itself (10-13 Sep 2013, Munich) is recorded in eventtitle/venue/eventdate;
%   these are biblatex fields and are ignored by classic BibTeX styles.
% - NOTE(review): `month = sep` matches the conference month; it is not clear
%   that it is the publication month of the 2014 volume -- confirm or drop it.
% - reportid/cin/cid/pnm/pid/typ/UT are not standard bibliography fields
%   (presumably repository bookkeeping from the record at `url`); standard
%   styles ignore them.
@inproceedings{Schltter:152041,
      author       = {Schlütter, Marc and Philippen, Peter and Morin, Laurent
                      and Geimer, Markus and Mohr, Bernd},
      title        = {Profiling Hybrid {HMPP} Applications with {Score-P} on
                      Heterogeneous Hardware},
      booktitle    = {Parallel Computing: Accelerating Computational Science and
                      Engineering (CSE)},
      series       = {Advances in Parallel Computing},
      volume       = {25},
      pages        = {773--782},
      publisher    = {IOS Press},
      year         = {2014},
      month        = sep,
      isbn         = {978-1-61499-380-3},
      doi          = {10.3233/978-1-61499-381-0-773},
      url          = {https://juser.fz-juelich.de/record/152041},
      eventtitle   = {International Conference on Parallel Computing},
      venue        = {Munich, Germany},
      eventdate    = {2013-09-10/2013-09-13},
      abstract     = {In heterogeneous environments with multi-core systems and
                      accelerators, programming and optimizing large parallel
                      applications turns into a time-intensive and
                      hardware-dependent challenge. To assist application
                      developers in this process, a number of tools and high-level
                      compilers have been developed. Directive-based programming
                      models such as HMPP and OpenACC provide abstractions over
                      low-level GPU programming models, such as CUDA or OpenCL. The
                      compilers developed by CAPS automatically transform the
                      pragma-annotated application code into low-level code,
                      thereby allowing the parallelization and optimization for a
                      given accelerator hardware. To analyze the performance of
                      parallel applications, multiple partners in Germany and the
                      US jointly develop the community measurement infrastructure
                      Score-P. Score-P gathers performance execution profiles,
                      which can be presented and analyzed within the CUBE result
                      browser, and collects detailed event traces to be processed
                      by post-mortem analysis tools such as Scalasca and Vampir. In
                      this paper we present the integration and combined use of
                      Score-P and the CAPS compilers as one approach to
                      efficiently parallelize and optimize codes. Specifically, we
                      describe the PHMPP profiling interface, its implementation
                      in Score-P, and the presentation of preliminary results in
                      CUBE.},
      reportid     = {FZJ-2014-01861},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {411 - Computational Science and Mathematical Methods
                      (POF2-411) / ATMLPP - ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF2-411 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      UT           = {WOS:000452120400078},
}