% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article record. The fields reportid, cin, ddc, cid, pnm, pid and typ
% are not standard BibTeX fields; standard styles ignore unknown fields.
% In the title only the acronyms are brace-protected so that the bibliography
% style controls the casing of the remaining words.
@ARTICLE{Knobloch:875265,
      author       = {Knobloch, Michael and Mohr, Bernd},
      title        = {Tools for {GPU} Computing -- Debugging and Performance
                      Analysis of Heterogenous {HPC} Applications},
      journal      = {Supercomputing frontiers and innovations},
      volume       = {7},
      number       = {1},
      issn         = {2313-8734},
      address      = {Chelyabinsk},
      publisher    = {South Ural State University},
      reportid     = {FZJ-2020-01909},
      pages        = {91--111},
      year         = {2020},
      abstract     = {General purpose GPUs are now ubiquitous in high-end
                      supercomputing. All but one (the Japanese Fugaku system,
                      which is based on ARM processors) of the announced
                      (pre-)exascale systems contain vast amounts of GPUs that
                      deliver the majority of the performance of these systems.
                      Thus, GPU programming will be a necessity for application
                      developers using high-end HPC systems. However, programming
                      GPUs efficiently is an even more daunting task than
                      traditional HPC application development. This becomes even
                      more apparent for large-scale systems containing thousands
                      of GPUs. Orchestrating all the resources of such a system
                      imposes a tremendous challenge to developers. Luckily a rich
                      ecosystem of tools exist to assist developers in every
                      development step of a GPU application at all scales. In this
                      paper we present an overview of these tools and discuss
                      their capabilities. We start with an overview of different
                      GPU programming models, from low-level with CUDA over
                      pragma-based models like OpenACC to high-level approaches
                      like Kokkos. We discuss their respective tool interfaces as
                      the main method for tools to obtain information on the
                      execution of a kernel on the GPU. The main focus of this
                      paper is on two classes of tools, debuggers and performance
                      analysis tools. Debuggers help the developer to identify
                      problems both on the CPU and GPU side as well as in the
                      interplay of both. Once the application runs correctly,
                      performance analysis tools can be used to pinpoint
                      bottlenecks in the execution of the code and help to
                      increase the overall performance.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / POP2 - Performance Optimisation and
                      Productivity 2 (824080) / ATMLPP - ATML Parallel Performance
                      (ATMLPP)},
      pid          = {G:(DE-HGF)POF3-511 / G:(EU-Grant)824080 /
                      G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.14529/jsfi200105},
      url          = {https://juser.fz-juelich.de/record/875265},
}