% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article record for Knobloch and Mohr (2020).
% The fields reportid, cin, ddc, cid, pnm, pid and typ are not standard
% BibTeX fields; standard styles ignore unknown fields. They look like
% repository bookkeeping metadata from the exporting system (see the url
% field) -- kept verbatim; confirm before removing.
% In the title only the acronyms GPU and HPC are brace-protected, so that
% sentence-casing styles keep them uppercase.
@article{Knobloch:875265,
  author    = {Knobloch, Michael and Mohr, Bernd},
  title     = {Tools for {GPU} Computing -- Debugging and Performance
               Analysis of Heterogenous {HPC} Applications},
  journal   = {Supercomputing Frontiers and Innovations},
  volume    = {7},
  number    = {1},
  pages     = {91--111},
  year      = {2020},
  issn      = {2313-8734},
  publisher = {South Ural State University},
  address   = {Chelyabinsk},
  doi       = {10.14529/jsfi200105},
  url       = {https://juser.fz-juelich.de/record/875265},
  abstract  = {General purpose GPUs are now ubiquitous in high-end
               supercomputing. All but one (the Japanese Fugaku system,
               which is based on ARM processors) of the announced
               (pre-)exascale systems contain vast amounts of GPUs that
               deliver the majority of the performance of these systems.
               Thus, GPU programming will be a necessity for application
               developers using high-end HPC systems. However, programming
               GPUs efficiently is an even more daunting task than
               traditional HPC application development. This becomes even
               more apparent for large-scale systems containing thousands
               of GPUs. Orchestrating all the resources of such a system
               imposes a tremendous challenge to developers. Luckily a rich
               ecosystem of tools exist to assist developers in every
               development step of a GPU application at all scales. In this
               paper we present an overview of these tools and discuss
               their capabilities. We start with an overview of different
               GPU programming models, from low-level with CUDA over
               pragma-based models like OpenACC to high-level approaches
               like Kokkos. We discuss their respective tool interfaces as
               the main method for tools to obtain information on the
               execution of a kernel on the GPU. The main focus of this
               paper is on two classes of tools, debuggers and performance
               analysis tools. Debuggers help the developer to identify
               problems both on the CPU and GPU side as well as in the
               interplay of both. Once the application runs correctly,
               performance analysis tools can be used to pinpoint
               bottlenecks in the execution of the code and help to
               increase the overall performance.},
  reportid  = {FZJ-2020-01909},
  cin       = {JSC},
  ddc       = {004},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {511 - Computational Science and Mathematical Methods
               (POF3-511) / POP2 - Performance Optimisation and
               Productivity 2 (824080) / ATMLPP - ATML Parallel Performance
               (ATMLPP)},
  pid       = {G:(DE-HGF)POF3-511 / G:(EU-Grant)824080 /
               G:(DE-Juel-1)ATMLPP},
  typ       = {PUB:(DE-HGF)16},
}