% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
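%
% A minimal usage sketch (not part of the record itself): one way to consume
% this file with biblatex and the biber backend. The file name
% "Morgenstern.bib" is an assumption for illustration only.
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{Morgenstern.bib}
%   \begin{document}
%   Fine-grained task parallelism on GPUs \cite{Morgenstern:1029548}.
%   \printbibliography
%   \end{document}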
@PHDTHESIS{Morgenstern:1029548,
author = {Morgenstern, Laura},
title = {{E}ventify {M}eets {H}eterogeneity: {E}nabling
{F}ine-{G}rained {T}ask-{P}arallelism on {GPU}s},
volume = {63},
school = {Techn. Univ. Chemnitz},
type = {Dissertation},
address = {Jülich},
publisher = {Forschungszentrum Jülich GmbH, Zentralbibliothek, Verlag},
reportid = {FZJ-2024-05160},
isbn = {978-3-95806-765-3},
series = {Schriften des Forschungszentrums Jülich IAS Series},
pages = {xv, 110 pages : illustrations, diagrams},
year = {2024},
note = {Dissertation, Techn. Univ. Chemnitz, 2023},
abstract = {Many scientific computing algorithms barely provide
sufficient data-parallelism to exploit the ever-increasing
hardware parallelism of today’s heterogeneous computing
environments. The challenge is to fully exploit the
parallelization potential of such algorithms. To tackle this
challenge, diverse task-parallel programming technologies
have been introduced that allow for the flexible description
of algorithms along task graphs. For algorithms with dense
task graphs, however, task-parallelism is still hard to
exploit efficiently since it is programmatically complex to
describe and imposes high dependency resolution overheads on
the execution model. This becomes especially challenging on
GPUs, which are not designed for synchronization-heavy
applications. The research objective of this thesis is an
execution model that enables fine-grained task parallelism
on GPUs. To reach this objective, the contributions of the
thesis are fivefold. Firstly, it refines the stream
interaction model behind Flynn’s Taxonomy as a uniform
foundation for concurrency in architectures and programming
models. Secondly, it analyzes the quantitative trends in CPU
and GPU architectures and examines their influence on
programming models. Thirdly, it introduces an execution
model that enables threading, efficient blocking
synchronization and queue-based task scheduling on GPUs.
Fourthly, it ports the task-parallel programming library
Eventify to GPUs. Fifthly, it examines the performance
and sustainability of this approach with the task graph of a
fast multipole method as a use case. The results show that
fine-grained task parallelism improves execution time by an
order of magnitude in comparison to classical loop-based
data parallelism.},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511)},
pid = {G:(DE-HGF)POF4-5112},
typ = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
urn = {urn:nbn:de:0001-20250106145649658-2552643-6},
doi = {10.34734/FZJ-2024-05160},
url = {https://juser.fz-juelich.de/record/1029548},
}