% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% PhD thesis of Laura Morgenstern, published as volume 63 of the series named
% in the `series' field.  Fields that are not part of the standard
% BibTeX/biblatex data model (reportid, extent, cin, cid, pnm, pid, typ, urn)
% look like repository-export metadata; bibliography styles ignore unknown
% fields.  `extent' keeps the catalogue's physical description verbatim, while
% `pagetotal' carries only the page count (biblatex prints `pagetotal' as the
% total number of pages; classic BibTeX styles ignore it).
@phdthesis{Morgenstern:1029548,
      author       = {Morgenstern, Laura},
      title        = {{Eventify} Meets Heterogeneity: Enabling Fine-Grained
                      Task-Parallelism on {GPUs}},
      volume       = {63},
      school       = {Techn. Univ. Chemnitz},
      type         = {Dissertation},
      address      = {Jülich},
      publisher    = {Forschungszentrum Jülich GmbH, Zentralbibliothek, Verlag},
      reportid     = {FZJ-2024-05160},
      isbn         = {978-3-95806-765-3},
      series       = {Schriften des Forschungszentrums Jülich IAS Series},
      pagetotal    = {xv, 110},
      extent       = {xv, 110 Seiten : Illustrationen, Diagramme},
      year         = {2024},
      note         = {Dissertation, Techn. Univ. Chemnitz, 2023},
      abstract     = {Many scientific computing algorithms barely provide
                      sufficient data-parallelism to exploit the ever-increasing
                      hardware parallelism of today’s heterogeneous computing
                      environments. The challenge is to fully exploit the
                      parallelization potential of such algorithms. To tackle this
                      challenge, diverse task-parallel programming technologies
                      have been introduced that allow for the flexible description
                      of algorithms along task graphs. For algorithms with dense
                      task graphs, however, task parallelism is still hard to
                      exploit efficiently since it is programmatically complex to
                      describe and imposes high dependency resolution overheads on
                      the execution model. This becomes especially challenging on
                      GPUs which are not designed for synchronization-heavy
                      applications. The research objective of this thesis is an
                      execution model that enables fine-grained task parallelism
                      on GPUs. To reach this objective, the contributions of the
                      thesis are fivefold. Firstly, it refines the stream
                      interaction model behind Flynn’s Taxonomy as uniform
                      foundation for concurrency in architectures and programming
                      models. Secondly, it analyzes the quantitative trends in CPU
                      and GPU architectures and examines their influence on
                      programming models. Thirdly, it introduces an execution
                      model that enables threading, efficient blocking
                      synchronization and queue-based task scheduling on GPUs.
                      Fourthly, it ports the task-parallel programming library
                      Eventify to GPUs. And fifthly, it examines the performance
                      and sustainability of this approach with the task graph of a
                      fast multipole method as use case. The results show that
                      fine-grained task parallelism improves execution time by an
                      order of magnitude in comparison to classical loop-based
                      data parallelism.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
      urn          = {urn:nbn:de:0001-20250106145649658-2552643-6},
      doi          = {10.34734/FZJ-2024-05160},
      url          = {https://juser.fz-juelich.de/record/1029548},
}