% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper in the proceedings volume "Computational Science – ICCS 2020".
% Fields that are not part of the standard BibTeX data model (reportid, comment,
% cin, cid, pnm, pid, typ, UT) are kept verbatim; standard styles ignore unknown
% fields.  NOTE(review): `organization` currently carries the event name, venue
% and dates rather than a sponsoring body -- confirm before migrating it to
% biblatex's eventtitle/venue/eventdate.
@INPROCEEDINGS{Morgenstern:884765,
      author       = {Morgenstern, Laura and Haensel, David and Beckmann, Andreas
                      and Kabadshow, Ivo},
      title        = {{NUMA}-{Awareness} as a {Plug-In} for an
                      {Eventify}-{Based} {Fast} {Multipole} {Method}},
      reportid     = {FZJ-2020-03241},
      isbn         = {978-3-030-50436-6},
      pages        = {428--441},
      year         = {2020},
      comment      = {Computational Science – ICCS 2020},
      booktitle    = {Computational Science – ICCS 2020},
      abstract     = {Following the trend towards Exascale, today’s
                      supercomputers consist of increasingly complex and
                      heterogeneous compute nodes. To exploit the performance of
                      these systems, research software in HPC needs to keep up
                      with the rapid development of hardware architectures. Since
                      manual tuning of software to each and every architecture is
                      neither sustainable nor viable, we aim to tackle this
                      challenge through appropriate software design. In this
                      article, we aim to improve the performance and
                      sustainability of FMSolvr, a parallel Fast Multipole Method
                      for Molecular Dynamics, by adapting it to Non-Uniform Memory
                      Access architectures in a portable and maintainable way. The
                      parallelization of FMSolvr is based on Eventify, an
                      event-based tasking framework we co-developed with FMSolvr.
                      We describe a layered software architecture that enables the
                      separation of the Fast Multipole Method from its
                      parallelization. The focus of this article is on the
                      development and analysis of a reusable NUMA module that
                      improves performance while keeping both layers separated to
                      preserve maintainability and extensibility. By means of the
                      NUMA module we introduce diverse NUMA-aware data
                      distribution, thread pinning and work stealing policies for
                      FMSolvr. During the performance analysis the modular design
                      of the NUMA module was advantageous since it facilitates
                      combination, interchange and redesign of the developed
                      policies. The performance analysis reveals that the runtime
                      of FMSolvr is reduced by $21\%$ from 1.48 ms to 1.16 ms
                      through these policies.},
      month        = jun,
      date         = {2020-06-03},
      organization = {International Conference on Computational Science 2020,
                      Amsterdam (The Netherlands), 3 Jun 2020 - 5 Jun 2020},
      cin          = {JSC / IAS-7},
      cid          = {I:(DE-Juel1)JSC-20090406 / I:(DE-Juel1)IAS-7-20180321},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / PhD no Grant - Doktorand ohne besondere
                      Förderung (PHD-NO-GRANT-20170405)},
      pid          = {G:(DE-HGF)POF3-511 / G:(DE-Juel1)PHD-NO-GRANT-20170405},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      UT           = {WOS:000841686400031},
      doi          = {10.1007/978-3-030-50436-6_31},
      url          = {https://juser.fz-juelich.de/record/884765},
}