% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Kohnke:889151,
      author       = {Kohnke, Bartosz and Kutzner, Carsten and Beckmann, Andreas
                      and Lube, Gert and Kabadshow, Ivo and Dachsel, Holger and
                      Grubmüller, Helmut},
      title        = {{A} {CUDA} fast multipole method with highly efficient
                      {M}2{L} far field evaluation},
      journal      = {The international journal of high performance computing
                      applications},
      volume       = {35},
      number       = {1},
      issn         = {1741-2846},
      address      = {Thousand Oaks, Calif.},
      publisher    = {Sage Science Press},
      reportid     = {FZJ-2021-00076},
      pages        = {97 - 117},
      year         = {2021},
      abstract     = {Solving an N-body problem, electrostatic or gravitational,
                      is a crucial task and the main computational bottleneck in
                      many scientific applications. Its direct solution is an
                      ubiquitous showcase example for the compute power of
                      graphics processing units (GPUs). However, the naïve
                      pairwise summation has 𝒪(𝑁2) computational complexity.
                      The fast multipole method (FMM) can reduce runtime and
                      complexity to 𝒪(𝑁) for any specified precision. Here,
                      we present a CUDA-accelerated, C++ FMM implementation for
                      multi particle systems with 𝑟−1 potential that are
                      found, e.g. in biomolecular simulations. The algorithm
                      involves several operators to exchange information in an
                      octree data structure. We focus on the Multipole-to-Local
                      (M2L) operator, as its runtime is limiting for the overall
                      performance. We propose, implement and benchmark three
                      different M2L parallelization approaches. Approach (1)
                      utilizes Unified Memory to minimize programming and porting
                      efforts. It achieves decent speedups for only little
                      implementation work. Approach (2) employs CUDA Dynamic
                      Parallelism to significantly improve performance for high
                      approximation accuracies. The presorted list-based approach
                      (3) fits periodic boundary conditions particularly well. It
                      exploits FMM operator symmetries to minimize both memory
                      access and the number of complex multiplications. The result
                      is a compute-bound implementation, i.e. performance is
                      limited by arithmetic operations rather than by memory
                      accesses. The complete CUDA parallelized FMM is incorporated
                      within the GROMACS molecular dynamics package as an
                      alternative Coulomb solver.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000578560700001},
      doi          = {10.1177/1094342020964857},
      url          = {https://juser.fz-juelich.de/record/889151},
}