% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Kohnke:889151,
author = {Kohnke, Bartosz and Kutzner, Carsten and Beckmann, Andreas
and Lube, Gert and Kabadshow, Ivo and Dachsel, Holger and
Grubmüller, Helmut},
title = {{A} {CUDA} fast multipole method with highly efficient
{M}2{L} far field evaluation},
journal = {The international journal of high performance computing
applications},
volume = {35},
number = {1},
issn = {1741-2846},
address = {Thousand Oaks, Calif.},
publisher = {Sage Science Press},
reportid = {FZJ-2021-00076},
pages = {97 - 117},
year = {2021},
abstract = {Solving an N-body problem, electrostatic or gravitational,
is a crucial task and the main computational bottleneck in
many scientific applications. Its direct solution is an
ubiquitous showcase example for the compute power of
graphics processing units (GPUs). However, the naïve
pairwise summation has 𝒪(𝑁2) computational complexity.
The fast multipole method (FMM) can reduce runtime and
complexity to 𝒪(𝑁) for any specified precision. Here,
we present a CUDA-accelerated, C++ FMM implementation for
multi particle systems with 𝑟−1 potential that are
found, e.g. in biomolecular simulations. The algorithm
involves several operators to exchange information in an
octree data structure. We focus on the Multipole-to-Local
(M2L) operator, as its runtime is limiting for the overall
performance. We propose, implement and benchmark three
different M2L parallelization approaches. Approach (1)
utilizes Unified Memory to minimize programming and porting
efforts. It achieves decent speedups for only little
implementation work. Approach (2) employs CUDA Dynamic
Parallelism to significantly improve performance for high
approximation accuracies. The presorted list-based approach
(3) fits periodic boundary conditions particularly well. It
exploits FMM operator symmetries to minimize both memory
access and the number of complex multiplications. The result
is a compute-bound implementation, i.e. performance is
limited by arithmetic operations rather than by memory
accesses. The complete CUDA parallelized FMM is incorporated
within the GROMACS molecular dynamics package as an
alternative Coulomb solver.},
cin = {JSC},
ddc = {004},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511)},
pid = {G:(DE-HGF)POF4-5112},
typ = {PUB:(DE-HGF)16},
UT = {WOS:000578560700001},
doi = {10.1177/1094342020964857},
url = {https://juser.fz-juelich.de/record/889151},
}