% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Conference paper record for repository entry 1049541 (see the url field).
% Standard fields come first. The trailing fields (reportid, subtyp, cin, cid,
% pnm, pid, typ) are repository-specific and not part of the standard
% BibTeX field set; they are kept unchanged for round-tripping.
% booktitle carries the conference name; venue and eventdate are biblatex
% fields holding the location and dates of the event.
@inproceedings{Lengvenis:1049541,
  author    = {Lengvenis, Arijus and
               Dachsel, Holger and
               Morgenstern, Laura and
               Kabadshow, Ivo},
  title     = {A New Spin on the {Fast Multipole Method} for {GPUs}:
               Rethinking the Far-Field Operators},
  booktitle = {2025 {IEEE} International Parallel and Distributed
               Processing Symposium},
  venue     = {Milano, Italy},
  eventdate = {2025-06-03/2025-06-07},
  year      = {2025},
  month     = jun,
  date      = {2025-06-03},
  url       = {https://juser.fz-juelich.de/record/1049541},
  abstract  = {The Fast Multipole Method (FMM) is an optimally efficient
               algorithm for solving N-body problems: a fundamental
               challenge in fields like astrophysics, plasma physics and
               molecular dynamics. It is particularly suited for computing
               1/r potentials present in Coulomb and gravitational particle
               systems. Despite the near-field phase being trivially
               parallelisable, the far-field phase of the 1/r FMM currently
               lacks an efficient, massively parallel GPU algorithm fitting
               for the era of Exascale computing. Current state-of-the-art
               approaches either favor highly parallel but inefficient
               expansion shift operators or asymptotically efficient but
               poorly parallelisable rotation-based ones. Recently, a
               breakthrough was made with the re-evaluation of a rotation
               operator variant called fast rotation, which dramatically
               increases caching effectiveness and marries the advantages
               of both methods. Thus, this paper incorporates this approach
               to create fast rotation-based operators that facilitate an
               efficient far-field algorithm for the FMM on GPUs.
               Additionally, a warp-centric data access scheme is
               co-developed alongside a matching octree design, which
               yields coalesced memory access patterns for the bottleneck
               operators of the far-field phase. The fast rotation
               algorithm is enhanced with a cache-tiling mechanism,
               maximising GPU cache utilisation. Compared to the
               state-of-the-art GPU FMM far-field implementation, our
               algorithm achieves lower running times across the board and
               a 2.47x speedup for an increased precision simulation, with
               the performance improvement growing as precision increases,
               providing concrete proof of efficacy for dense particle
               systems.},
  reportid  = {FZJ-2025-05345},
  subtyp    = {After Call},
  cin       = {JSC},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
               and Research Groups (POF4-511)},
  pid       = {G:(DE-HGF)POF4-5112},
  typ       = {PUB:(DE-HGF)6},
}