% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper; repository record 1049541 at juser.fz-juelich.de (see url).
% booktitle holds the conference name; venue and eventdate are biblatex fields.
% reportid, subtyp, cin, cid, pnm, pid and typ are repository-specific fields
% that standard bibliography styles do not know and silently ignore.
@INPROCEEDINGS{Lengvenis:1049541,
      author       = {Lengvenis, Arijus and Dachsel, Holger and Morgenstern,
                      Laura and Kabadshow, Ivo},
      title        = {A New Spin on the Fast Multipole Method for {GPUs}:
                      Rethinking the Far-Field Operators},
      booktitle    = {2025 {IEEE} International Parallel and Distributed
                      Processing Symposium},
      venue        = {Milano, Italy},
      eventdate    = {2025-06-03/2025-06-07},
      year         = {2025},
      month        = jun,
      date         = {2025-06-03},
      abstract     = {The Fast Multipole Method (FMM) is an optimally efficient
                      algorithm for solving N-body problems: a fundamental
                      challenge in fields like astrophysics, plasma physics and
                      molecular dynamics. It is particularly suited for computing
                      1/r potentials present in Coulomb and gravitational particle
                      systems. Despite the near-field phase being trivially
                      parallelisable, the far-field phase of the 1/r FMM currently
                      lacks an efficient, massively parallel GPU algorithm fitting
                      for the era of Exascale computing. Current state-of-the-art
                      approaches either favor highly parallel but inefficient
                      expansion shift operators or asymptotically efficient but
                      poorly parallelisable rotation-based ones. Recently, a
                      breakthrough was made with the re-evaluation of a rotation
                      operator variant called fast rotation, which dramatically
                      increases caching effectiveness and marries the advantages
                      of both methods. Thus, this paper incorporates this approach
                      to create fast rotation-based operators that facilitate an
                      efficient far-field algorithm for the FMM on GPUs.
                      Additionally, a warp-centric data access scheme is
                      co-developed alongside a matching octree design, which
                      yields coalesced memory access patterns for the bottleneck
                      operators of the far-field phase. The fast rotation
                      algorithm is enhanced with a cache-tiling mechanism,
                      maximising GPU cache utilisation. Compared to the
                      state-of-the-art GPU FMM far-field implementation, our
                      algorithm achieves lower running times across the board and
                      a 2.47x speedup for an increased precision simulation, with
                      the performance improvement growing as precision increases,
                      providing concrete proof of efficacy for dense particle
                      systems.},
      reportid     = {FZJ-2025-05345},
      subtyp       = {After Call},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)6},
      url          = {https://juser.fz-juelich.de/record/1049541},
}