% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Mayani:1049564,
author = {Mayani, Sonali and Montanaro, Veronica and Cerfon, Antoine
and Frey, Matthias and Muralikrishnan, Sriramkrishnan and
Adelmann, Andreas},
title = {{A Massively Parallel Performance Portable Free-Space Spectral Poisson Solver}},
journal = {ACM Transactions on Mathematical Software},
volume = {51},
number = {3},
issn = {0098-3500},
address = {New York, NY},
publisher = {ACM},
reportid = {FZJ-2025-05368},
pages = {1--23},
year = {2025},
abstract = {Vico et al. propose a fast algorithm for computing volume
potentials, which benefits fields that require solving the
free-space Poisson equation, such as beam and plasma
physics. The current standard is the algorithm of Hockney
and Eastwood, which converges at second order at best. The
algorithm proposed by Vico et al. converges spectrally for
sufficiently smooth functions, i.e., faster than any fixed
order in the number of grid points. We implement a
performance-portable version of both the traditional
Hockney-Eastwood and the novel Vico-Greengard Poisson
solvers as part of the Independent Parallel Particle Layer
(IPPL) library. For sufficiently smooth source functions,
the Vico-Greengard algorithm achieves higher accuracy than
the Hockney-Eastwood method on the same grid, reducing the
computational demands of high-resolution simulations, since
a coarser grid suffices to reach the same accuracy.
Additionally, we propose an improvement to the
Vico-Greengard method that further reduces its memory
footprint. This is important for GPUs, which have limited
memory, and should be taken into account when selecting
numerical algorithms for performance-portable codes.
Finally, we demonstrate performance through GPU and CPU
scaling studies on the Perlmutter (NERSC) supercomputer,
with efficiencies staying above $50\%$ in the strong-scaling
case. To showcase portability, we also run the scaling
studies on the Alps supercomputer at CSCS, Switzerland, and
on the GPU partition of the LUMI supercomputer at CSC,
Finland.},
cin = {JSC},
ddc = {004},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511)},
pid = {G:(DE-HGF)POF4-5112},
typ = {PUB:(DE-HGF)16},
doi = {10.1145/3748815},
url = {https://juser.fz-juelich.de/record/1049564},
}