% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article: Mayani et al., ACM Transactions on Mathematical Software
% 51(3), 2025 (DOI 10.1145/3748815).
% Only the proper noun "Poisson" is brace-protected in the title, so the
% bibliography style stays free to apply its own casing to the other words.
% The fields reportid, cin, ddc, cid, pnm, pid and typ are not standard BibTeX
% fields; presumably they are repository bookkeeping metadata (see the record
% URL) and are ignored by standard styles -- confirm before removing them.
@article{Mayani:1049564,
      author       = {Mayani, Sonali and Montanaro, Veronica and Cerfon, Antoine
                      and Frey, Matthias and Muralikrishnan, Sriramkrishnan and
                      Adelmann, Andreas},
      title        = {A Massively Parallel Performance Portable Free-Space
                      Spectral {Poisson} Solver},
      journal      = {{ACM} Transactions on Mathematical Software},
      volume       = {51},
      number       = {3},
      issn         = {0098-3500},
      address      = {New York, NY},
      publisher    = {ACM},
      reportid     = {FZJ-2025-05368},
      pages        = {1--23},
      year         = {2025},
      abstract     = {Vico et al. suggest a fast algorithm for computing volume
                      potentials, beneficial to fields with problems requiring the
                      solution of the free-space Poisson’s equation, such as
                      beam and plasma physics. Currently, the standard is the
                      algorithm of Hockney and Eastwood, with second order in
                      convergence at best. The algorithm proposed by Vico et al.
                      converges spectrally for sufficiently smooth functions,
                      i.e., faster than any fixed order in the number of grid
                      points. We implement a performance portable version of the
                      traditional Hockney-Eastwood and the novel Vico-Greengard
                      Poisson solver as part of the Independent Parallel Particle
                      Layer (IPPL) library. For sufficiently smooth source
                      functions, the Vico-Greengard algorithm achieves higher
                      accuracy than the Hockney-Eastwood method with the same grid
                      size, reducing the computational demands of high-resolution
                      simulations since one could use coarser grids to achieve
                      them. Additionally, we propose an improvement to the
                      Vico-Greengard method which further reduces its memory
                      footprint. This is important for GPUs, which have limited
                      memory, and should be taken into account when selecting
                      numerical algorithms for performance portable codes.
                      Finally, we showcase performance through GPU and CPU scaling
                      studies on the Perlmutter (NERSC) supercomputer, with
                      efficiencies staying above $50\%$ in the strong scaling
                      case. To showcase portability, we also run the scaling
                      studies on the Alps supercomputer at CSCS, Switzerland and
                      the GPU partition of the Lumi supercomputer at CSC,
                      Finland.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.1145/3748815},
      url          = {https://juser.fz-juelich.de/record/1049564},
}