% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Entry for record 1028864 (see the `url` field).
%  - `booktitle` carries the event name, place and dates; it is a required
%    field for this entry type.
%  - `year`/`month` are kept alongside the ISO `date` so that both classic
%    BibTeX styles and biblatex/biber can resolve the date.
%  - reportid, subtyp, cin, cid, pnm, pid and typ are non-standard fields
%    (presumably repository metadata); standard styles ignore them.
@inproceedings{Baumann:1028864,
      author       = {Baumann, Thomas and Speck, Robert},
      title        = {Porting {mpi4py-fft} to {GPU}},
      booktitle    = {16th {JLESC} Workshop, Kobe (Japan), 16 Apr 2024 --
                      18 Apr 2024},
      year         = {2024},
      month        = apr,
      date         = {2024-04-16},
      reportid     = {FZJ-2024-04850},
      abstract     = {The mpi4py-fft library enables distributed fast Fourier
                      transforms on CPUs with an easy to use interface and scales
                      very well. We attempt to port this to GPUs, which
                      significantly outperform the CPU counterpart at a given node
                      count. While the porting is straightforward for the most
                      part, the best communication strategy is still an open
                      question for us. The algorithm relies on MPI alltoallw. Even
                      with CUDA-aware MPI, this exhibits very poor performance on
                      the Juelich computers. By replacing it with a custom
                      communication strategy, throughput can be increased at a
                      slight loss of generality. We would like to discuss
                      optimising the strategy, or even if the performance of
                      alltoallw can be increased by some measure.},
      subtyp       = {After Call},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / JLESC - Joint Laboratory
                      for Extreme Scale Computing (JLESC-20150708) / RGRSE - RG
                      Research Software Engineering for HPC (RG RSE) (RG-RSE)},
      pid          = {G:(DE-HGF)POF4-5112 / G:(DE-Juel1)JLESC-20150708 /
                      G:(DE-Juel-1)RG-RSE},
      typ          = {PUB:(DE-HGF)6},
      doi          = {10.34734/FZJ-2024-04850},
      url          = {https://juser.fz-juelich.de/record/1028864},
}