% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Baumann:1028864,
author = {Baumann, Thomas and Speck, Robert},
title = {{P}orting mpi4py-fft to {GPU}},
reportid = {FZJ-2024-04850},
year = {2024},
abstract = {The mpi4py-fft library enables distributed fast Fourier
transforms on CPUs with an easy-to-use interface and scales
very well. We attempt to port it to GPUs, where it
significantly outperforms the CPU counterpart at a given node
count. While the porting is straightforward for the most
part, the best communication strategy is still an open
question for us. The algorithm relies on MPI Alltoallw. Even
with CUDA-aware MPI, this exhibits very poor performance on
the Juelich computers. By replacing it with a custom
communication strategy, throughput can be increased at a
slight loss of generality. We would like to discuss
how to optimise this strategy, or whether the performance of
Alltoallw itself can be improved by some means.},
month = {Apr},
date = {2024-04-16},
organization = {16th JLESC Workshop, Kobe (Japan), 16
Apr 2024 - 18 Apr 2024},
subtyp = {After Call},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511) / JLESC - Joint Laboratory
for Extreme Scale Computing (JLESC-20150708) / RGRSE - RG
Research Software Engineering for HPC (RG RSE) (RG-RSE)},
pid = {G:(DE-HGF)POF4-5112 / G:(DE-Juel1)JLESC-20150708 /
G:(DE-Juel-1)RG-RSE},
typ = {PUB:(DE-HGF)6},
doi = {10.34734/FZJ-2024-04850},
url = {https://juser.fz-juelich.de/record/1028864},
}
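% A minimal sketch of the mpi4py-fft CPU interface referenced in the abstract,
% kept here as a BibTeX comment. The grid size, axes, and script name are
% arbitrary example values; the GPU port and the custom replacement for
% MPI Alltoallw described above are not part of this sketch.
%
%   import numpy as np
%   from mpi4py import MPI
%   from mpi4py_fft import PFFT, newDistArray
%
%   # Set up a distributed 3-D FFT over all ranks of MPI.COMM_WORLD.
%   N = (64, 64, 64)
%   fft = PFFT(MPI.COMM_WORLD, N, axes=(0, 1, 2), dtype=np.float64)
%
%   # newDistArray allocates the correctly shaped local pencil/slab on each rank.
%   u = newDistArray(fft, False)
%   u[:] = np.random.random(u.shape)
%
%   u_hat = fft.forward(u)        # distributed forward transform
%   u_back = fft.backward(u_hat)  # inverse transform, back to physical space
%
% Run with, e.g., `mpiexec -n 4 python demo.py` (hypothetical file name).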