% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Malapally:1005609,
      author       = {Malapally, Nitin and Bolnykh, Viacheslav and Suarez, Estela
                      and Carloni, Paolo and Lippert, Thomas and Mandelli, Davide},
      title        = {Scalability of {3D-DFT} by block tensor-matrix
                      multiplication on the {JUWELS} {Cluster}},
      reportid     = {FZJ-2023-01559},
      year         = {2023},
      abstract     = {The 3D Discrete Fourier Transform (DFT) is a technique used
                      to solve problems in disparate fields. Nowadays, the
                      commonly adopted implementation of the 3D-DFT is derived
                      from the Fast Fourier Transform (FFT) algorithm. However,
                      evidence indicates that the distributed memory 3D-FFT
                      algorithm does not scale well due to its use of all-to-all
                      communication. Here, building on the work of Sedukhin et al.
                      [Proceedings of the 30th International Conference on
                      Computers and Their Applications, CATA 2015 pp. 193--200 (01
                      2015)], we revisit the possibility of improving the scaling
                      of the 3D-DFT by using an alternative approach that uses
                      point-to-point communication, albeit at a higher arithmetic
                      complexity. The new algorithm exploits tensor-matrix
                      multiplications on a volumetrically decomposed domain via
                      three specially adapted variants of Cannon's algorithm. It
                      has here been implemented as a C++ library called S3DFT and
                      tested on the JUWELS Cluster at the J{\"u}lich
                      Supercomputing Center. Our implementation of the shared
                      memory tensor-matrix multiplication attained 88\% of the
                      theoretical single node peak performance. One variant of the
                      distributed memory tensor-matrix multiplication shows
                      excellent scaling, while the other two show poorer
                      performance, which can be attributed to their intrinsic
                      communication patterns. A comparison of S3DFT with the Intel
                      MKL and FFTW3 libraries indicates that currently iMKL
                      performs best overall, followed in order by FFTW3 and S3DFT.
                      This picture might change with further improvements of the
                      algorithm and/or when running on clusters that use network
                      connections with higher latency, e.g. on cloud platforms.},
      cin          = {IAS-5 / INM-9 / JSC},
      cid          = {I:(DE-Juel1)IAS-5-20120330 / I:(DE-Juel1)INM-9-20140121 /
                      I:(DE-Juel1)JSC-20090406},
      pnm          = {5241 - Molecular Information Processing in Cellular Systems
                      (POF4-524) / 5121 - Supercomputing \& Big Data Facilities
                      (POF4-512)},
      pid          = {G:(DE-HGF)POF4-5241 / G:(DE-HGF)POF4-5121},
      typ          = {PUB:(DE-HGF)25},
      url          = {https://juser.fz-juelich.de/record/1005609},
}