% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article describing the AI4HPC library (Front. High Perform. Comput.,
% volume 2, 2024, article number 1444337).
% The fields reportid, internalnote, cin, cid, pnm, pid and typ are
% non-standard; unknown fields are ignored by standard bibliography styles.
% NOTE(review): they look like repository bookkeeping metadata -- confirm
% before removing any of them.
@article{Sarma:1031520,
      author       = {Sarma, Rakesh and Inanc, Eray and Aach, Marcel and
                      Lintermann, Andreas},
      title        = {Parallel and scalable {AI} in {HPC} systems for {CFD}
                      applications and beyond},
      journal      = {Frontiers in High Performance Computing},
      shortjournal = {Front. High Perform. Comput.},
      volume       = {2},
      issn         = {2813-7337},
      reportid     = {FZJ-2024-05715},
      pages        = {1444337},
      year         = {2024},
      internalnote = {Missing Journal: Frontiers in High Performance Computing
                      (Front. High Perform. Comput.) = 2813-7337 (import from
                      CrossRef, Journals: juser.fz-juelich.de); Please add the
                      journal to the list of journals},
      abstract     = {This manuscript presents the library AI4HPC with its
                      architecture and components. The library enables large-scale
                      trainings of AI models on High-Performance Computing
                      systems. It addresses challenges in handling non-uniform
                      datasets through data manipulation routines, model
                      complexity through specialized ML architectures, scalability
                      through extensive code optimizations that augment
                      performance, HyperParameter Optimization (HPO), and
                      performance monitoring. The scalability of the library is
                      demonstrated by strong scaling experiments on up to 3,664
                      Graphical Processing Units (GPUs) resulting in a scaling
                      efficiency of $96\%$, using the performance on 1 node as
                      baseline. Furthermore, code optimizations and
                      communication/computation bottlenecks are discussed for
                      training a neural network on an actuated Turbulent Boundary
                      Layer (TBL) simulation dataset (8.3 TB) on the HPC system
                      JURECA at the Jülich Supercomputing Centre. The distributed
                      training approach significantly influences the accuracy,
                      which can be drastically compromised by varying mini-batch
                      sizes. Therefore, AI4HPC implements learning rate scaling
                      and adaptive summation algorithms, which are tested and
                      evaluated in this work. For the TBL use case, results scaled
                      up to 64 workers are shown. A further increase in the number
                      of workers causes an additional overhead due to too small
                      dataset samples per worker. Finally, the library is applied
                      for the reconstruction of TBL flows with a convolutional
                      autoencoder-based architecture and a diffusion model. In
                      case of the autoencoder, a modal decomposition shows that
                      the network provides accurate reconstructions of the
                      underlying field and achieves a mean drag prediction error
                      of $\approx 5\%$. With the diffusion model, a reconstruction
                      error of $\approx 4\%$ is achieved when super-resolution is
                      applied to 5-fold coarsened velocity fields. The AI4HPC
                      library is agnostic to the underlying network and can be
                      adapted across various scientific and technical
                      disciplines.},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / RAISE - Research on
                      AI- and Simulation-Based Engineering at Exascale (951733)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(EU-Grant)951733},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.3389/fhpcp.2024.1444337},
      url          = {https://juser.fz-juelich.de/record/1031520},
}