% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@article{Sarma:1031520,
  author   = {Sarma, Rakesh and Inanc, Eray and Aach, Marcel and
              Lintermann, Andreas},
  title    = {Parallel and Scalable {AI} in {HPC} Systems for {CFD}
              Applications and Beyond},
  journal  = {Frontiers in High Performance Computing},
  volume   = {2},
  pages    = {1444337},
  year     = {2024},
  issn     = {2813-7337},
  reportid = {FZJ-2024-05715},
  abstract = {This manuscript presents the library AI4HPC with its
              architecture and components. The library enables large-scale
              trainings of AI models on High-Performance Computing
              systems. It addresses challenges in handling non-uniform
              datasets through data manipulation routines, model
              complexity through specialized ML architectures, scalability
              through extensive code optimizations that augment
              performance, HyperParameter Optimization (HPO), and
              performance monitoring. The scalability of the library is
              demonstrated by strong scaling experiments on up to 3,664
              Graphical Processing Units (GPUs) resulting in a scaling
              efficiency of $96\,\%$, using the performance on 1 node as
              baseline. Furthermore, code optimizations and
              communication/computation bottlenecks are discussed for
              training a neural network on an actuated Turbulent Boundary
              Layer (TBL) simulation dataset (8.3 TB) on the HPC system
              JURECA at the Jülich Supercomputing Centre. The distributed
              training approach significantly influences the accuracy,
              which can be drastically compromised by varying mini-batch
              sizes. Therefore, AI4HPC implements learning rate scaling
              and adaptive summation algorithms, which are tested and
              evaluated in this work. For the TBL use case, results scaled
              up to 64 workers are shown. A further increase in the number
              of workers causes an additional overhead due to too small
              dataset samples per worker. Finally, the library is applied
              for the reconstruction of TBL flows with a convolutional
              autoencoder-based architecture and a diffusion model. In
              case of the autoencoder, a modal decomposition shows that
              the network provides accurate reconstructions of the
              underlying field and achieves a mean drag prediction error
              of $\approx 5\,\%$. With the diffusion model, a
              reconstruction error of $\approx 4\,\%$ is achieved when
              super-resolution is applied to 5-fold coarsened velocity
              fields. The AI4HPC library is agnostic to the underlying
              network and can be adapted across various scientific and
              technical disciplines.},
  cin      = {JSC},
  cid      = {I:(DE-Juel1)JSC-20090406},
  pnm      = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
              (SDLs) and Research Groups (POF4-511) / RAISE - Research on
              AI- and Simulation-Based Engineering at Exascale (951733)},
  pid      = {G:(DE-HGF)POF4-5111 / G:(EU-Grant)951733},
  typ      = {PUB:(DE-HGF)16},
  doi      = {10.3389/fhpcp.2024.1444337},
  url      = {https://juser.fz-juelich.de/record/1031520},
}