% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Aach:1008234,
author = {Aach, Marcel and Inanc, Eray and Sarma, Rakesh and Riedel,
Morris and Lintermann, Andreas},
title = {{L}arge scale performance analysis of distributed deep
learning frameworks for convolutional neural networks},
journal = {Journal of Big Data},
volume = {10},
number = {1},
issn = {2196-1115},
address = {Heidelberg [et al.]},
publisher = {SpringerOpen},
reportid = {FZJ-2023-02265},
pages = {96},
year = {2023},
abstract = {Continuously increasing data volumes from multiple sources,
such as simulation and experimental measurements, demand
efficient algorithms for an analysis within a realistic
timeframe. Deep learning models have proven to be capable of
understanding and analyzing large quantities of data with
high accuracy. However, training them on massive datasets
remains a challenge and requires distributed learning
exploiting High-Performance Computing systems. This study
presents a comprehensive analysis and comparison of three
well-established distributed deep learning frameworks -
Horovod, DeepSpeed, and Distributed Data Parallel by PyTorch
- with a focus on their runtime performance and scalability.
Additionally, the performance of two data loaders, the
native PyTorch data loader and the DALI data loader by
NVIDIA, is investigated. To evaluate these frameworks and
data loaders, three standard ResNet architectures with 50,
101, and 152 layers are tested using the ImageNet dataset.
The impact of different learning rate schedulers on
validation accuracy is also assessed. The novel contribution
lies in the detailed analysis and comparison of these
frameworks and data loaders on the state-of-the-art Jülich
Wizard for European Leadership Science (JUWELS) Booster
system at the Jülich Supercomputing Centre, using up to
1024 A100 NVIDIA GPUs in parallel. Findings show that the
DALI data loader significantly reduces the overall runtime
of ResNet50 from more than 12 h on 4 GPUs to less than 200 s
on 1024 GPUs. The outcomes of this work highlight the
potential impact of distributed deep learning using
efficient tools on accelerating scientific discoveries and
data-driven applications.},
cin = {JSC},
ddc = {004},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
(SDLs) and Research Groups (POF4-511) / 5112 - Cross-Domain
Algorithms, Tools, Methods Labs (ATMLs) and Research Groups
(POF4-511) / RAISE - Research on AI- and Simulation-Based
Engineering at Exascale (951733)},
pid = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-5112 /
G:(EU-Grant)951733},
typ = {PUB:(DE-HGF)16},
UT = {WOS:001005042700001},
doi = {10.1186/s40537-023-00765-w},
url = {https://juser.fz-juelich.de/record/1008234},
}