% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Aach:1008234,
      author       = {Aach, Marcel and Inanc, Eray and Sarma, Rakesh and Riedel,
                      Morris and Lintermann, Andreas},
      title        = {{L}arge scale performance analysis of distributed deep
                      learning frameworks for convolutional neural networks},
      journal      = {Journal of Big Data},
      volume       = {10},
      number       = {1},
      issn         = {2196-1115},
      address      = {Heidelberg [et al.]},
      publisher    = {SpringerOpen},
      reportid     = {FZJ-2023-02265},
      pages        = {96},
      year         = {2023},
      abstract     = {Continuously increasing data volumes from multiple sources,
                       such as simulations and experimental measurements, demand
                       efficient algorithms for analysis within a realistic
                       timeframe. Deep learning models have proven to be capable of
                      understanding and analyzing large quantities of data with
                      high accuracy. However, training them on massive datasets
                      remains a challenge and requires distributed learning
                      exploiting High-Performance Computing systems. This study
                      presents a comprehensive analysis and comparison of three
                       well-established distributed deep learning frameworks
                       (Horovod, DeepSpeed, and PyTorch's Distributed Data
                       Parallel) with a focus on their runtime performance and
                       scalability.
                      Additionally, the performance of two data loaders, the
                      native PyTorch data loader and the DALI data loader by
                      NVIDIA, is investigated. To evaluate these frameworks and
                      data loaders, three standard ResNet architectures with 50,
                      101, and 152 layers are tested using the ImageNet dataset.
                      The impact of different learning rate schedulers on
                      validation accuracy is also assessed. The novel contribution
                      lies in the detailed analysis and comparison of these
                      frameworks and data loaders on the state-of-the-art Jülich
                      Wizard for European Leadership Science (JUWELS) Booster
                       system at the Jülich Supercomputing Centre, using up to
                       1024 NVIDIA A100 GPUs in parallel. Findings show that the
                      DALI data loader significantly reduces the overall runtime
                      of ResNet50 from more than 12 h on 4 GPUs to less than 200 s
                      on 1024 GPUs. The outcomes of this work highlight the
                      potential impact of distributed deep learning using
                      efficient tools on accelerating scientific discoveries and
                      data-driven applications.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation $\&$ Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / 5112 - Cross-Domain
                      Algorithms, Tools, Methods Labs (ATMLs) and Research Groups
                      (POF4-511) / RAISE - Research on AI- and Simulation-Based
                      Engineering at Exascale (951733)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-5112 /
                      G:(EU-Grant)951733},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:001005042700001},
      doi          = {10.1186/s40537-023-00765-w},
      url          = {https://juser.fz-juelich.de/record/1008234},
}