% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{MorenoAlvarez:1015018,
      author       = {Moreno-Álvarez, Sergio and Paoletti, Mercedes E. and
                      Cavallaro, Gabriele and Haut, Juan M.},
      title        = {Enhancing Distributed Neural Network Training
                       Through Node-Based Communications},
      journal      = {IEEE Transactions on Neural Networks and Learning Systems},
      volume       = {35},
      issn         = {2162-237X},
      address      = {New York, NY},
      publisher    = {IEEE},
      reportid     = {FZJ-2023-03545},
      pages        = {1 - 15},
      year         = {2023},
      abstract     = {The amount of data needed to effectively train modern deep
                       neural architectures has grown significantly, leading to
                       increased computational requirements. These intensive
                       computations are tackled by combining latest-generation
                       computing resources, such as accelerators, with classic
                       processing units. Nevertheless, gradient communication
                       remains the major bottleneck, hindering efficiency despite
                       the runtime improvements obtained through data parallelism
                       strategies. Data parallelism involves all processes in a
                       global exchange of a potentially large amount of data,
                       which may impede the desired speedup and leave noticeable
                       delays or bottlenecks. As a result, communication latency
                       poses a significant challenge that profoundly impacts
                       performance on distributed platforms. This research
                       presents node-based optimization steps that significantly
                       reduce the gradient exchange between model replicas whilst
                       ensuring model convergence. The proposal serves as a
                       versatile communication scheme, suitable for integration
                       into a wide range of general-purpose deep neural network
                       (DNN) algorithms. The optimization takes into account the
                       specific location of each replica within the platform. To
                       demonstrate its effectiveness, different neural network
                       approaches and datasets with disjoint properties are used.
                       In addition, multiple types of applications are considered
                       to demonstrate the robustness and versatility of the
                       proposal. The experimental results show a reduction in
                       overall training time whilst slightly improving accuracy.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                       (SDLs) and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5111},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {37721884},
      UT           = {WOS:001071988900001},
      doi          = {10.1109/TNNLS.2023.3309735},
      url          = {https://juser.fz-juelich.de/record/1015018},
}