% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{MorenoAlvarez:1015018,
author = {Moreno-Álvarez, Sergio and Paoletti, Mercedes E. and
Cavallaro, Gabriele and Haut, Juan M.},
title = {{E}nhancing {D}istributed {N}eural {N}etwork {T}raining
{T}hrough {N}ode-{B}ased {C}ommunications},
journal = {IEEE Transactions on Neural Networks and Learning Systems},
volume = {35},
issn = {2162-237X},
address = {New York, NY},
publisher = {IEEE},
reportid = {FZJ-2023-03545},
pages = {1-15},
year = {2023},
abstract = {The amount of data needed to effectively train modern deep
            neural architectures has grown significantly, leading to
            increased computational requirements. These intensive
            computations are tackled by combining latest-generation
            computing resources, such as accelerators, with classic
            processing units. Nevertheless, gradient communication
            remains the major bottleneck, hindering efficiency despite
            the runtime improvements obtained through data parallelism
            strategies. Data parallelism involves all processes in a
            global exchange of a potentially large amount of data, which
            may prevent the desired speedup from being achieved and
            leave noticeable delays or bottlenecks in place. As a
            result, communication latency poses a significant challenge
            that profoundly impacts performance on distributed
            platforms. This research presents node-based optimization
            steps that significantly reduce the gradient exchange
            between model replicas whilst ensuring model convergence.
            The proposal serves as a versatile communication scheme,
            suitable for integration into a wide range of
            general-purpose deep neural network (DNN) algorithms. The
            optimization takes into account the specific location of
            each replica within the platform. To demonstrate its
            effectiveness, different neural network approaches and
            datasets with disjoint properties are used. In addition,
            multiple types of applications are considered to
            demonstrate the robustness and versatility of our proposal.
            The experimental results show a reduction in global
            training time whilst slightly improving accuracy.},
cin = {JSC},
ddc = {004},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
       (SDLs) and Research Groups (POF4-511)},
pid = {G:(DE-HGF)POF4-5111},
typ = {PUB:(DE-HGF)16},
pubmed = {37721884},
UT = {WOS:001071988900001},
doi = {10.1109/TNNLS.2023.3309735},
url = {https://juser.fz-juelich.de/record/1015018},
}
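% The abstract above describes a node-based scheme that reduces inter-node gradient
% traffic by taking each replica's location on the platform into account. The paper's
% actual algorithm is not reproduced here; the following is a purely illustrative,
% hypothetical sketch of the generic idea (a standard hierarchical all-reduce, not the
% authors' method) using PyTorch's torch.distributed. The function names and the
% ranks_per_node parameter are assumptions, and dist.init_process_group is assumed to
% have been called already. Kept as BibTeX comments so this file remains valid.
%
%   import torch
%   import torch.distributed as dist
%
%   def build_node_groups(ranks_per_node):
%       # Every rank must create every group collectively; each rank later
%       # uses only the group covering its own node and, if it is a node
%       # leader, the group of leaders.
%       world = dist.get_world_size()
%       groups, leaders = [], []
%       for start in range(0, world, ranks_per_node):
%           ranks = list(range(start, min(start + ranks_per_node, world)))
%           groups.append(dist.new_group(ranks))
%           leaders.append(ranks[0])
%       return groups, dist.new_group(leaders), leaders
%
%   def node_aware_average(grad, node_group, leader_group, node_leader_rank):
%       # 1) sum gradients inside the node over the fast local interconnect
%       dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=node_group)
%       # 2) only node leaders exchange the node-local sums across nodes
%       if dist.get_rank() == node_leader_rank:
%           dist.all_reduce(grad, op=dist.ReduceOp.SUM, group=leader_group)
%       # 3) leaders re-share the global sum within their node, then average
%       dist.broadcast(grad, src=node_leader_rank, group=node_group)
%       grad /= dist.get_world_size()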