% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Fischer:1010660,
author = {Fischer, Kirsten and Dahmen, David and Helias, Moritz},
title = {Optimal signal propagation in {ResNets} through
         residual scaling},
publisher = {arXiv},
eprint = {2305.07715},
archiveprefix = {arXiv},
reportid = {FZJ-2023-03175},
year = {2023},
abstract = {Residual networks (ResNets) have significantly better
trainability and thus performance than feed-forward networks
at large depth. Introducing skip connections facilitates
signal propagation to deeper layers. In addition, previous
works found that adding a scaling parameter for the residual
branch further improves generalization performance. While
they empirically identified a particularly beneficial range
of values for this scaling parameter, the associated
performance improvement and its universality across network
hyperparameters have yet to be understood. For feed-forward
networks (FFNets), finite-size theories have led to
important insights with regard to signal propagation and
hyperparameter tuning. We here derive a systematic
finite-size theory for ResNets to study signal propagation
and its dependence on the scaling for the residual branch.
We derive analytical expressions for the response function,
a measure of the network's sensitivity to inputs, and show
that for deep networks the empirically found values for the
scaling parameter lie within the range of maximal
sensitivity. Furthermore, we obtain an analytical expression
for the optimal scaling parameter that depends only weakly
on other network hyperparameters, such as the weight
variance, thereby explaining its universality across
hyperparameters. Overall, this work provides a framework for
theory-guided optimal scaling in ResNets and, more generally,
a theoretical framework for studying ResNets at finite widths.},
keywords = {Disordered Systems and Neural Networks (cond-mat.dis-nn)
(Other) / Machine Learning (cs.LG) (Other) / Machine
Learning (stat.ML) (Other) / FOS: Physical sciences (Other)
/ FOS: Computer and information sciences (Other)},
cin = {INM-6 / IAS-6 / INM-10},
cid = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
I:(DE-Juel1)INM-10-20170113},
pnm = {5232 - Computational Principles (POF4-523) / 5234 -
Emerging NC Architectures (POF4-523) / RenormalizedFlows -
Transparent Deep Learning with Renormalized Flows
(BMBF-01IS19077A) / MSNN - Theory of multi-scale neuronal
networks (HGF-SMHB-2014-2018) / ACA - Advanced Computing
Architectures (SO-092) / neuroIC002 - Recurrence and
stochasticity for neuro-inspired computation
(EXS-SF-neuroIC002) / GRK 2416 - GRK 2416:
MultiSenses-MultiScales: Neue Ansätze zur Aufklärung
neuronaler multisensorischer Integration (368482240)},
pid = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
G:(DE-Juel-1)BMBF-01IS19077A /
G:(DE-Juel1)HGF-SMHB-2014-2018 / G:(DE-HGF)SO-092 /
G:(DE-82)EXS-SF-neuroIC002 / G:(GEPRIS)368482240},
typ = {PUB:(DE-HGF)25},
doi = {10.48550/ARXIV.2305.07715},
url = {https://juser.fz-juelich.de/record/1010660},
}
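% A minimal sketch (assumed notation, not taken from this record) of the
% scaled residual update that the abstract refers to: with residual scaling
% parameter \alpha, activation \phi, and layer weights W^{(l)}, a ResNet
% layer updates its activations roughly as
%   x^{(l+1)} = x^{(l)} + \alpha \, \phi\bigl(W^{(l)} x^{(l)}\bigr),
% and the cited work analyses how the choice of \alpha shapes the response
% function (input sensitivity) of deep ResNets at finite width.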