% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
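%
% A minimal sketch of the biblatex/biber route mentioned above (the document
% class, package options, and file name are illustrative assumptions, not part
% of this record):
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % assumed name of this .bib file
%   \begin{document}
%   Residual networks \autocite{Fischer:1048776} ...
%   \printbibliography
%   \end{document}
%
% Run pdflatex, then biber, then pdflatex twice.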
@ARTICLE{Fischer:1048776,
author = {Fischer, Kirsten and Dahmen, David and Helias, Moritz},
title = {{F}ield theory for optimal signal propagation in residual
networks},
  journal = {Physical Review E},
volume = {112},
number = {6},
issn = {2470-0045},
  address = {College Park, MD},
  publisher = {American Physical Society},
reportid = {FZJ-2025-04891},
pages = {065301},
year = {2025},
abstract = {Residual networks have significantly better trainability
and thus performance than feed-forward networks at large
depth. Introducing skip connections facilitates signal
propagation to deeper layers. In addition, previous works
found that adding a scaling parameter for the residual
branch further improves generalization performance. While
they empirically identified a particularly beneficial range
of values for this scaling parameter, the mechanism for the
resulting performance improvement and its universality
             across network hyperparameters remain open questions. For
             feed-forward networks, finite-size theories have led to
             important insights into signal propagation and
             hyperparameter tuning. Here we derive a systematic
finite-size field theory for residual networks to study
             signal propagation and its dependence on the scaling of the
             residual branch. We derive analytical expressions for the
             response function, a measure of the network’s sensitivity
to inputs, and show that for deep networks the empirically
found values for the scaling parameter lie within the range
of maximal sensitivity. Furthermore, we obtain an analytical
expression for the optimal scaling parameter that depends
only weakly on other network hyperparameters, such as the
weight variance, thereby explaining its universality across
hyperparameters. Overall, this work provides a theoretical
framework to study ResNets at finite size.},
cin = {IAS-6},
ddc = {530},
cid = {I:(DE-Juel1)IAS-6-20130828},
pnm = {5232 - Computational Principles (POF4-523) / 5234 -
Emerging NC Architectures (POF4-523) / RenormalizedFlows -
Transparent Deep Learning with Renormalized Flows
(BMBF-01IS19077A) / MSNN - Theory of multi-scale neuronal
networks (HGF-SMHB-2014-2018) / ACA - Advanced Computing
Architectures (SO-092) / neuroIC002 - Recurrence and
stochasticity for neuro-inspired computation
(EXS-SF-neuroIC002) / DFG project G:(GEPRIS)491111487 -
Open-Access-Publikationskosten / 2025 - 2027 /
Forschungszentrum Jülich (OAPKFZJ) (491111487)},
pid = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
G:(DE-Juel-1)BMBF-01IS19077A /
G:(DE-Juel1)HGF-SMHB-2014-2018 / G:(DE-HGF)SO-092 /
G:(DE-82)EXS-SF-neuroIC002 / G:(GEPRIS)491111487},
typ = {PUB:(DE-HGF)16},
doi = {10.1103/5lgz-4t7h},
url = {https://juser.fz-juelich.de/record/1048776},
}
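% If classic BibTeX is preferred, the header's "bibtex8" alternative works
% with a standard natbib setup; this sketch again assumes the file is saved
% as references.bib, and the style choices are illustrative:
%
%   \documentclass{article}
%   \usepackage[numbers]{natbib}
%   \begin{document}
%   ... signal propagation in ResNets \cite{Fischer:1048776} ...
%   \bibliographystyle{unsrtnat}
%   \bibliography{references}
%   \end{document}
%
% Compile with pdflatex, run "bibtex8 <jobname>" (8-bit aware) in place of
% plain bibtex, then pdflatex twice more.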