% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
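%
% As a quick illustration (not part of the exported record), a minimal LaTeX
% preamble that loads this file through biblatex with the biber backend might
% look as follows; the file name "references.bib" is an assumption made only
% for this example.
%
%   \documentclass{article}
%   \usepackage[utf8]{inputenc}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % this .bib file, name assumed
%   \begin{document}
%   Residual scaling theory~\cite{Fischer:1048776}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex again to resolve the citation.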

@ARTICLE{Fischer:1048776,
      author       = {Fischer, Kirsten and Dahmen, David and Helias, Moritz},
      title        = {{F}ield theory for optimal signal propagation in residual
                      networks},
      journal      = {Physical Review E},
      volume       = {112},
      number       = {6},
      issn         = {2470-0045},
      address      = {Woodbury, NY},
      publisher    = {American Physical Society},
      reportid     = {FZJ-2025-04891},
      pages        = {065301},
      year         = {2025},
      abstract     = {Residual networks have significantly better trainability
                      and thus performance than feed-forward networks at large
                      depth. Introducing skip connections facilitates signal
                      propagation to deeper layers. In addition, previous works
                      found that adding a scaling parameter for the residual
                      branch further improves generalization performance. While
                      they empirically identified a particularly beneficial range
                      of values for this scaling parameter, the mechanism for the
                      resulting performance improvement and its universality
                      across network hyperparameters remain an open question. For
                      feed-forward networks, finite-size theories have led to
                      important insights with regard to signal propagation and
                      hyperparameter tuning. We here derive a systematic
                      finite-size field theory for residual networks to study
                      signal propagation and its dependence on the scaling for the
                      residual branch. We derive analytical expressions for the
                      response function, a measure for the network’s sensitivity
                      to inputs, and show that for deep networks the empirically
                      found values for the scaling parameter lie within the range
                      of maximal sensitivity. Furthermore, we obtain an analytical
                      expression for the optimal scaling parameter that depends
                      only weakly on other network hyperparameters, such as the
                      weight variance, thereby explaining its universality across
                      hyperparameters. Overall, this work provides a theoretical
                      framework to study ResNets at finite size.},
      cin          = {IAS-6},
      ddc          = {530},
      cid          = {I:(DE-Juel1)IAS-6-20130828},
      pnm          = {5232 - Computational Principles (POF4-523) / 5234 -
                      Emerging NC Architectures (POF4-523) / RenormalizedFlows -
                      Transparent Deep Learning with Renormalized Flows
                      (BMBF-01IS19077A) / MSNN - Theory of multi-scale neuronal
                      networks (HGF-SMHB-2014-2018) / ACA - Advanced Computing
                      Architectures (SO-092) / neuroIC002 - Recurrence and
                      stochasticity for neuro-inspired computation
                      (EXS-SF-neuroIC002) / DFG project G:(GEPRIS)491111487 -
                      Open-Access-Publikationskosten / 2025 - 2027 /
                      Forschungszentrum Jülich (OAPKFZJ) (491111487)},
      pid          = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
                      G:(DE-Juel-1)BMBF-01IS19077A /
                      G:(DE-Juel1)HGF-SMHB-2014-2018 / G:(DE-HGF)SO-092 /
                      G:(DE-82)EXS-SF-neuroIC002 / G:(GEPRIS)491111487},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.1103/5lgz-4t7h},
      url          = {https://juser.fz-juelich.de/record/1048776},
}
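%
% For orientation only: the abstract refers to a scaling parameter on the
% residual branch. In the standard ResNet parameterization (a generic sketch,
% not necessarily the exact convention used in the paper), the layer update
% with residual scaling $\alpha$ reads
%
%   \begin{equation}
%     x^{(l+1)} = x^{(l)} + \alpha\, \phi\!\left(W^{(l)} x^{(l)}\right),
%   \end{equation}
%
% where $\phi$ is the activation function and $W^{(l)}$ the layer weights;
% the paper analyzes how the network's sensitivity to inputs, quantified by
% the response function, depends on $\alpha$.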