% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@PHDTHESIS{Fischer:1041036,
      author       = {Fischer, Kirsten},
      title        = {{M}echanics of deep neural networks beyond the {G}aussian
                      limit},
      volume       = {110},
      school       = {RWTH Aachen University},
      type         = {Dissertation},
      address      = {Jülich},
      publisher    = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
      reportid     = {FZJ-2025-02100},
      isbn         = {978-3-95806-815-5},
      series       = {Schriften des Forschungszentrums Jülich Reihe Information
                      / Information},
      pages        = {165},
      year         = {2025},
      note         = {Dissertation, RWTH Aachen University, 2025},
      abstract     = {Current developments in the field of artificial
                      intelligence and neural network technology outpace our
                      theoretical understanding of these networks. In the limit of
                      infinite width, networks at initialization are well
                      described by the neural network Gaussian process (NNGP): the
                      distribution of outputs is a zero-mean Gaussian
                      characterized by its covariance or kernel across data
                      samples. In the lazy learning regime, where network
                      parameters change only slightly from their initial values,
                      the neural tangent kernel characterizes networks trained
                      with gradient descent. Despite the success of these Gaussian
                      limits for deep neural networks, they do not capture
                      important properties such as network trainability or feature
                      learning. In this work, we go beyond Gaussian limits of deep
                      neural networks by obtaining higher-order corrections from
                      field-theoretic descriptions of neural networks. From a
                      statistical point of view, two complementary averages have
                      to be considered: the distribution over data samples and the
                      distribution over network parameters. We investigate both
                      cases, gaining insights into the working mechanisms of deep
                      neural networks. In the former case, we study how data
                      statistics are transformed across network layers to solve
                      classification tasks. We find that, while the hidden layers
                      are well described by a non-linear mapping of the Gaussian
                      statistics, the input layer extracts information from
                      higher-order cumulants of the data. The developed
                      theoretical framework allows us to investigate the relevance
                      of different cumulant orders for classification: On MNIST,
                      Gaussian statistics account for most of the classification
                      performance, and higher-order cumulants are needed only to
                      fine-tune the networks for the last few percent. In
                      contrast, more complex data sets such as CIFAR-10 require
                      the inclusion of higher-order cumulants for reasonable
                      performance values, explaining why fully-connected
                      networks underperform compared to convolutional networks.
                      In the latter case, we investigate
                      two different aspects: First, we derive the network kernels
                      for the Bayesian network posterior of fully-connected
                      networks and observe a non-linear adaptation of the kernels
                      to the target, which is not present in the NNGP. These
                      feature corrections result from fluctuation corrections to
                      the NNGP in finite-size networks, which allow the networks to
                      adapt to the data. While fluctuations become larger near
                      criticality, we uncover a trade-off between criticality and
                      feature learning scales in networks as a driving mechanism
                      for feature learning. Second, we study network trainability
                      of residual networks by deriving the network prior at
                      initialization. From this, we obtain the response function
                      as a leading-order correction to the NNGP, which describes
                      the signal propagation in networks. We find that scaling the
                      residual branch by a hyperparameter improves signal
                      propagation since it avoids saturation of the non-linearity
                      and thus information loss. Finally, we observe a strong
                      dependence of the optimal scaling of the residual branch on
                      the network depth but only a weak dependence on other
                      network hyperparameters, which explains the
                      universal success of depth-dependent scaling of the residual
                      branch. Overall, we derive statistical field theories for
                      deep neural networks that allow us to obtain systematic
                      corrections to the Gaussian limits. In this way, we take a
                      step towards a better mechanistic understanding of
                      information processing and data representations in neural
                      networks.},
      cin          = {IAS-6},
      cid          = {I:(DE-Juel1)IAS-6-20130828},
      pnm          = {5232 - Computational Principles (POF4-523) / 5234 -
                      Emerging NC Architectures (POF4-523) / MSNN - Theory of
                      multi-scale neuronal networks (HGF-SMHB-2014-2018) /
                      RenormalizedFlows - Transparent Deep Learning with
                      Renormalized Flows (BMBF-01IS19077A) / ACA - Advanced
                      Computing Architectures (SO-092) / neuroIC002 - Recurrence
                      and stochasticity for neuro-inspired computation
                      (EXS-SF-neuroIC002)},
      pid          = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
                      G:(DE-Juel1)HGF-SMHB-2014-2018 /
                      G:(DE-Juel-1)BMBF-01IS19077A / G:(DE-HGF)SO-092 /
                      G:(DE-82)EXS-SF-neuroIC002},
      typ          = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
      urn          = {urn:nbn:de:0001-2504220916057.519614615515},
      doi          = {10.34734/FZJ-2025-02100},
      url          = {https://juser.fz-juelich.de/record/1041036},
}