% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@PHDTHESIS{Fischer:1041036,
author = {Fischer, Kirsten},
title = {{M}echanics of deep neural networks beyond the {G}aussian
limit},
volume = {110},
school = {RWTH Aachen University},
type = {Dissertation},
address = {Jülich},
publisher = {Forschungszentrum Jülich GmbH Zentralbibliothek, Verlag},
reportid = {FZJ-2025-02100},
isbn = {978-3-95806-815-5},
series = {Schriften des Forschungszentrums Jülich Reihe Information
/ Information},
pages = {165},
year = {2025},
note = {Dissertation, RWTH Aachen University, 2025},
abstract = {Current developments in the field of artificial
intelligence and neural network technology outpace our
theoretical understanding of these networks. In the limit of
infinite width, networks at initialization are well
described by the neural network Gaussian process (NNGP): the
distribution of outputs is a zero-mean Gaussian
characterized by its covariance or kernel across data
samples. In the lazy learning regime, where network
parameters change only slightly from their initial values,
the neural tangent kernel characterizes networks trained
with gradient descent. Despite the success of these Gaussian
limits for deep neural networks, they do not capture
important properties such as network trainability or feature
learning. In this work, we go beyond Gaussian limits of deep
neural networks by obtaining higher-order corrections from
field-theoretic descriptions of neural networks. From a
statistical point of view, two complementary averages have
to be considered: the distribution over data samples and the
distribution over network parameters. We investigate both
cases, gaining insights into the working mechanisms of deep
neural networks. In the former case, we study how data
statistics are transformed across network layers to solve
classification tasks. We find that, while the hidden layers
are well described by a non-linear mapping of the Gaussian
statistics, the input layer extracts information from
higher-order cumulants of the data. The developed
theoretical framework allows us to investigate the relevance
of different cumulant orders for classification: On MNIST,
Gaussian statistics account for most of the classification
performance, and higher-order cumulants are required to
fine-tune the networks for the last few percentage points. In
contrast, more complex data sets such as CIFAR-10 require
the inclusion of higher-order cumulants for reasonable
performance, explaining why fully-connected networks
perform worse than
convolutional networks. In the latter case, we investigate
two different aspects: First, we derive the network kernels
for the Bayesian network posterior of fully-connected
networks and observe a non-linear adaptation of the kernels
to the target, which is not present in the NNGP. These
feature corrections result from fluctuation corrections to
the NNGP in finite-size networks, which allow the networks to
adapt to the data. While fluctuations become larger near
criticality, we uncover a trade-off between criticality and
feature learning scales in networks as a driving mechanism
for feature learning. Second, we study network trainability
of residual networks by deriving the network prior at
initialization. From this, we obtain the response function
as a leading-order correction to the NNGP, which describes
the signal propagation in networks. We find that scaling the
residual branch by a hyperparameter improves signal
propagation since it avoids saturation of the non-linearity
and thus information loss. Finally, we observe a strong
dependence of the optimal scaling of the residual branch on
the network depth but only a weak dependence on other
network hyperparameters, which explains the
universal success of depth-dependent scaling of the residual
branch. Overall, we derive statistical field theories for
deep neural networks that allow us to obtain systematic
corrections to the Gaussian limits. In this way, we take a
step towards a better mechanistic understanding of
information processing and data representations in neural
networks.},
cin = {IAS-6},
cid = {I:(DE-Juel1)IAS-6-20130828},
pnm = {5232 - Computational Principles (POF4-523) / 5234 -
Emerging NC Architectures (POF4-523) / MSNN - Theory of
multi-scale neuronal networks (HGF-SMHB-2014-2018) /
RenormalizedFlows - Transparent Deep Learning with
Renormalized Flows (BMBF-01IS19077A) / ACA - Advanced
Computing Architectures (SO-092) / neuroIC002 - Recurrence
and stochasticity for neuro-inspired computation
(EXS-SF-neuroIC002)},
pid = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
G:(DE-Juel1)HGF-SMHB-2014-2018 /
G:(DE-Juel-1)BMBF-01IS19077A / G:(DE-HGF)SO-092 /
G:(DE-82)EXS-SF-neuroIC002},
typ = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
urn = {urn:nbn:de:0001-2504220916057.519614615515},
doi = {10.34734/FZJ-2025-02100},
url = {https://juser.fz-juelich.de/record/1041036},
}