% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference paper (ICML 2024). Fields reportid/subtyp/cin/cid/pnm/pid/typ are
% JuSER repository metadata; standard styles ignore unknown fields, so they are
% kept verbatim. Both year and date are retained from the repository export.
@inproceedings{Fischer:1029334,
  author       = {Fischer, Kirsten and Lindner, Javed and Dahmen, David and
                  Ringel, Zohar and Krämer, Michael and Helias, Moritz},
  title        = {Critical feature learning in deep neural networks},
  reportid     = {FZJ-2024-05061},
  year         = {2024},
  % predefined month macro (unquoted) instead of the non-standard {Jul}
  month        = jul,
  date         = {2024-07-21},
  organization = {The Forty-first International Conference on Machine
                  Learning, Wien (Austria), 21 Jul 2024 - 27 Jul 2024},
  abstract     = {A key property of neural networks driving their success is
                  their ability to learn features from data. Understanding
                  feature learning from a theoretical viewpoint is an emerging
                  field with many open questions. In this work we capture
                  finite-width effects with a systematic theory of network
                  kernels in deep non-linear neural networks. We show that the
                  Bayesian prior of the network can be written in closed form
                  as a superposition of Gaussian processes, whose kernels are
                  distributed with a variance that depends inversely on the
                  network width N. A large deviation approach, which is exact
                  in the proportional limit for the number of data points
                  P=αN→∞, yields a pair of forward-backward equations for the
                  maximum a posteriori kernels in all layers at once. We study
                  their solutions perturbatively to demonstrate how the
                  backward propagation across layers aligns kernels with the
                  target. An alternative field-theoretic formulation shows
                  that kernel adaptation of the Bayesian posterior at
                  finite-width results from fluctuations in the prior: larger
                  fluctuations correspond to a more flexible network prior and
                  thus enable stronger adaptation to data. We thus find a
                  bridge between the classical edge-of-chaos NNGP theory and
                  feature learning, exposing an intricate interplay between
                  criticality, response functions, and feature scale.},
  subtyp       = {After Call},
  cin          = {IAS-6},
  cid          = {I:(DE-Juel1)IAS-6-20130828},
  pnm          = {5232 - Computational Principles (POF4-523) / 5234 - Emerging
                  NC Architectures (POF4-523) / RenormalizedFlows -
                  Transparent Deep Learning with Renormalized Flows
                  (BMBF-01IS19077A) / MSNN - Theory of multi-scale neuronal
                  networks (HGF-SMHB-2014-2018) / ACA - Advanced Computing
                  Architectures (SO-092)},
  pid          = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
                  G:(DE-Juel-1)BMBF-01IS19077A / G:(DE-Juel1)HGF-SMHB-2014-2018 /
                  G:(DE-HGF)SO-092},
  typ          = {PUB:(DE-HGF)24},
  url          = {https://juser.fz-juelich.de/record/1029334},
}