% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
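%
% A minimal usage sketch for citing the record below (hedged: the filename
% "references.bib" and the biblatex options are illustrative choices, not
% part of this record):
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % this file, saved under a name of your choice
%   \begin{document}
%   A multi-scale adaptive theory of feature learning \cite{Rubin:1052069}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex twice, so that the
% non-ASCII author names (e.g. Krämer) are handled correctly.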

@INPROCEEDINGS{Rubin:1052069,
      author       = {Rubin, Noa and Fischer, Kirsten and Lindner, Javed and
                      Dahmen, David and Seroussi, Inbar and Ringel, Zohar and
                      Krämer, Michael and Helias, Moritz},
      title        = {{F}rom {K}ernels to {F}eatures: {A} {M}ulti-{S}cale
                      {A}daptive {T}heory of {F}eature {L}earning},
      reportid     = {FZJ-2026-00739},
      year         = {2025},
      abstract     = {Feature learning in neural networks is crucial for
                      their expressive power and inductive biases, motivating
                      various theoretical approaches. Some approaches describe
                      network behavior after training through a change in
                      kernel scale from initialization, resulting in a
                      generalization power comparable to a Gaussian process.
                      Conversely, in other approaches training results in the
                      adaptation of the kernel to the data, involving
                      directional changes to the kernel. The relationship and
                      respective strengths of these two views have so far
                      remained unresolved. This work presents a theoretical
                      framework of multi-scale adaptive feature learning
                      bridging these two views. Using methods from statistical
                      mechanics, we derive analytical expressions for network
                      output statistics which are valid across scaling regimes
                      and in the continuum between them. A systematic
                      expansion of the network’s probability distribution
                      reveals that mean-field scaling requires only a
                      saddle-point approximation, while standard scaling
                      necessitates additional correction terms. Remarkably, we
                      find across regimes that kernel adaptation can be
                      reduced to an effective kernel rescaling when predicting
                      the mean network output in the special case of a linear
                      network. However, for linear and non-linear networks,
                      the multi-scale adaptive approach captures directional
                      feature learning effects, providing richer insights than
                      what could be recovered from a rescaling of the kernel
                      alone.},
      month        = {Jul},
      date         = {2025-07-13},
      organization = {The 42nd International Conference on
                      Machine Learning, Vancouver (Canada),
                      13 Jul 2025 - 19 Jul 2025},
      subtyp       = {After Call},
      cin          = {IAS-6},
      cid          = {I:(DE-Juel1)IAS-6-20130828},
      pnm          = {5232 - Computational Principles (POF4-523) / 5234 -
                      Emerging NC Architectures (POF4-523) / MSNN - Theory of
                      multi-scale neuronal networks (HGF-SMHB-2014-2018) / ACA -
                      Advanced Computing Architectures (SO-092) / GRK 2416 - GRK
                      2416: MultiSenses-MultiScales: Neue Ansätze zur Aufklärung
                      neuronaler multisensorischer Integration (368482240)},
      pid          = {G:(DE-HGF)POF4-5232 / G:(DE-HGF)POF4-5234 /
                      G:(DE-Juel1)HGF-SMHB-2014-2018 / G:(DE-HGF)SO-092 /
                      G:(GEPRIS)368482240},
      typ          = {PUB:(DE-HGF)6},
      url          = {https://juser.fz-juelich.de/record/1052069},
}