% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Filatov:1053128,
author = {Filatov, Oleg and Wang, Jiangtao and Ebert, Jan and
Kesselheim, Stefan},
title = {{O}ptimal {S}caling {N}eeds {O}ptimal {N}orm},
publisher = {arXiv},
reportid = {FZJ-2026-01461, arXiv:2510.03871},
year = {2025},
abstract = {Despite recent progress in optimal hyperparameter transfer
under model and dataset scaling, no unifying explanatory
principle has been established. For Adam and Scion
optimizers, we discover that joint optimal scaling across
model and dataset sizes is conditioned on a single
invariant: the operator norm of the output layer. Across
models with up to 1.3B parameters trained on up to 138B
tokens, the optimal learning rate/batch size pair
                 $(\eta^{\ast}, B^{\ast})$ consistently has the same operator
                 norm value, a phenomenon we term norm transfer. This
constant norm condition is necessary but not sufficient:
                 while for each dataset size, multiple $(\eta, B)$ reach the
                 optimal norm, only a unique $(\eta^{\ast}, B^{\ast})$ achieves
the best loss. As a sufficient condition, we provide the
                 first measurement of $(\eta^{\ast}, B^{\ast})$ scaling with
dataset size for Scion, and find that the scaling rules are
consistent with those of Adam. Tuning per-layer-group
learning rates also improves model performance, with the
output layer being the most sensitive and hidden layers
benefiting from lower learning rates. We provide practical
insights on norm-guided optimal scaling and release our
Distributed Scion (Disco) implementation with logs from over
two thousand runs to support research on LLM training
dynamics at scale.},
keywords = {Machine Learning (cs.LG) (Other) / Artificial Intelligence
(cs.AI) (Other) / Machine Learning (stat.ML) (Other) / FOS:
Computer and information sciences (Other)},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511) / Helmholtz AI Consultant
Team FB Information (E54.303.11) / TrustLLM - Democratize
Trustworthy and Efficient Large Language Model Technology
for Europe (101135671)},
pid = {G:(DE-HGF)POF4-5112 / G:(DE-Juel-1)E54.303.11 /
G:(EU-Grant)101135671},
typ = {PUB:(DE-HGF)25},
eprint = {2510.03871},
howpublished = {arXiv:2510.03871},
archivePrefix = {arXiv},
SLACcitation = {$\%\%CITATION$ = $arXiv:2510.03871;\%\%$},
url = {https://juser.fz-juelich.de/record/1053128},
}
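% Note on the abstract's "norm transfer" invariant, written out as a worked
% equation. This is a minimal sketch, assuming the operator norm is the
% spectral (largest-singular-value) norm of the output-layer weight matrix
% W_out; the paper may use a differently induced operator norm, so treat the
% concrete choice of \sigma_{\max} below as an assumption:
%
%   \|W_{\mathrm{out}}(\eta^{\ast}, B^{\ast})\|_{\mathrm{op}}
%     = \sigma_{\max}\!\bigl(W_{\mathrm{out}}(\eta^{\ast}, B^{\ast})\bigr)
%     \approx c \quad \text{across model and dataset scales.}
%
% That is, every jointly optimal $(\eta^{\ast}, B^{\ast})$ pair drives the
% output layer to the same operator-norm value $c$; per the abstract, this
% constant-norm condition is necessary but not sufficient for the best loss.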