% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Khalfaoui:1053126,
      author       = {Khalfaoui, Ismail and Kesselheim, Stefan},
      title        = {{P}olynomial, trigonometric, and tropical activations},
      publisher    = {arXiv},
      reportid     = {FZJ-2026-01459, arXiv:2502.01247},
      year         = {2025},
      abstract     = {Which functions can be used as activations in deep neural
                      networks? This article explores families of functions based
                      on orthonormal bases, including the Hermite polynomial basis
                      and the Fourier trigonometric basis, as well as a basis
                      resulting from the tropicalization of a polynomial basis.
                      Our study shows that, through simple variance-preserving
                      initialization and without additional clamping mechanisms,
                      these activations can successfully be used to train deep
                      models, such as GPT-2 for next-token prediction on
                      OpenWebText and ConvNeXt for image classification on
                      ImageNet. Our work addresses the issue of exploding and
                      vanishing activations and gradients, particularly prevalent
                      with polynomial activations, and opens the door to
                      improving the efficiency of large-scale learning tasks.
                      Furthermore, our approach provides insight into the
                      structure of neural networks, revealing that networks with
                      polynomial activations can be interpreted as multivariate
                      polynomial mappings. Finally, using Hermite interpolation,
                      we show that our activations can closely approximate
                      classical ones in pre-trained models by matching both the
                      function and its derivative, making them especially useful
                      for fine-tuning tasks. These activations are available in
                      the torchortho library, which can be accessed via:
                      https://github.com/K-H-Ismail/torchortho.},
      keywords     = {Machine Learning (cs.LG) (Other) / Artificial Intelligence
                      (cs.AI) (Other) / Computation and Language (cs.CL) (Other) /
                      Computer Vision and Pattern Recognition (cs.CV) (Other) /
                      Algebraic Geometry (math.AG) (Other) / FOS: Computer and
                      information sciences (Other) / FOS: Mathematics (Other)},
      pnm          = {Helmholtz AI Consultant Team FB Information (E54.303.11) /
                      nxtAIM – NXT GEN AI Methods (19A23014l) / 5112 -
                      Cross-Domain Algorithms, Tools, Methods Labs (ATMLs) and
                      Research Groups (POF4-511)},
      pid          = {G:(DE-Juel-1)E54.303.11 / G:(BMWK)19A23014l /
                      G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)25},
      eprint       = {2502.01247},
      howpublished = {arXiv:2502.01247},
      archivePrefix = {arXiv},
      SLACcitation = {$\%\%CITATION$ = $arXiv:2502.01247;\%\%$},
      url          = {https://juser.fz-juelich.de/record/1053126},
}
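
% The abstract above describes activations built on the orthonormal Hermite
% basis with a variance-preserving initialization. The following minimal
% PyTorch sketch illustrates that idea under stated assumptions; it is NOT
% the torchortho API, and the class and parameter names (HermiteActivation,
% degree) are illustrative. For z ~ N(0,1), the orthonormal probabilists'
% Hermite polynomials he_k = He_k / sqrt(k!) satisfy
% E[he_j(z) he_k(z)] = delta_jk, so f(x) = sum_k a_k he_k(x) has
% Var f(z) = sum_{k>=1} a_k^2; starting at the identity map (a_1 = 1, all
% other a_k = 0) therefore preserves unit variance without any clamping.

import math
import torch
import torch.nn as nn

class HermiteActivation(nn.Module):
    """f(x) = sum_k a_k he_k(x), with he_k the orthonormal probabilists'
    Hermite polynomials (illustrative sketch, degree >= 1 assumed)."""

    def __init__(self, degree: int = 4):
        super().__init__()
        coeffs = torch.zeros(degree + 1)
        coeffs[1] = 1.0  # variance-preserving start: f(x) = x (assumption)
        self.coeffs = nn.Parameter(coeffs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Recurrence: He_0 = 1, He_1 = x, He_{k+1} = x*He_k - k*He_{k-1};
        # each term is normalized by sqrt(k!) to make the basis orthonormal.
        h_prev, h_curr = torch.ones_like(x), x
        out = self.coeffs[0] * h_prev  # he_0 = He_0 = 1
        for k in range(1, self.coeffs.numel()):
            out = out + self.coeffs[k] * h_curr / math.sqrt(math.factorial(k))
            h_prev, h_curr = h_curr, x * h_curr - k * h_prev
        return out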
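
% The abstract also mentions Hermite interpolation that matches a classical
% activation and its derivative, for use in fine-tuning. Continuing the sketch
% above, here is a hedged illustration: a least-squares fit of the he_k
% coefficients to exact GELU and GELU' on a uniform grid stands in for the
% paper's interpolation scheme; the node range [-4, 4], node count, and degree
% are arbitrary choices, and none of this reflects the torchortho API. It uses
% he_k'(x) = sqrt(k) he_{k-1}(x) together with GELU(x) = x*Phi(x) and
% GELU'(x) = Phi(x) + x*phi(x), where Phi and phi are the standard normal
% cdf and pdf.

def hermite_design(x: torch.Tensor, degree: int) -> torch.Tensor:
    """Matrix whose column k holds he_k(x), for k = 0..degree."""
    cols = [torch.ones_like(x), x]
    for k in range(1, degree):
        cols.append(x * cols[-1] - k * cols[-2])  # He_{k+1}
    norms = torch.tensor([math.sqrt(math.factorial(k))
                          for k in range(degree + 1)])
    return torch.stack(cols[:degree + 1], dim=-1) / norms

def fit_coeffs_to_gelu(degree: int = 6, num_nodes: int = 128) -> torch.Tensor:
    """Coefficients a_k minimizing the joint residual on GELU and GELU'."""
    x = torch.linspace(-4.0, 4.0, num_nodes)
    V = hermite_design(x, degree)            # rows of he_k(x_i)
    Vp = torch.zeros_like(V)                 # rows of he_k'(x_i)
    Vp[:, 1:] = V[:, :-1] * torch.sqrt(
        torch.arange(1, degree + 1, dtype=x.dtype))
    phi = torch.exp(-0.5 * x**2) / math.sqrt(2 * math.pi)  # N(0,1) pdf
    Phi = 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))      # N(0,1) cdf
    y = torch.cat([x * Phi, Phi + x * phi])  # stacked [GELU; GELU'] targets
    A = torch.cat([V, Vp])
    return torch.linalg.lstsq(A, y.unsqueeze(-1)).solution.squeeze(-1)

# Usage (hypothetical): act = HermiteActivation(degree=6)
#                       act.coeffs.data = fit_coeffs_to_gelu(degree=6)
# The fitted activation then tracks both the value and the slope of GELU on
# the fitted range, which is what makes it a drop-in surrogate for fine-tuning.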