% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@misc{Keup:906961,
      author       = {Keup, Christian and Helias, Moritz},
      title        = {Origami in {N} dimensions: {How} feed-forward networks
                      manufacture linear separability},
      publisher    = {arXiv},
      reportid     = {FZJ-2022-01779, arXiv:2203.11355},
      year         = {2022},
      abstract     = {Neural networks can implement arbitrary functions. But,
                      mechanistically, what are the tools at their disposal to
                      construct the target? For classification tasks, the network
                      must transform the data classes into a linearly separable
                      representation in the final hidden layer. We show that a
                      feed-forward architecture has one primary tool at hand to
                      achieve this separability: progressive folding of the data
                      manifold in unoccupied higher dimensions. The operation of
                      folding provides a useful intuition in low-dimensions that
                      generalizes to high ones. We argue that an alternative
                      method based on shear, requiring very deep architectures,
                      plays only a small role in real-world networks. The folding
                      operation, however, is powerful as long as layers are wider
                      than the data dimensionality, allowing efficient solutions
                      by providing access to arbitrary regions in the
                      distribution, such as data points of one class forming
                      islands within the other classes. We argue that a link
                      exists between the universal approximation property in ReLU
                      networks and the fold-and-cut theorem (Demaine et al., 1998)
                      dealing with physical paper folding. Based on the
                      mechanistic insight, we predict that the progressive
                      generation of separability is necessarily accompanied by
                      neurons showing mixed selectivity and bimodal tuning curves.
                      This is validated in a network trained on the poker hand
                      task, showing the emergence of bimodal tuning curves during
                      training. We hope that our intuitive picture of the data
                      transformation in deep networks can help to provide
                      interpretability, and discuss possible applications to the
                      theory of convolutional networks, loss landscapes, and
                      generalization. TL;DR: Shows that the internal processing of
                      deep networks can be thought of as literal folding
                      operations on the data distribution in the N-dimensional
                      activation space. A link to a well-known theorem in origami
                      theory is provided.},
      keywords     = {Machine Learning (cs.LG) (Other) / Disordered Systems and
                      Neural Networks (cond-mat.dis-nn) (Other) / Machine Learning
                      (stat.ML) (Other) / FOS: Computer and information sciences
                      (Other) / FOS: Physical sciences (Other)},
      cin          = {INM-6 / IAS-6 / INM-10},
      cid          = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
                      I:(DE-Juel1)INM-10-20170113},
      pnm          = {5232 - Computational Principles (POF4-523) /
                      RenormalizedFlows - Transparent Deep Learning with
                      Renormalized Flows (BMBF-01IS19077A) / neuroIC002 -
                      Recurrence and stochasticity for neuro-inspired computation
                      (EXS-SF-neuroIC002) / SDS005 - Towards an integrated data
                      science of complex natural systems (PF-JARA-SDS005) / GRK
                      2416 - GRK 2416: MultiSenses-MultiScales: Neue Ansätze zur
                      Aufklärung neuronaler multisensorischer Integration
                      (368482240)},
      pid          = {G:(DE-HGF)POF4-5232 / G:(DE-Juel-1)BMBF-01IS19077A /
                      G:(DE-82)EXS-SF-neuroIC002 / G:(DE-Juel-1)PF-JARA-SDS005 /
                      G:(GEPRIS)368482240},
      typ          = {PUB:(DE-HGF)25},
      eprint       = {2203.11355},
      primaryClass = {cs.LG},
      howpublished = {arXiv:2203.11355},
      archivePrefix = {arXiv},
      SLACcitation = {$\%\%CITATION$ = $arXiv:2203.11355;\%\%$},
      doi          = {10.48550/arXiv.2203.11355},
      url          = {https://juser.fz-juelich.de/record/906961},
}