% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Keup:906961,
  author        = {Keup, Christian and Helias, Moritz},
  % Brace whole words (not single letters) to protect casing without
  % breaking kerning/hyphenation; {How} keeps the post-colon capital.
  title         = {{Origami} in {N} dimensions: {How} feed-forward networks
                   manufacture linear separability},
  publisher     = {arXiv},
  reportid      = {FZJ-2022-01779, arXiv:2203.11355},
  year          = {2022},
  abstract      = {Neural networks can implement arbitrary functions. But,
                   mechanistically, what are the tools at their disposal to
                   construct the target? For classification tasks, the network
                   must transform the data classes into a linearly separable
                   representation in the final hidden layer. We show that a
                   feed-forward architecture has one primary tool at hand to
                   achieve this separability: progressive folding of the data
                   manifold in unoccupied higher dimensions. The operation of
                   folding provides a useful intuition in low-dimensions that
                   generalizes to high ones. We argue that an alternative
                   method based on shear, requiring very deep architectures,
                   plays only a small role in real-world networks. The folding
                   operation, however, is powerful as long as layers are wider
                   than the data dimensionality, allowing efficient solutions
                   by providing access to arbitrary regions in the
                   distribution, such as data points of one class forming
                   islands within the other classes. We argue that a link
                   exists between the universal approximation property in ReLU
                   networks and the fold-and-cut theorem (Demaine et al., 1998)
                   dealing with physical paper folding. Based on the
                   mechanistic insight, we predict that the progressive
                   generation of separability is necessarily accompanied by
                   neurons showing mixed selectivity and bimodal tuning curves.
                   This is validated in a network trained on the poker hand
                   task, showing the emergence of bimodal tuning curves during
                   training. We hope that our intuitive picture of the data
                   transformation in deep networks can help to provide
                   interpretability, and discuss possible applications to the
                   theory of convolutional networks, loss landscapes, and
                   generalization. TL;DR: Shows that the internal processing of
                   deep networks can be thought of as literal folding
                   operations on the data distribution in the N-dimensional
                   activation space. A link to a well-known theorem in origami
                   theory is provided.},
  keywords      = {Machine Learning (cs.LG) (Other) / Disordered Systems and
                   Neural Networks (cond-mat.dis-nn) (Other) / Machine Learning
                   (stat.ML) (Other) / FOS: Computer and information sciences
                   (Other) / FOS: Physical sciences (Other)},
  cin           = {INM-6 / IAS-6 / INM-10},
  cid           = {I:(DE-Juel1)INM-6-20090406 / I:(DE-Juel1)IAS-6-20130828 /
                   I:(DE-Juel1)INM-10-20170113},
  pnm           = {5232 - Computational Principles (POF4-523) /
                   RenormalizedFlows - Transparent Deep Learning with
                   Renormalized Flows (BMBF-01IS19077A) / neuroIC002 -
                   Recurrence and stochasticity for neuro-inspired computation
                   (EXS-SF-neuroIC002) / SDS005 - Towards an integrated data
                   science of complex natural systems (PF-JARA-SDS005) / GRK
                   2416 - GRK 2416: MultiSenses-MultiScales: Neue Ansätze zur
                   Aufklärung neuronaler multisensorischer Integration
                   (368482240)},
  pid           = {G:(DE-HGF)POF4-5232 / G:(DE-Juel-1)BMBF-01IS19077A /
                   G:(DE-82)EXS-SF-neuroIC002 / G:(DE-Juel-1)PF-JARA-SDS005 /
                   G:(GEPRIS)368482240},
  typ           = {PUB:(DE-HGF)25},
  eprint        = {2203.11355},
  % Primary arXiv category, taken from the keywords field above; completes
  % the standard eprint/archivePrefix/primaryclass triple for arXiv entries.
  primaryclass  = {cs.LG},
  howpublished  = {arXiv:2203.11355},
  archivePrefix = {arXiv},
  SLACcitation  = {$\%\%CITATION$ = $arXiv:2203.11355;\%\%$},
  doi           = {10.48550/arXiv.2203.11355},
  url           = {https://juser.fz-juelich.de/record/906961},
}