% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Finkbeiner:1037904,
  author       = {Finkbeiner, Jan Robert and Neftci, Emre},
  title        = {{O}n-{C}hip {L}earning via {T}ransformer {I}n-{C}ontext {L}earning},
  reportid     = {FZJ-2025-01042},
  year         = {2024},
  abstract     = {Autoregressive decoder-only transformers have become key
                  components of scalable sequence processing and generation
                  models. However, the transformer's self-attention mechanism
                  requires transferring prior token projections from main
                  memory at each time step (token), severely limiting
                  performance on conventional processors. Self-attention can
                  be viewed as a dynamic feed-forward layer whose matrix
                  depends on the input sequence, much like the result of
                  local synaptic plasticity. Using this insight, we present a
                  neuromorphic decoder-only transformer model that utilizes
                  an on-chip plasticity processor to compute self-attention.
                  Interestingly, the training of transformers enables them to
                  ``learn'' the input context during inference. We
                  demonstrate this in-context learning ability of
                  transformers on the Loihi 2 processor by solving a few-shot
                  classification problem. With this, we emphasize the
                  importance of pretrained models, especially their ability
                  to find simple, local, backpropagation-free learning rules
                  that enable on-chip learning and adaptation in a
                  hardware-friendly manner.},
  cin          = {PGI-15},
  cid          = {I:(DE-Juel1)PGI-15-20210701},
  pnm          = {5234 - Emerging NC Architectures (POF4-523) / BMBF
                  03ZU1106CA - NeuroSys: Algorithm-Hardware Co-Design
                  (Projekt C) - A (03ZU1106CA) / BMBF 03ZU1106CB - NeuroSys:
                  Algorithm-Hardware Co-Design (Projekt C) - B
                  (BMBF-03ZU1106CB)},
  pid          = {G:(DE-HGF)POF4-5234 / G:(BMBF)03ZU1106CA /
                  G:(DE-Juel1)BMBF-03ZU1106CB},
  typ          = {PUB:(DE-HGF)25},
  doi          = {10.34734/FZJ-2025-01042},
  url          = {https://juser.fz-juelich.de/record/1037904},
}
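
% Usage note: a minimal sketch of how this record could be cited with
% biblatex/biber, following the biber recommendation above. The file names
% "references.bib" and "main" are assumptions; adapt them to your project.
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   On-chip in-context learning on Loihi 2~\cite{Finkbeiner:1037904}.
%   \printbibliography
%   \end{document}
%
% Typical compile sequence: pdflatex main, biber main, then pdflatex main
% again (twice, to resolve citations and cross-references).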