% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Finkbeiner:1037904,
author = {Finkbeiner, Jan Robert and Neftci, Emre},
title = {{O}n-{C}hip {L}earning via {T}ransformer {I}n-{C}ontext
{L}earning},
reportid = {FZJ-2025-01042},
year = {2024},
  abstract = {Autoregressive decoder-only transformers have become key
              components of scalable sequence processing and generation
              models. However, the transformer's self-attention mechanism
              requires transferring prior token projections from main
              memory at each time step (token), thus severely limiting
              performance on conventional processors. Self-attention can
              be viewed as a dynamic feed-forward layer whose weight
              matrix depends on the input sequence, similar to the result
              of local synaptic plasticity. Using this insight, we
              present a neuromorphic decoder-only transformer model that
              utilizes an on-chip plasticity processor to compute
              self-attention. Interestingly, the training of transformers
              enables them to ``learn'' the input context during
              inference. We demonstrate this in-context learning ability
              of transformers on the Loihi 2 processor by solving a
              few-shot classification problem. With this, we emphasize
              the importance of pretrained models, especially their
              ability to find simple, local, backpropagation-free
              learning rules that enable on-chip learning and adaptation
              in a hardware-friendly manner.},
cin = {PGI-15},
cid = {I:(DE-Juel1)PGI-15-20210701},
pnm = {5234 - Emerging NC Architectures (POF4-523) / BMBF
03ZU1106CA - NeuroSys: Algorithm-Hardware Co-Design (Projekt
C) - A (03ZU1106CA) / BMBF 03ZU1106CB - NeuroSys:
Algorithm-Hardware Co-Design (Projekt C) - B
(BMBF-03ZU1106CB)},
pid = {G:(DE-HGF)POF4-5234 / G:(BMBF)03ZU1106CA /
G:(DE-Juel1)BMBF-03ZU1106CB},
typ = {PUB:(DE-HGF)25},
doi = {10.34734/FZJ-2025-01042},
url = {https://juser.fz-juelich.de/record/1037904},
}
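% Note: the abstract's view of self-attention as a dynamic feed-forward layer
% can be made concrete in the unnormalized linear-attention special case. This
% is a minimal sketch of that general idea, not the formulation used in the
% cited work; q_t, k_t, v_t denote the query/key/value projections of token t:
%   W_t = W_{t-1} + v_t k_t^{\top}, \qquad y_t = W_t\, q_t .
% The effective weight matrix W_t is assembled from prior token projections by
% a local, outer-product (Hebbian-style) update, which is the kind of
% sequence-dependent "plasticity" the abstract refers to.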