% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@MISC{Leroux:1038064,
author = {Leroux, Nathan and Manea, Paul-Philipp and Sudarshan,
Chirag and Finkbeiner, Jan and Siegel, Sebastian and
Strachan, John Paul and Neftci, Emre},
title = {{Analog In-Memory Computing Attention Mechanism for
Fast and Energy-Efficient Large Language Models}},
publisher = {arXiv},
reportid = {FZJ-2025-01113},
year = {2024},
abstract = {Transformer networks, driven by self-attention, are central
to Large Language Models. In generative Transformers,
self-attention uses cache memory to store token projections,
avoiding recomputation at each time step. However,
GPU-stored projections must be loaded into SRAM for each new
generation step, causing latency and energy bottlenecks. We
present a custom self-attention in-memory computing
architecture based on emerging charge-based memories called
gain cells, which can be efficiently written to store new
tokens during sequence generation and enable parallel analog
dot-product computation required for self-attention.
However, the analog gain cell circuits introduce
non-idealities and constraints preventing the direct mapping
of pre-trained models. To circumvent this problem, we design
an initialization algorithm achieving text processing
performance comparable to GPT-2 without training from
scratch. Our architecture respectively reduces attention
latency and energy consumption by up to two and five orders
of magnitude compared to GPUs, marking a significant step
toward ultra-fast, low-power generative Transformers.},
keywords = {Neural and Evolutionary Computing (cs.NE) (Other) /
Artificial Intelligence (cs.AI) (Other) / Hardware
Architecture (cs.AR) (Other) / Emerging Technologies (cs.ET)
(Other) / FOS: Computer and information sciences (Other)},
cin = {PGI-15 / PGI-14},
cid = {I:(DE-Juel1)PGI-15-20210701 / I:(DE-Juel1)PGI-14-20210412},
pnm = {5234 - Emerging NC Architectures (POF4-523) / BMBF 16ME0400
- Joint project: Neuro-inspired artificial intelligence
technologies for the electronics of the future -
NEUROTEC II - (16ME0400) / BMBF 03ZU1106CA - NeuroSys:
Algorithm-Hardware Co-Design (Project C) - A (03ZU1106CA) /
BMBF 03ZU1106CB - NeuroSys: Algorithm-Hardware Co-Design
(Project C) - B (BMBF-03ZU1106CB)},
pid = {G:(DE-HGF)POF4-5234 / G:(BMBF)16ME0400 / G:(BMBF)03ZU1106CA
/ G:(DE-Juel1)BMBF-03ZU1106CB},
typ = {PUB:(DE-HGF)25},
doi = {10.48550/arXiv.2409.19315},
url = {https://juser.fz-juelich.de/record/1038064},
}
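% The abstract above refers to standard KV-cache self-attention, in which the
% key/value projections of each generated token are written to a cache and the
% query is compared against all cached keys by dot products. The following is a
% minimal, illustrative NumPy sketch of one such generation step; the uniform
% quantization used here is only a crude stand-in for the analog gain-cell
% non-idealities mentioned in the abstract. All names and the non-ideality model
% are assumptions for illustration, not the architecture or the initialization
% algorithm of the paper.
%
% import numpy as np
%
% def quantize(x, bits=4):
%     # Placeholder non-ideality: uniform quantization of stored projections.
%     # The paper models analog gain-cell circuits; this is only a stand-in.
%     scale = np.max(np.abs(x)) + 1e-9
%     levels = 2 ** (bits - 1) - 1
%     return np.round(x / scale * levels) / levels * scale
%
% def attention_step(q, k_new, v_new, k_cache, v_cache):
%     # Write the new token's key/value projections into the cache
%     # (in the proposed hardware, a write into gain-cell arrays).
%     k_cache = np.vstack([k_cache, quantize(k_new)])
%     v_cache = np.vstack([v_cache, quantize(v_new)])
%     # Query-key dot products over all cached tokens
%     # (computed in parallel in the analog arrays).
%     scores = k_cache @ q / np.sqrt(q.shape[-1])
%     weights = np.exp(scores - scores.max())
%     weights /= weights.sum()
%     # Attention output: weighted sum over the cached values.
%     return weights @ v_cache, k_cache, v_cache
%
% # Usage: head dimension d, with t tokens already cached.
% rng = np.random.default_rng(0)
% d, t = 64, 10
% k_cache = rng.standard_normal((t, d))
% v_cache = rng.standard_normal((t, d))
% q, k_new, v_new = rng.standard_normal((3, d))
% out, k_cache, v_cache = attention_step(q, k_new, v_new, k_cache, v_cache)
% print(out.shape, k_cache.shape)  # (64,) (11, 64)
%
% This sketch only reproduces the digital reference behaviour; the latency and
% energy savings claimed in the abstract come from performing the cached
% dot products directly inside the analog gain-cell arrays rather than moving
% the cache between GPU memory and SRAM at every step.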