% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Finkbeiner:1037902,
author = {Finkbeiner, Jan and Gmeinder, Thomas and Pupilli, Mark and
Titterton, Alexander and Neftci, Emre},
title = {{H}arnessing {M}anycore {P}rocessors with {D}istributed
{M}emory for {A}ccelerated {T}raining of {S}parse and
{R}ecurrent {M}odels},
reportid = {FZJ-2025-01040},
pages = {11996--12005},
year = {2024},
abstract = {Current AI training infrastructure is dominated by single
instruction multiple data (SIMD) and systolic array
architectures, such as Graphics Processing Units (GPUs) and
Tensor Processing Units (TPUs), that excel at accelerating
parallel workloads and dense vector-matrix multiplications.
Potentially more efficient neural network models utilizing
sparsity and recurrence cannot leverage the full power of
SIMD processors and are thus at a severe disadvantage
compared to today’s prominent parallel architectures like
Transformers and CNNs, thereby hindering the path towards
more sustainable AI. To overcome this limitation, we explore
sparse and recurrent model training on a massively parallel
multiple instruction multiple data (MIMD) architecture with
distributed local memory. We implement a training routine
based on backpropagation through time (BPTT) for the
brain-inspired class of Spiking Neural Networks (SNNs) that
feature binary sparse activations. We observe a massive
advantage in using sparse activation tensors with a MIMD
processor, the Intelligence Processing Unit (IPU), compared
to GPUs. On training workloads, our results demonstrate
5-10× throughput gains compared to A100 GPUs and up to 38×
gains for higher levels of activation sparsity, without a
significant slowdown in training convergence or reduction in
final model performance. Furthermore, our results show
highly promising trends for both single- and multi-IPU
configurations as we scale up to larger model sizes. Our
work paves the way towards more efficient, non-standard
models via AI training hardware beyond GPUs, and competitive
large-scale SNN models.},
month = {Feb},
date = {2024-02-27},
organization = {AAAI Conference on Artificial
Intelligence, Vancouver (Canada), 27
Feb 2024 - 4 Mar 2024},
cin = {PGI-15},
cid = {I:(DE-Juel1)PGI-15-20210701},
pnm = {5234 - Emerging NC Architectures (POF4-523) / BMBF
03ZU1106CA - NeuroSys: Algorithm-Hardware Co-Design (Projekt
C) - A (03ZU1106CA) / BMBF 03ZU1106CB - NeuroSys:
Algorithm-Hardware Co-Design (Projekt C) - B
(BMBF-03ZU1106CB)},
pid = {G:(DE-HGF)POF4-5234 / G:(BMBF)03ZU1106CA /
G:(DE-Juel1)BMBF-03ZU1106CB},
typ = {PUB:(DE-HGF)8},
url = {https://juser.fz-juelich.de/record/1037902},
}