% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@INPROCEEDINGS{Finkbeiner:1037902,
      author       = {Finkbeiner, Jan and Gmeinder, Thomas and Pupilli, Mark and
                      Titterton, Alexander and Neftci, Emre},
      title        = {{H}arnessing {M}anycore {P}rocessors with {D}istributed
                      {M}emory for {A}ccelerated {T}raining of {S}parse and
                      {R}ecurrent {M}odels},
      reportid     = {FZJ-2025-01040},
      pages        = {11996-12005},
      year         = {2024},
      abstract     = {Current AI training infrastructure is dominated by single
                      instruction multiple data (SIMD) and systolic array
                      architectures, such as Graphics Processing Units (GPUs) and
                      Tensor Processing Units (TPUs), that excel at accelerating
                      parallel workloads and dense vector matrix multiplications.
                      Potentially more efficient neural network models utilizing
                      sparsity and recurrence cannot leverage the full power of
                      SIMD processors and are thus at a severe disadvantage
                      compared to today’s prominent parallel architectures like
                      Transformers and CNNs, thereby hindering the path towards
                      more sustainable AI. To overcome this limitation, we explore
                      sparse and recurrent model training on a massively parallel
                      multiple instruction multiple data (MIMD) architecture with
                      distributed local memory. We implement a training routine
                      based on backpropagation through time (BPTT) for the
                      brain-inspired class of Spiking Neural Networks (SNNs) that
                      feature binary sparse activations. We observe a massive
                      advantage in using sparse activation tensors with a MIMD
                      processor, the Intelligence Processing Unit (IPU), compared
                      to GPUs. On training workloads, our results demonstrate
                      5-10× throughput gains compared to A100 GPUs and up to 38×
                      gains for higher levels of activation sparsity, without a
                      significant slowdown in training convergence or reduction in
                      final model performance. Furthermore, our results show
                      highly promising trends for both single- and multi-IPU
                      configurations as we scale up to larger model sizes. Our
                      work paves the way towards more efficient, non-standard
                      models via AI training hardware beyond GPUs, and
                      competitive large-scale SNN models.},
      month        = {Feb},
      date         = {2024-02-27},
      organization = {AAAI Conference on Artificial
                      Intelligence, Vancouver (Canada), 27
                      Feb 2024 - 4 Mar 2024},
      cin          = {PGI-15},
      cid          = {I:(DE-Juel1)PGI-15-20210701},
      pnm          = {5234 - Emerging NC Architectures (POF4-523) / BMBF
                      03ZU1106CA - NeuroSys: Algorithm-Hardware Co-Design (Projekt
                      C) - A (03ZU1106CA) / BMBF 03ZU1106CB - NeuroSys:
                      Algorithm-Hardware Co-Design (Projekt C) - B
                      (BMBF-03ZU1106CB)},
      pid          = {G:(DE-HGF)POF4-5234 / G:(BMBF)03ZU1106CA /
                      G:(DE-Juel1)BMBF-03ZU1106CB},
      typ          = {PUB:(DE-HGF)8},
      url          = {https://juser.fz-juelich.de/record/1037902},
}