% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Bencheikh:1037903,
      author       = {Bencheikh, Wadjih and Finkbeiner, Jan and Neftci, Emre},
      title        = {{Optimal Gradient Checkpointing for Sparse and Recurrent
                      Architectures using Off-Chip Memory}},
      reportid     = {FZJ-2025-01041},
      year         = {2024},
      abstract     = {Recurrent neural networks (RNNs) are valued for their
                      computational efficiency and reduced memory requirements on
                      tasks involving long sequence lengths but require high
                      memory-processor bandwidth to train. Checkpointing
                      techniques can reduce the memory requirements by storing
                      only a subset of intermediate states (the checkpoints),
                      but are still rarely used due to the computational overhead
                      of the additional recomputation phase. This work addresses
                      these challenges by introducing memory-efficient gradient
                      checkpointing strategies tailored for the general class of
                      sparse RNNs and Spiking Neural Networks (SNNs). SNNs are
                      energy-efficient alternatives to RNNs thanks to their local,
                      event-driven operation and potential neuromorphic
                      implementation. We use the Intelligence Processing Unit
                      (IPU) as an exemplary platform for architectures with
                      distributed local memory. We exploit its suitability for
                      sparse and irregular workloads to scale SNN training to long
                      sequence lengths. We find that Double Checkpointing emerges
                      as the most effective method, optimizing the use of local
                      memory resources while minimizing recomputation overhead.
                      This approach reduces dependency on slower large-scale
                      memory access, enabling training on sequences over 10 times
                      longer, or networks 4 times larger, than previously
                      feasible, with only marginal time overhead. The presented
                      techniques demonstrate significant potential to enhance
                      scalability and efficiency in training sparse and recurrent
                      networks across diverse hardware platforms, and highlight
                      the benefits of sparse activations for scalable recurrent
                      neural network training.},
      cin          = {PGI-15},
      cid          = {I:(DE-Juel1)PGI-15-20210701},
      pnm          = {5234 - Emerging NC Architectures (POF4-523)},
      pid          = {G:(DE-HGF)POF4-5234},
      typ          = {PUB:(DE-HGF)25},
      doi          = {10.34734/FZJ-2025-01041},
      url          = {https://juser.fz-juelich.de/record/1037903},
}