% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Bencheikh:1037903,
author = {Bencheikh, Wadjih and Finkbeiner, Jan and Neftci, Emre},
title = {{O}ptimal {G}radient {C}heckpointing for {S}parse and
{R}ecurrent {A}rchitectures using {O}ff-{C}hip {M}emory},
reportid = {FZJ-2025-01041},
year = {2024},
abstract = {Recurrent neural networks (RNNs) are valued for their
computational efficiency and reduced memory requirements on
tasks involving long sequence lengths but require high
memory-processor bandwidth to train. Checkpointing
techniques can reduce the memory requirements by only
storing a subset of intermediate states, the checkpoints,
but are still rarely used due to the computational overhead
of the additional recomputation phase. This work addresses
these challenges by introducing memory-efficient gradient
checkpointing strategies tailored for the general class of
sparse RNNs and Spiking Neural Networks (SNNs). SNNs are
energy-efficient alternatives to RNNs thanks to their local,
event-driven operation and potential neuromorphic
implementation. We use the Intelligence Processing Unit
(IPU) as an exemplary platform for architectures with
distributed local memory. We exploit its suitability for
sparse and irregular workloads to scale SNN training to long
sequence lengths. We find that Double Checkpointing emerges
as the most effective method, optimizing the use of local
memory resources while minimizing recomputation overhead.
This approach reduces dependency on slower large-scale
memory access, enabling training on sequences over 10 times
longer, or networks 4 times larger, than previously feasible,
with only marginal time overhead. The presented techniques
demonstrate significant potential to enhance scalability and
efficiency in training sparse and recurrent networks across
diverse hardware platforms, and highlight the benefits of
sparse activations for scalable recurrent neural network
training.},
cin = {PGI-15},
cid = {I:(DE-Juel1)PGI-15-20210701},
pnm = {5234 - Emerging NC Architectures (POF4-523)},
pid = {G:(DE-HGF)POF4-5234},
typ = {PUB:(DE-HGF)25},
doi = {10.34734/FZJ-2025-01041},
url = {https://juser.fz-juelich.de/record/1037903},
}