% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Invited talk by Carolin Penke at the RWTH Aachen SFB 1481 Colloquium
% (Aachen, Germany, 11 Dec 2024).  The repository record is given in `url`.
%
% Layout: standard bibliographic fields come first; the trailing group
% (reportid ... typ) is repository-specific metadata, which bibliography
% styles silently ignore because the field names are unknown to them.
%
% Notes for maintainers:
%   * `booktitle` carries the event name and venue (a required field for this
%     entry type); `eventdate` records the day of the talk for biblatex.
%   * `month` uses the predefined macro (dec), not a braced literal, so the
%     style can localise or abbreviate it.
%   * `year`/`month` serve classic BibTeX, `date` serves biblatex/Biber.
%   * The title is stored in plain Title Case with no per-letter braces; it
%     contains no acronyms or proper nouns that need case protection.
@inproceedings{Penke:1034062,
      author       = {Penke, Carolin},
      title        = {Efficient Computation of Low-Rank Representations
                      to Reduce Memory Requirements in Deep Learning},
      booktitle    = {{RWTH} Aachen {SFB} 1481 Colloquium, Aachen, Germany},
      year         = {2024},
      month        = dec,
      date         = {2024-12-11},
      eventdate    = {2024-12-11},
      doi          = {10.34734/FZJ-2024-06883},
      url          = {https://juser.fz-juelich.de/record/1034062},
      abstract     = {Computing an orthogonal basis that approximates the range
                      or corange of a matrix is a ubiquitous problem in
                      computational science and engineering. In numerous
                      applications, a rapid decay of singular values permits the
                      use of such bases to approximate a linear operator by
                      restricting it to low-rank subspaces, thereby significantly
                      reducing computational and storage demands. A powerful
                      approach for constructing a basis with a specified rank or
                      approximation tolerance is the (adaptive) randomized range
                      finder. In this talk, we introduce a novel variant of this
                      algorithm, based on the blocked Householder QR
                      decomposition, optimized for modern GPU accelerators. This
                      development is motivated by its potential to substantially
                      lower memory requirements during the training of deep neural
                      networks such as transformers. We discuss the GaLore
                      (Gradient Low-Rank Projection) training framework, and
                      demonstrate how the randomized range finder can be employed
                      to derive low-rank representations of optimizer states.
                      Further potential avenues for future research are
                      discussed.},
      reportid     = {FZJ-2024-06883},
      subtyp       = {Invited},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511) / OpenGPT-X - Aufbau eines
                      Gaia-X Knotens für große KI-Sprachmodelle und innovative
                      Sprachapplikations-Services; Teilvorhaben: Optimierung und
                      Skalierung auf großen HPC-Systemen (68GX21007F)},
      pid          = {G:(DE-HGF)POF4-5112 / G:(DE-Juel-1)68GX21007F},
      typ          = {PUB:(DE-HGF)31},
}