% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal-article record for the tfQMRgpu linear-solver paper.
% The fields reportid, ddc, pnm, pid and typ are not standard BibTeX fields;
% standard styles silently ignore unknown fields.
% NOTE(review): no volume/number/pages are given and the DOI ends in a "/v1"
% version suffix -- confirm whether it identifies the journal version or a
% preprint before relying on it.
@ARTICLE{Baumeister:1019082,
      author       = {Baumeister, Paul F. and Nassyr, Stepan},
      title        = {{tfQMRgpu}: {A} {GPU}-accelerated linear solver with
                      block-sparse complex result matrix},
      journal      = {The journal of supercomputing},
      issn         = {0920-8542},
      address      = {Dordrecht [u.a.]},
      publisher    = {Springer Science + Business Media B.V.},
      reportid     = {FZJ-2023-05135},
      year         = {2023},
      abstract     = {Linear solvers are a central component of many applications
                      in physics and engineering. In this work we present a
                      software package for simultaneously solving with multiple
                      right-hand sides using the vast compute performance and
                      memory bandwidth of graphical processors. Using the
                      transpose-free quasi minimal residual method iterative
                      linear solving does not require the implementation of an
                      adjoint operator. This C++/CUDA software packet has two ways
                      of being employed. The precompiled version of this library
                      offers linear solving for single and double precision
                      block-sparse complex matrices with interfaces to various
                      programming languages, in particular C, Fortran, Python and
                      Julia. Furthermore, the core algorithm is available for
                      custom implementations of any linear operator as a C++
                      header-only library. We showcase a matrix-free approach of a
                      custom operator for a finite-difference stencil application
                      solving the three-dimensional Helmholtz equation and compare
                      the performance of the matrix-free approach against the
                      block-sparse matrix version, both on NVIDIA hardware.},
      ddc          = {620},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5111},
      typ          = {PUB:(DE-HGF)25},
      doi          = {10.21203/rs.3.rs-3574519/v1},
      url          = {https://juser.fz-juelich.de/record/1019082},
}