% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Baumeister:1044949,
      author       = {Baumeister, Paul F. and Nassyr, Stepan},
      title        = {tf{QMR}gpu: a {GPU}-accelerated linear solver with
                      block-sparse complex result matrix},
      journal      = {The Journal of Supercomputing},
      volume       = {81},
      number       = {5},
      issn         = {0920-8542},
      address      = {Dordrecht [et al.]},
      publisher    = {Springer Science + Business Media B.V.},
      reportid     = {FZJ-2025-03449},
      pages        = {663},
      year         = {2025},
      abstract     = {We present tfQMRgpu, a GPU-accelerated iterative linear
                      solver based on the transpose-free quasi-minimal residual
                      (tfQMR) method. Designed for large-scale electronic
                      structure calculations, particularly in the context of
                      Korringa–Kohn–Rostoker density functional theory,
                      tfQMRgpu efficiently handles block-sparse complex matrices
                      arising from multiple scattering theory. The solver exploits
                      GPU parallelism to accelerate convergence while leveraging
                      memory-efficient sparse storage formats. By unifying the
                      solution of multiple right-hand side (RHS) block vectors,
                      tfQMRgpu significantly improves throughput, demonstrating
                      substantial speedups on modern GPUs. Additionally, we
                      introduce a
                      flexible implementation framework that supports both
                      explicit matrix-based and matrix-free operator formulations,
                      such as high-order finite-difference stencils for real-space
                      grid-based Green function calculations. Benchmarks on
                      various NVIDIA GPUs demonstrate the solver’s efficiency,
                      in some cases achieving over $56\%$ of peak floating-point
                      performance for block-sparse matrix multiplications.
                      tfQMRgpu is open-source, providing interfaces for C, C++,
                      Fortran, Julia, and Python, making it a versatile tool for
                      high-performance computing applications that can benefit
                      from the unification of RHS problems.},
      cin          = {JSC},
      ddc          = {620},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / 5112 - Cross-Domain
                      Algorithms, Tools, Methods Labs (ATMLs) and Research Groups
                      (POF4-511) / 5122 - Future Computing \& Big Data Systems
                      (POF4-512) / BMBF 01 1H1 6013, NRW 325 – 8.03 – 133340 -
                      SiVeGCS (DB001492) / ATML-X-DEV - ATML Accelerating Devices
                      (ATML-X-DEV)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-5112 /
                      G:(DE-HGF)POF4-5122 / G:(DE-Juel-1)DB001492 /
                      G:(DE-Juel-1)ATML-X-DEV},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.1007/s11227-025-07145-6},
      url          = {https://juser.fz-juelich.de/record/1044949},
}