% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article on the tfQMRgpu linear solver (volume 81, number 5, 2025).
% The fields reportid, cin, ddc, cid, pnm, pid and typ are not standard BibTeX
% fields; they look like repository-export metadata (TODO confirm) and are
% ignored by the standard bibliography styles.
% NOTE(review): the abstract phrase "demonstrating up to a speedup" seems to be
% missing its numeric factor; confirm against the publisher's abstract.
@article{Baumeister:1044949,
  author    = {Baumeister, Paul F. and Nassyr, Stepan},
  title     = {{tfQMRgpu}: a {GPU}-accelerated linear solver with
               block-sparse complex result matrix},
  journal   = {The Journal of Supercomputing},
  volume    = {81},
  number    = {5},
  issn      = {0920-8542},
  address   = {Dordrecht [u.a.]},
  publisher = {Springer Science + Business Media B.V.},
  reportid  = {FZJ-2025-03449},
  pages     = {663},
  year      = {2025},
  abstract  = {We present tfQMRgpu, a GPU-accelerated iterative linear
               solver based on the transpose-free quasi-minimal residual
               (tfQMR) method. Designed for large-scale electronic
               structure calculations, particularly in the context of
               Korringa–Kohn–Rostoker density functional theory,
               tfQMRgpu efficiently handles block-sparse complex matrices
               arising from multiple scattering theory. The solver exploits
               GPU parallelism to accelerate convergence while leveraging
               memory-efficient sparse storage formats. By unifying the
               solution of multiple right-hand side (RHS) block vectors,
               tfQMRgpu significantly improves throughput, demonstrating up
               to a speedup on modern GPUs. Additionally, we introduce a
               flexible implementation framework that supports both
               explicit matrix-based and matrix-free operator formulations,
               such as high-order finite-difference stencils for real-space
               grid-based Green function calculations. Benchmarks on
               various NVIDIA GPUs demonstrate the solver’s efficiency,
               in some cases achieving over 56\% of peak floating-point
               performance for block-sparse matrix multiplications.
               tfQMRgpu is open-source, providing interfaces for C, C++,
               Fortran, Julia, and Python, making it a versatile tool for
               high-performance computing applications that can benefit
               from the unification of RHS problems.},
  cin       = {JSC},
  ddc       = {620},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
               (SDLs) and Research Groups (POF4-511) / 5112 - Cross-Domain
               Algorithms, Tools, Methods Labs (ATMLs) and Research Groups
               (POF4-511) / 5122 - Future Computing \& Big Data Systems
               (POF4-512) / BMBF 01 1H1 6013, NRW 325 – 8.03 – 133340 -
               SiVeGCS (DB001492) / ATML-X-DEV - ATML Accelerating Devices
               (ATML-X-DEV)},
  pid       = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-5112 /
               G:(DE-HGF)POF4-5122 / G:(DE-Juel-1)DB001492 /
               G:(DE-Juel-1)ATML-X-DEV},
  typ       = {PUB:(DE-HGF)16},
  doi       = {10.1007/s11227-025-07145-6},
  url       = {https://juser.fz-juelich.de/record/1044949},
}