% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Aach:1045002,
author = {Aach, Marcel and Sarma, Rakesh and Neukirchen, Helmut and
Riedel, Morris and Lintermann, Andreas},
title = {{R}esource-adaptive successive doubling for hyperparameter
optimization with large datasets on high-performance
computing systems},
journal = {Future Generation Computer Systems},
volume = {175},
issn = {0167-739X},
address = {Amsterdam [et al.]},
publisher = {Elsevier Science},
reportid = {FZJ-2025-03484},
pages = {108042},
year = {2026},
abstract = {The accuracy of Machine Learning (ML) models is highly
dependent on the hyperparameters that have to be chosen by
the user before training. However, finding the optimal
set of hyperparameters is a complex process, as many
different parameter combinations need to be evaluated, and
obtaining the accuracy of each combination usually requires
a full training run. It is therefore of great interest to
reduce the computational runtime of this process. On
High-Performance Computing (HPC) systems, several
configurations can be evaluated in parallel to speed up this
Hyperparameter Optimization (HPO). State-of-the-art HPO
methods follow a bandit-based approach built on successive
halving: the final performance of a combination is estimated
from a low-fidelity performance metric obtained before full
training, and more promising combinations are assigned more
resources over time. Frequently, the
number of epochs is treated as a resource, letting more
promising combinations train longer. Another option is to
use the number of workers as a resource and directly
allocate more workers to more promising configurations via
data-parallel training. This article proposes a novel
Resource-Adaptive Successive Doubling Algorithm (RASDA),
which combines a resource-adaptive successive doubling
scheme with the plain Asynchronous Successive Halving
Algorithm (ASHA). Scalability of this approach is shown on
up to 1,024 Graphics Processing Units (GPUs) on modern HPC
systems. It is applied to different types of Neural Networks
(NNs) trained on large datasets from the Computer Vision
(CV), Computational Fluid Dynamics (CFD), and Additive
Manufacturing (AM) domains, where performing more than one
full training run is usually infeasible. Empirical results
show that RASDA outperforms ASHA by a factor of up to 1.9
in runtime. At the same time, the solution
quality of final ASHA models is maintained or even surpassed
by the implicit batch size scheduling of RASDA. With RASDA,
systematic HPO is applied to a terabyte-scale scientific
dataset for the first time in the literature, enabling
efficient optimization of complex models on massive
scientific data.},
cin = {JSC},
ddc = {004},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
(SDLs) and Research Groups (POF4-511) / RAISE - Research on
AI- and Simulation-Based Engineering at Exascale (951733)},
pid = {G:(DE-HGF)POF4-5111 / G:(EU-Grant)951733},
typ = {PUB:(DE-HGF)16},
doi = {10.1016/j.future.2025.108042},
url = {https://juser.fz-juelich.de/record/1045002},
}
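% The abstract above describes bandit-based HPO: evaluate many configurations
% at low fidelity, repeatedly keep the most promising fraction, and double the
% resource (epochs, or data-parallel workers in RASDA) granted to the
% survivors. Below is a minimal, illustrative Python sketch of such a
% successive halving/doubling loop, kept as a comment so this file remains
% valid BibTeX. The `evaluate` function, `eta`, and the config fields are
% hypothetical stand-ins; this is not the authors' ASHA/RASDA implementation.
%
%   import random
%
%   def evaluate(config, resource):
%       # Hypothetical stand-in for a partial training run: more resource
%       # (epochs or workers) yields a less noisy estimate of the
%       # configuration's true quality.
%       return config["quality"] + random.gauss(0, 1.0 / resource)
%
%   def successive_halving(configs, min_resource=1, eta=2):
%       # Evaluate all configs at the current fidelity, keep the top 1/eta,
%       # double the per-survivor resource, and repeat until one remains.
%       resource = min_resource
%       while len(configs) > 1:
%           ranked = sorted(configs,
%                           key=lambda c: evaluate(c, resource),
%                           reverse=True)
%           configs = ranked[:max(1, len(configs) // eta)]
%           resource *= eta  # survivors get twice the budget
%       return configs[0]
%
%   if __name__ == "__main__":
%       random.seed(0)
%       candidates = [{"id": i, "quality": random.random()}
%                     for i in range(16)]
%       print("selected config:", successive_halving(candidates)["id"])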