% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@INPROCEEDINGS{Aach:1022355,
      author       = {Aach, Marcel and Inanc, Eray and Sarma, Rakesh and Riedel,
                      Morris and Lintermann, Andreas},
      title        = {{O}ptimal {R}esource {A}llocation for {E}arly
                      {S}topping-based {N}eural {A}rchitecture {S}earch {M}ethods},
      volume       = {228},
      publisher    = {PMLR},
      reportid     = {FZJ-2024-01461},
      series       = {Proceedings of Machine Learning Research},
      pages        = {12/1--17},
      year         = {2023},
      abstract     = {The field of Neural Architecture Search (NAS) has
                      benefited significantly from the increased availability
                      of parallel compute resources, as optimization
                      algorithms typically require sampling and evaluating
                      hundreds of model configurations. Consequently, the
                      most commonly used early stopping-based NAS methods are
                      designed to run multiple trials in parallel in order to
                      exploit these resources. At the same time, the training
                      time of a single model configuration can be reduced,
                      e.g., by employing data-parallel training on multiple
                      GPUs. This paper investigates the optimal allocation of
                      a fixed number of parallel workers for conducting NAS.
                      In practice, users have to decide whether the
                      computational resources are primarily used to assign
                      more workers to the training of individual trials or to
                      increase the number of trials executed in parallel. The
                      first option accelerates individual trials
                      (exploitation) but reduces the parallelism of the NAS
                      loop, whereas the second option yields longer trial
                      runtimes but a larger number of simultaneously
                      processed trials in the NAS loop (exploration). Our
                      study encompasses both large- and small-scale
                      scenarios, including tuning models in parallel on a
                      single GPU, data-parallel training on up to 16 GPUs,
                      and measuring the scalability of NAS on up to 64 GPUs.
                      Our empirical results with the HyperBand, Asynchronous
                      Successive Halving, and Bayesian Optimization HyperBand
                      methods offer valuable insights for users seeking to
                      run NAS on both small and large computational budgets.
                      By selecting the appropriate number of parallel
                      evaluations, the NAS process can be accelerated by
                      factors of $\approx$2--5 while preserving the test set
                      accuracy, compared to non-optimal resource
                      allocations.},
      month        = {Nov},
      date         = {2023-11-12},
      organization = {Second International Conference on
                      Automated Machine Learning, Potsdam
                      (Germany), 12 Nov 2023 - 15 Nov 2023},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / RAISE - Research on
                      AI- and Simulation-Based Engineering at Exascale (951733)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(EU-Grant)951733},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      doi          = {10.34734/FZJ-2024-01461},
      url          = {https://juser.fz-juelich.de/record/1022355},
}
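
% Usage sketch (an illustration, not part of the exported record): given the
% UTF-8 note above, this entry is best consumed with biblatex and the biber
% backend. A minimal LaTeX document citing it could look as follows; the file
% name "references.bib" is an assumption and should match this file's name.
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}
%   \begin{document}
%   Optimal worker allocation for early stopping-based NAS is studied
%   in \cite{Aach:1022355}.
%   \printbibliography
%   \end{document}
%
% Compile with: pdflatex, then biber, then pdflatex twice.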