% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Conference contribution on the distributed multi-GPU ChASE eigensolver
% library (report id FZJ-2023-05322).
% Notes for maintainers:
%   - booktitle holds the workshop name; the venue and eventdate fields hold
%     the location and the date range of the event. Those two are biblatex
%     fields and are skipped by classic BibTeX styles.
%   - year and month serve classic BibTeX styles; date gives the full ISO date
%     for biblatex.
%   - In the title only whole case-sensitive words (GPU, ChASE, NCCL) are
%     brace-protected, so each style can apply its own casing to the rest.
%   - reportid, subtyp, cin, cid, pnm, pid and typ are non-standard fields
%     (presumably repository export metadata, to be confirmed); standard
%     styles do not print them.
@inproceedings{DiNapoli:1019352,
      author       = {Di Napoli, Edoardo and Wu, Xinzhe},
      title        = {Advancing the Distributed Multi-{GPU} {ChASE} Library
                      through Algorithm Optimization and {NCCL} Library},
      booktitle    = {14th Workshop on Latest Advances in Scalable Algorithms
                      for Large-Scale Heterogeneous Systems},
      venue        = {Denver (USA)},
      eventdate    = {2023-11-12/2023-11-17},
      year         = {2023},
      month        = nov,
      date         = {2023-11-12},
      reportid     = {FZJ-2023-05322},
      abstract     = {As supercomputers become larger with powerful Graphics
                      Processing Unit (GPU), traditional direct eigensolvers
                      struggle to keep up with the hardware evolution and scale
                      efficiently due to communication and synchronization
                      demands. Subspace eigensolvers, like the Chebyshev
                      Accelerated Subspace Eigensolver (ChASE), have a simpler
                      structure and can overcome communication and synchronization
                      bottlenecks. ChASE is a modern subspace eigensolver that
                      uses Chebyshev polynomials to accelerate the computation of
                      extremal eigenpairs of dense Hermitian eigenproblems. In
                      this work we show how we have modified ChASE by rethinking
                      its memory layout, introducing a novel parallelization
                      scheme, switching to a more performing
                      communication-avoiding algorithm for one of its inner
                      module, and substituting MPI library by vendor-optimized
                      NCCL library. The resulting library can tackle dense
                      problems with size up to $N=O(10^6)$, and scales
                      effortlessly up to the full 900 nodes---each one powered by
                      4xA100 NVIDIA GPUs---of the JUWELS Booster hosted at the
                      Jülich Supercomputing Centre.},
      subtyp       = {After Call},
      cin          = {JSC / CASA},
      cid          = {I:(DE-Juel1)JSC-20090406 / I:(DE-Juel1)CASA-20230315},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / Simulation and Data
                      Laboratory Quantum Materials (SDLQM) (SDLQM)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(DE-Juel1)SDLQM},
      typ          = {PUB:(DE-HGF)6},
      url          = {https://juser.fz-juelich.de/record/1019352},
}