% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article, Computer Physics Communications 251 (2020), article no. 107159.
% "pages" holds the article number; the article has no page range.
% The fields reportid, cin, ddc, cid, pnm, pid, typ and UT are not standard BibTeX
% fields. Standard styles ignore them; they look like repository bookkeeping
% (compare the url field) and are kept for round-tripping.
@article{Castagna:889100,
      author       = {Castagna, Jony and Guo, Xiaohu and Seaton, Michael and
                      O'Cais, Alan},
      title        = {Towards extreme scale dissipative particle dynamics
                      simulations using multiple {GPGPUs}},
      journal      = {Computer Physics Communications},
      volume       = {251},
      issn         = {0010-4655},
      address      = {Amsterdam},
      publisher    = {North Holland Publ. Co.},
      reportid     = {FZJ-2021-00030},
      pages        = {107159},
      year         = {2020},
      abstract     = {A multi-GPGPU development for Mesoscale Simulations using
                      the Dissipative Particle Dynamics method is presented. This
                      distributed GPU acceleration development is an extension of
                      the DL\_MESO package to MPI+CUDA in order to exploit the
                      computational power of the latest NVIDIA cards on hybrid
                      CPU--GPU architectures. Details about the extensively
                      applicable algorithm implementation and memory coalescing
                      data structures are presented. The key algorithms'
                      optimizations for the nearest-neighbour list searching of
                      particle pairs for short range forces, exchange of data and
                      overlapping between computation and communications are also
                      given. We have carried out strong and weak scaling
                      performance analyses with up to 4096 GPUs. A two phase
                      mixture separation test case with 1.8 billion particles has
                      been run on the Piz Daint supercomputer from the Swiss
                      National Supercomputer Center. With CUDA aware MPI, proper
                      GPU affinity, communication and computation overlap
                      optimizations for multi-GPU version, the final optimization
                      results demonstrated more than 94\% efficiency for weak
                      scaling and more than 80\% efficiency for strong scaling.
                      As far as we know, this is the first report in the
                      literature of DPD simulations being run on this large number
                      of GPUs. The remaining challenges and future work are also
                      discussed at the end of the paper.},
      cin          = {JSC},
      ddc          = {530},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / E-CAM - An e-infrastructure for software,
                      training and consultancy in simulation and modelling
                      (676531) / PRACE CoE Allocation E-CAM (prcoe02\_20181001)},
      pid          = {G:(DE-HGF)POF3-511 / G:(EU-Grant)676531 /
                      G:(DE-Juel1)prcoe02\_20181001},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000528002400017},
      doi          = {10.1016/j.cpc.2020.107159},
      url          = {https://juser.fz-juelich.de/record/889100},
}