% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Hoffmann:907140,
      author       = {Hoffmann, Lars and Baumeister, Paul F. and Cai, Zhongyin
                      and Clemens, Jan and Griessbach, Sabine and Günther,
                      Gebhard and Heng, Yi and Liu, Mingzhao and Haghighi Mood,
                      Kaveh and Stein, Olaf and Thomas, Nicole and Vogel, Bärbel
                      and Wu, Xue and Zou, Ling},
      title        = {{Massive-Parallel} {Trajectory} {Calculations} version
                      2.2 ({MPTRAC}-2.2): {Lagrangian} transport simulations on
                      graphics processing units ({GPUs})},
      journal      = {Geoscientific Model Development},
      volume       = {15},
      number       = {7},
      issn         = {1991-959X},
      address      = {Katlenburg-Lindau},
      publisher    = {Copernicus},
      reportid     = {FZJ-2022-01863},
      pages        = {2731--2762},
      year         = {2022},
      abstract     = {Lagrangian models are fundamental tools to study
                      atmospheric transport processes and for practical
                      applications such as dispersion modeling for anthropogenic
                      and natural emission sources. However, conducting
                      large-scale Lagrangian transport simulations with millions
                      of air parcels or more can become rather numerically costly.
                      In this study, we assessed the potential of exploiting
                      graphics processing units (GPUs) to accelerate Lagrangian
                      transport simulations. We ported the Massive-Parallel
                      Trajectory Calculations (MPTRAC) model to GPUs using the
                      open accelerator (OpenACC) programming model. The trajectory
                      calculations conducted within the MPTRAC model were fully
                      ported to GPUs, i.e., except for feeding in the
                      meteorological input data and for extracting the particle
                      output data, the code operates entirely on the GPU devices
                      without frequent data transfers between CPU and GPU memory.
                      Model verification, performance analyses, and scaling tests
                      of the Message Passing Interface (MPI) – Open
                      Multi-Processing (OpenMP) – OpenACC hybrid parallelization
                      of MPTRAC were conducted on the Jülich Wizard for European
                      Leadership Science (JUWELS) Booster supercomputer operated
                      by the Jülich Supercomputing Centre, Germany. The JUWELS
                      Booster comprises 3744 NVIDIA A100 Tensor Core GPUs,
                      providing a peak performance of 71.0 PFlop s$^{-1}$. As of
                      June 2021, it is the most powerful supercomputer in Europe
                      and listed among the most energy-efficient systems
                      internationally. For large-scale simulations comprising $10^8$
                      particles driven by the European Centre for Medium-Range
                      Weather Forecasts' fifth-generation reanalysis (ERA5), the
                      performance evaluation showed a maximum speed-up of a factor
                      of 16 due to the utilization of GPUs compared to CPU-only
                      runs on the JUWELS Booster. In the large-scale GPU run,
                      about $67 \%$ of the runtime is spent on the physics
                      calculations, conducted on the GPUs. Another $15 \%$ of
                      the runtime is required for file I/O, mostly to read the
                      large ERA5 data set from disk. Meteorological data
                      preprocessing on the CPUs also requires about $15 \%$ of
                      the runtime. Although this study identified potential for
                      further improvements of the GPU code, we consider the MPTRAC
                      model ready for production runs on the JUWELS Booster in its
                      present form. The GPU code provides a much faster time to
                      solution than the CPU code, which is particularly relevant
                      for near-real-time applications of a Lagrangian transport
                      model.},
      cin          = {JSC / IEK-7},
      ddc          = {550},
      cid          = {I:(DE-Juel1)JSC-20090406 / I:(DE-Juel1)IEK-7-20101013},
      pnm          = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / 2112 - Climate
                      Feedbacks (POF4-211)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-2112},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:000780834800001},
      doi          = {10.5194/gmd-15-2731-2022},
      url          = {https://juser.fz-juelich.de/record/907140},
}