% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Hoffmann:1026442,
      author       = {Hoffmann, Lars and Haghighi Mood, Kaveh and Herten, Andreas
                      and Hrywniak, Markus and Kraus, Jiri and Clemens, Jan and
                      Liu, Mingzhao},
      title        = {{A}ccelerating {L}agrangian transport simulations on
                      graphics processing units: performance optimizations of
                      {M}assive-{P}arallel {T}rajectory {C}alculations ({MPTRAC})
                      v2.6},
      journal      = {Geoscientific Model Development},
      volume       = {17},
      number       = {9},
      issn         = {1991-959X},
      address      = {Katlenburg-Lindau},
      publisher    = {Copernicus},
      reportid     = {FZJ-2024-03395},
      pages        = {4077 - 4094},
      year         = {2024},
      abstract     = {Lagrangian particle dispersion models are indispensable
                      tools for the study of atmospheric transport processes.
                      However, Lagrangian transport simulations can become
                      numerically expensive when large numbers of air parcels are
                      involved. To accelerate these simulations, we made
                      considerable efforts to port the Massive-Parallel Trajectory
                      Calculations (MPTRAC) model to graphics processing units
                      (GPUs). Here we discuss performance optimizations of the
                      major bottleneck of the GPU code of MPTRAC, the advection
                      kernel. Timeline, roofline, and memory analyses of the
                      baseline GPU code revealed that the application is
                      memory-bound, and performance suffers from near-random
                      memory access patterns. By changing the data structure of
                      the horizontal wind and vertical velocity fields of the
                      global meteorological data driving the simulations from
                      structure of arrays (SoAs) to array of structures (AoSs) and
                      by introducing a sorting method for better memory alignment
                      of the particle data, performance was greatly improved. We
                      evaluated the performance on NVIDIA A100 GPUs of the Jülich
                      Wizard for European Leadership Science (JUWELS) Booster
                      module at the Jülich Supercomputing Centre, Germany. For
                      our largest test case, transport simulations with $10^8$
                      particles driven by the European Centre for Medium-Range
                      Weather Forecasts (ECMWF) ERA5 reanalysis, we found that the
                      runtime for the full set of physics computations was reduced
                      by $75 \%$, including a reduction of $85 \%$ for the
                      advection kernel. In addition to demonstrating the benefits
                      of code optimization for GPUs, we show that the runtime of
                      central processing unit (CPU)-only simulations is also
                      improved. For our largest test case, we found a runtime
                      reduction of $34 \%$ for the physics computations,
                      including a reduction of $65 \%$ for the advection kernel.
                      The code optimizations discussed here bring the MPTRAC model
                      closer to applications on upcoming exascale high-performance
                      computing systems and will also be of interest for
                      optimizing the performance of other models using particle
                      methods.},
      cin          = {JSC / IEK-7 / CASA},
      ddc          = {550},
      cid          = {I:(DE-Juel1)JSC-20090406 / I:(DE-Juel1)IEK-7-20101013 /
                      I:(DE-Juel1)CASA-20230315},
      pnm          = {5111 - Domain-Specific Simulation $\&$ Data Life Cycle Labs
                      (SDLs) and Research Groups (POF4-511) / 2112 - Climate
                      Feedbacks (POF4-211) / 5122 - Future Computing $\&$ Big Data
                      Systems (POF4-512) / 5112 - Cross-Domain Algorithms, Tools,
                      Methods Labs (ATMLs) and Research Groups (POF4-511) /
                      ATML-X-DEV - ATML Accelerating Devices (ATML-X-DEV)},
      pid          = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-2112 /
                      G:(DE-HGF)POF4-5122 / G:(DE-HGF)POF4-5112 /
                      G:(DE-Juel-1)ATML-X-DEV},
      typ          = {PUB:(DE-HGF)16},
      UT           = {WOS:001226505800001},
      doi          = {10.5194/gmd-17-4077-2024},
      url          = {https://juser.fz-juelich.de/record/1026442},
}
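% Note on the data-layout optimization mentioned in the abstract above: the
% paper reports switching the meteorological wind/velocity fields from a
% structure of arrays (SoA) to an array of structures (AoS) so that the wind
% components needed to advect one particle sit next to each other in memory.
% The C sketch below only illustrates the general idea; the type and field
% names are hypothetical and not taken from the MPTRAC source code.
%
%   /* SoA layout: u, v, w live in three separate arrays, so interpolating
%      the wind at one grid point touches three distant memory regions. */
%   typedef struct {
%     float *u, *v, *w;           /* each of length nx*ny*nz */
%   } met_soa_t;
%
%   /* AoS layout: the components of one grid point are contiguous, so a
%      particle's near-random lookup hits a single cache line per point. */
%   typedef struct {
%     float u, v, w;
%   } wind_t;
%
%   typedef struct {
%     wind_t *wind;               /* length nx*ny*nz */
%   } met_aos_t;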