% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Workshop paper exported from the JuSER repository (record 885773).
% Page ranges are written with a double hyphen and the month uses the
% unquoted three-letter macro so that bibliography styles can format and
% localise them.  Fields such as reportid, cin, cid, pnm, pid, typ and UT are
% repository metadata that standard styles ignore.
@inproceedings{Wylie:885773,
      author       = {Wylie, Brian J. N.},
      title        = {Exascale potholes for {HPC}: {Execution} performance and
                      variability analysis of the flagship application code
                      {HemeLB}},
      booktitle    = {Proceedings of 2020 IEEE/ACM International Workshop on HPC
                      User Support Tools (HUST) and the Workshop on Programming
                      and Performance Visualization Tools (ProTools)},
      publisher    = {IEEE},
      organization = {Workshop on Programming and
                      Performance Visualization Tools, online
                      (online), 12 Nov 2020 - 12 Nov 2020},
      pages        = {59--70},
      year         = {2020},
      month        = nov,
      date         = {2020-11-12},
      isbn         = {978-0-7381-1070-7},
      doi          = {10.1109/HUSTProtools51951.2020.00014},
      url          = {https://juser.fz-juelich.de/record/885773},
      reportid     = {FZJ-2020-04080},
      comment      = {Proceedings of 2020 IEEE/ACM International Workshop on HPC
                      User Support Tools (HUST) and the Workshop on Programming
                      and Performance Visualization Tools (ProTools)},
      abstract     = {Performance measurement and analysis of parallel
                      applications is often challenging, despite many excellent
                      commercial and open-source tools being available. Currently
                      envisaged exascale computer systems exacerbate matters by
                      requiring extremely high scalability to effectively exploit
                      millions of processor cores. Unfortunately, significant
                      application execution performance variability arising from
                      increasingly complex interactions between hardware and
                      system software makes this situation much more difficult for
                      application developers and performance analysts alike. This
                      work considers the performance assessment of the HemeLB
                      exascale flagship application code from the EU HPC Centre of
                      Excellence (CoE) for Computational Biomedicine (CompBioMed)
                      running on the SuperMUC-NG Tier-0 leadership system, using
                      the methodology of the Performance Optimisation and
                      Productivity (POP) CoE. Although $80\%$ scaling efficiency
                      is maintained to over 100,000 MPI processes, disappointing
                      initial performance with more processes and corresponding
                      poor strong scaling was identified to originate from the
                      same few compute nodes in multiple runs, which later system
                      diagnostic checks found had faulty DIMMs and lacklustre
                      performance. Excluding these compute nodes from subsequent
                      runs improved performance of executions with over 300,000
                      MPI processes by a factor of five, resulting in 190x
                      speed-up compared to 864 MPI processes. While communication
                      efficiency remains very good up to the largest scale,
                      parallel efficiency is primarily limited by load balance
                      found to be largely due to core-to-core and run-to-run
                      variability from excessive stalls for memory accesses, that
                      affect many HPC systems with Intel Xeon Scalable processors.
                      The POP methodology for this performance diagnosis is
                      demonstrated via a detailed exposition with widely deployed
                      'standard' measurement and analysis tools.},
      keywords     = {E-Government (gnd)},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {511 - Computational Science and Mathematical Methods
                      (POF3-511) / POP2 - Performance Optimisation and
                      Productivity 2 (824080) / CompBioMed - A Centre of
                      Excellence in Computational Biomedicine (675451) / ATMLPP -
                      ATML Parallel Performance (ATMLPP)},
      pid          = {G:(DE-HGF)POF3-511 / G:(EU-Grant)824080 /
                      G:(EU-Grant)675451 / G:(DE-Juel-1)ATMLPP},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      UT           = {WOS:000679395600007},
}