% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Wylie:885773,
author = {Wylie, Brian J. N.},
title = {{E}xascale potholes for {HPC}: {E}xecution performance and
variability analysis of the flagship application code
{H}eme{LB}},
publisher = {IEEE},
reportid = {FZJ-2020-04080},
isbn = {978-0-7381-1070-7},
pages = {59-70},
year = {2020},
booktitle = {Proceedings of 2020 IEEE/ACM
International Workshop on HPC User
Support Tools (HUST) and the Workshop
on Programming and Performance
Visualization Tools (ProTools)},
abstract = {Performance measurement and analysis of parallel
applications are often challenging, despite many excellent
commercial and open-source tools being available. Currently
envisaged exascale computer systems exacerbate matters by
requiring extremely high scalability to effectively exploit
millions of processor cores. Unfortunately, significant
application execution performance variability arising from
increasingly complex interactions between hardware and
system software makes this situation much more difficult for
application developers and performance analysts alike. This
work considers the performance assessment of the HemeLB
exascale flagship application code from the EU HPC Centre of
Excellence (CoE) for Computational Biomedicine (CompBioMed)
running on the SuperMUC-NG Tier-0 leadership system, using
the methodology of the Performance Optimisation and
Productivity (POP) CoE. Although $80\%$ scaling efficiency
is maintained to over 100,000 MPI processes, disappointing
initial performance with more processes and correspondingly
poor strong scaling were identified as originating from the
same few compute nodes in multiple runs, which later system
diagnostic checks found had faulty DIMMs and lacklustre
performance. Excluding these compute nodes from subsequent
runs improved the performance of executions with over
300,000 MPI processes by a factor of five, resulting in a
190x speed-up compared to 864 MPI processes. While
communication efficiency remains very good up to the
largest scale, parallel efficiency is primarily limited by
load balance, found to be largely due to core-to-core and
run-to-run variability from excessive stalls for memory
accesses, which affect many HPC systems with Intel Xeon
Scalable processors.
The POP methodology for this performance diagnosis is
demonstrated via a detailed exposition with widely deployed
'standard' measurement and analysis tools.},
month = {Nov},
date = {2020-11-12},
organization = {Workshop on Programming and
Performance Visualization Tools, online,
12 Nov 2020},
keywords = {E-Government (gnd)},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {511 - Computational Science and Mathematical Methods
(POF3-511) / POP2 - Performance Optimisation and
Productivity 2 (824080) / CompBioMed - A Centre of
Excellence in Computational Biomedicine (675451) / ATMLPP -
ATML Parallel Performance (ATMLPP)},
pid = {G:(DE-HGF)POF3-511 / G:(EU-Grant)824080 /
G:(EU-Grant)675451 / G:(DE-Juel-1)ATMLPP},
typ = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
UT = {WOS:000679395600007},
doi = {10.1109/HUSTProtools51951.2020.00014},
url = {https://juser.fz-juelich.de/record/885773},
}
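%
% Note on the efficiency figures in the abstract above: the analysis follows
% the POP CoE metric hierarchy, in which (per the publicly documented POP
% metrics, stated here as background rather than taken from this record)
% parallel efficiency is the product of load balance and communication
% efficiency:
%
%   \mathrm{ParEff} = \mathrm{LB} \times \mathrm{CommEff}
%
% This is why "very good" communication efficiency combined with a limiting
% load balance still caps parallel efficiency. As a rough check of the
% quoted strong-scaling figures, the scaling efficiency relative to the
% 864-process baseline is E(p) = S(p) / (p / 864); with the reported
% speed-up S(p) = 190 and p > 300,000 MPI processes this gives
%
%   E(p) = \frac{190 \times 864}{p} < \frac{164160}{300000} \approx 0.55
%
% i.e. at most about 55%. The exact process count is not stated in the
% abstract, so this is only an upper bound.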