% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article on the High-Q Club (extreme-scaling codes on JUQUEEN).
% Only the proper name in the title is brace-protected, so that styles which
% apply sentence casing can still recase the remaining words.
% reportid, cin, ddc, cid, pnm, pid and typ are not standard BibTeX fields;
% standard styles ignore unknown fields. NOTE(review): they look like
% repository export metadata -- confirm before pruning them.
@article{Brmmel:845503,
  author    = {Brömmel, Dirk and Frings, Wolfgang and Wylie, Brian J. N.
               and Mohr, Bernd and Gibbon, Paul and Lippert, Thomas},
  title     = {The {High-Q Club}: Experience with Extreme-scaling
               Application Codes},
  journal   = {Supercomputing Frontiers and Innovations},
  volume    = {5},
  number    = {1},
  issn      = {2313-8734},
  address   = {Chelyabinsk},
  publisher = {South Ural State University},
  reportid  = {FZJ-2018-02737},
  pages     = {59--78},
  year      = {2018},
  abstract  = {Jülich Supercomputing Centre (JSC) started running
               (extreme) scaling workshops with its first IBM Blue Gene
               supercomputer, finally spanning three generations each
               seeing an increase in the number of cores and available
               threads. Over the years, this workshop series attracted
               numerous international code teams and resulted in many
               applications capable of running on all available cores of
               each system. This article reviews some of the knowledge
               gained with running and tuning highly-scalable applications,
               focussing on JUQUEEN, the IBM Blue Gene/Q at JSC. The
               ability to execute successfully on all 458752 cores with up
               to 1.8 million processes or threads may qualify codes for
               the High-Q Club, which serves as a showcase for diverse
               codes scaling to the entire 28 racks, effectively defining a
               collection of the highest scaling codes on JUQUEEN. The
               intention was to encourage other developers to invest in
               tuning and scaling their codes while identifying the
               necessary key aspects for that goal. As this era closes, it
               is timely to compare the characteristics of the 32 High-Q
               Club member codes, considering their strong and/or weak
               scaling, exploitation of hardware threading, and whether/how
               intra-node multi-threading is employed combined with
               message-passing. We also identify the obstacles for scaling
               such as inefficient use of limited compute node memory and
               file I/O as key governing factors. Overall, the analysis
               provides guidance as to how applications may (need to) be
               designed in future to exploit expected exa-scale computer
               systems.},
  cin       = {JSC},
  ddc       = {004},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {511 - Computational Science and Mathematical Methods
               (POF3-511) / 513 - Supercomputer Facility (POF3-513) /
               ATMLPP - ATML Parallel Performance (ATMLPP) / ATMLAO - ATML
               Application Optimization and User Service Tools (ATMLAO)},
  pid       = {G:(DE-HGF)POF3-511 / G:(DE-HGF)POF3-513 /
               G:(DE-Juel-1)ATMLPP / G:(DE-Juel-1)ATMLAO},
  typ       = {PUB:(DE-HGF)16},
  doi       = {10.14529/jsfi180104},
  url       = {https://juser.fz-juelich.de/record/845503},
}