% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
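%
% As a usage note, here is a minimal sketch of citing this record with biblatex
% and the biber backend; the file name “references.bib” and the latexmk
% invocation are assumptions, not part of this record:
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % assumed name of this .bib file
%   \begin{document}
%   JUQUEEN Extreme Scaling Workshop 2016 report~\cite{Brmmel:283461}.
%   \printbibliography
%   \end{document}
%
% Compile with, e.g., “latexmk -pdf”, which runs biber automatically, or run
% pdflatex, then biber, then pdflatex twice by hand.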
@TECHREPORT{Brmmel:283461,
author = {Brömmel, Dirk and Frings, Wolfgang and Wylie, Brian J. N.},
title = {{JUQUEEN} {E}xtreme {S}caling {W}orkshop 2016},
number = {FZJ-JSC-IB-2016-01},
reportid = {FZJ-2016-01816, FZJ-JSC-IB-2016-01},
series = {JSC Internal Report},
pages = {67 p.},
year = {2016},
abstract = {Feedback from last year's very successful workshop
motivated the organisation of a three-day workshop held 1-3
February 2016, during which the entire 28-rack JUQUEEN
BlueGene/Q system with 458,752 cores was reserved for over
50 hours. Eight code teams were selected to use this
opportunity to investigate and improve their application
scalability, assisted by staff from JSC Simulation
Laboratories and Cross-sectional Teams. Code\_Saturne from
Daresbury Lab and Seven-League Hydro from HITS (Heidelberg)
were both able to display good strong scalability and
thereby become candidates for High-Q Club membership. Both
used 4 OpenMP threads per MPI process, with over 1.8 million
threads in total. Existing members, CIAO from RWTH-ITV and
iFETI from University of Cologne and TU Freiberg, were able
to show that they had additional solvers which also scaled
acceptably. In-situ interactive visualisation was
demonstrated with a CIAO simulation using 458,752 MPI
processes running on 28 racks coupled via JUSITU to VisIt.
Two adaptive mesh refinement libraries, p4est from
University of Bonn and IciMesh from Ecole Centrale de
Nantes, showed that they could scale to run with 917,504 and
458,752 MPI ranks, respectively, but both encountered problems
loading large meshes. Parallel file I/O limitations also
prevented large-scale executions of the FZJ IEK-6/Amphos21
PFLOTRAN subsurface flow and reactive transport code;
however, a NEST-import HDF5 module developed by the EPFL
Blue Brain Project could be optimised to use collective MPI
file reading calls to load and connect 1.9 TB of neuron and
synapse data and enable large-scale data-driven neuronal
network simulations with 458,752 threads. Detailed reports
are provided by each code team, along with additional
comparative analysis against the 25 High-Q Club member
codes. Despite more
mixed results than the previous workshop, we learnt more
about application file I/O limitations and inefficiencies
which continue to be the primary inhibitor to large-scale
simulations, and all participants found the workshop very
valuable.},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {511 - Computational Science and Mathematical Methods
(POF3-511) / 513 - Supercomputer Facility (POF3-513) /
ATMLPP - ATML Parallel Performance (ATMLPP) / ATMLAO - ATML
Application Optimization and User Service Tools (ATMLAO)},
pid = {G:(DE-HGF)POF3-511 / G:(DE-HGF)POF3-513 /
G:(DE-Juel-1)ATMLPP / G:(DE-Juel-1)ATMLAO},
typ = {PUB:(DE-HGF)3 / PUB:(DE-HGF)15},
url = {https://juser.fz-juelich.de/record/283461},
}