% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Erlingsson:852519,
author = {Erlingsson, Ernir and Neukirchen, Helmut and Cavallaro,
Gabriele and Riedel, Morris},
title = {{S}caling {DBSCAN} towards exascale computing for
clustering of big datasets},
      journal      = {Geophysical Research Abstracts},
volume = {20},
issn = {1607-7962},
address = {Katlenburg-Lindau},
publisher = {Soc.},
reportid = {FZJ-2018-05447},
pages = {EGU2018-16171},
year = {2018},
      abstract     = {Progress in sensor technology allows us to collect
                      environmental data in more detail and with better
                      resolution than ever before. One example is 3D laser
                      scanners that generate 3D point-cloud datasets for land
                      survey. Clustering can then be performed on these datasets
                      to identify objects such as buildings, trees, or rocks in
                      the unstructured point-clouds. Segmenting huge point-clouds
                      (of whole cities or even whole countries) into objects is a
                      computationally expensive operation and therefore requires
                      parallel processing. Density-based spatial clustering of
                      applications with noise (DBSCAN) is a popular clustering
                      algorithm, and HPDBSCAN is an efficient parallel
                      implementation of it running on supercomputing clusters.
                      Tomorrow’s supercomputers will be able to provide exascale
                      computing performance by exploiting specialised hardware
                      accelerators; however, existing software needs to be
                      adapted to make use of the best-fitting accelerators. To
                      address this problem, we present a mapping of HPDBSCAN to a
                      pre-exascale platform currently being developed by the
                      European DEEP-EST project. It is based on the Modular
                      Supercomputer Architecture (MSA), which provides a set of
                      accelerator modules that we exploit in novel ways to
                      enhance HPDBSCAN to reach exascale performance. These MSA
                      modules include: a Cluster Module (CM) with powerful
                      multicore CPUs; the Extreme Scale Booster (ESB) module with
                      manycore CPUs; the Network Attached Memory (NAM) module,
                      which stores datasets and provides extremely fast access to
                      them; and a fast interconnect fabric that speeds up
                      inter-process message passing together with the Global
                      Collective Engine (GCE), which includes a multi-purpose
                      Field Programmable Gate Array (FPGA) for, e.g., summing up
                      values transmitted in collected messages. HPDBSCAN exploits
                      the above accelerator modules as follows: the data that is
                      to be clustered can be stored in the NAM; it is
                      subsequently distributed and load balanced, which is
                      accelerated by the GCE, to the CPU nodes of the CM; the
                      parallel clustering itself is performed by the powerful
                      CPUs of the CM, which also merge the obtained cluster IDs;
                      and the merged cluster IDs are stored in the NAM for
                      further level-of-detail (LoD) studies, i.e. zooming in and
                      out based on continuous, instead of fixed, levels of
                      importance for each point, which can be regarded as an
                      added dimension. The ESB module (supported by the GCE) is
                      most suitable to calculate these continuous level of
                      importance (cLoI) values and add them to the dataset in the
                      NAM. Based on the added cLoI data, the LoD studies can then
                      be performed by re-clustering as described previously, i.e.
                      distribution and load balancing of the cLoI value-enriched
                      dataset followed by parallel clustering. The described
                      approach will allow HPDBSCAN-based clusterings to scale
                      towards exascale performance on tomorrow’s hardware.},
month = {Apr},
date = {2018-04-08},
organization = {EGU General Assembly 2018, Wien
(Austria), 8 Apr 2018 - 13 Apr 2018},
cin = {JSC},
ddc = {550},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {512 - Data-Intensive Science and Federated Computing
(POF3-512) / DEEP-EST - DEEP - Extreme Scale Technologies
(754304)},
pid = {G:(DE-HGF)POF3-512 / G:(EU-Grant)754304},
typ = {PUB:(DE-HGF)16},
url = {https://juser.fz-juelich.de/record/852519},
}