% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@phdthesis{Gtz:841390,
  author    = {Götz, Markus},
  % Brace whole words (not single letters) to protect capitalisation from
  % sentence-casing styles without breaking kerning/hyphenation.
  title     = {{Scalable} {Data} {Analysis} in {High} {Performance}
               {Computing}},
  school    = {Universität Island},
  type      = {Dissertation},
  address   = {Reykjavik},
  publisher = {Háskólaprent, Universität Island},
  reportid  = {FZJ-2017-08465},
  isbn      = {978-9935-9383-2-9},
  pages     = {156 p.},
  year      = {2017},
  note      = {Dissertation, Universität Island, 2017},
  abstract  = {Over the last decades one could observe a drastic increase
               in the generation and storage of data in both, industry and
               science. While the field of data analysis is not new, it is
               now facing the challenge of coping with an increasing size,
               bandwidth and complexity of data. This renders traditional
               analysis methods and algorithms ineffective. This problem
               has been coined as the Big Data challenge. Concretely in
               science the major data producers are large-scale monolithic
               experiments and the outputs of domain simulations. Up until
               now, most of this data has not yet been completely analyzed,
               but rather stored in data repositories for later
               consideration due to the lack of efficient means of
               processing. As a consequence, there is a need for
               large-scale data analysis frameworks and algorithm libraries
               allowing to study these datasets. In context of scientific
               applications, potentially coupled with legacy simulations,
               the designated target platform are heterogeneous
               high-performance computing systems. This thesis proposes a
               design and prototypical realization of such a framework
               based on the experience collected from empirical
               applications. For this, selected scientific use cases, with
               an emphasis on earth sciences, were studied. In particular,
               these are object segmentation in point cloud data and
               biological imagery, outlier detection in oceanographic
               time-series data as well as land cover type classification
               in remote sensing images. In order to deal with the data
               amounts, two analysis algorithms have been parallelized for
               shared- and distributed-memory systems. Concretely, these
               are HPDBSCAN, a density-based clustering algorithm, as well
               as Distributed Max-Trees, a filtering step for images. The
               presented parallelization strategies have been abstracted
               into a generalized paradigm, enabling the formulation of
               scalable algorithms for other similar analysis methods.
               Moreover, it permits the definition of requirements for the
               design of a large-scale data analysis framework and
               algorithm library for heterogeneous, distributed
               high-performance computing systems. In line with that, the
               thesis presents a prototypical realization called Juelich
               Machine Learning Library (JuML), providing essential
               low-level components and readily usable analysis algorithm
               implementations.},
  cin       = {JSC},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {512 - Data-Intensive Science and Federated Computing
               (POF3-512) / PhD no Grant - Doktorand ohne besondere
               Förderung (PHD-NO-GRANT-20170405)},
  pid       = {G:(DE-HGF)POF3-512 / G:(DE-Juel1)PHD-NO-GRANT-20170405},
  typ       = {PUB:(DE-HGF)3 / PUB:(DE-HGF)11},
  url       = {https://juser.fz-juelich.de/record/841390},
}