% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Hoppe:1019997,
author = {Hoppe, Fabian and Comito, Claudia and Götz, Markus and
Gutiérrez Hermosillo Muriedas, Juan Pedro and Hagemeier,
Björn and Knechtges, Philipp and Krajsek, Kai and
Rüttgers, Alexander and Streit, Achim and Tarnawa, Michael},
title = {{T}he {H}elmholtz {A}nalytics {T}oolkit ({H}eat) and its
role in the landscape of massively-parallel scientific
{P}ython},
reportid = {FZJ-2023-05812},
year = {2023},
abstract = {When it comes to enhancing exploitation of massive data,
machine learning methods are at the forefront of
researchers’ awareness. Much less so is the need for, and
the complexity of, applying these techniques efficiently
across large-scale, memory-distributed data volumes. In fact,
these aspects, typical of handling massive data sets, pose
major challenges to the vast majority of research communities,
in particular to those without a background in high-performance
computing. Often, the standard approach involves breaking the
data up and analyzing it in smaller chunks; this can be
inefficient and error-prone, and sometimes it is entirely
inappropriate because the context of the overall data set is
lost. The Helmholtz Analytics
Toolkit (Heat) library offers a solution to this problem by
providing memory-distributed and hardware-accelerated array
manipulation, data analytics, and machine learning
algorithms in Python. The main objective is to make
memory-intensive data analysis possible across various
fields of research, in particular for domain scientists who
are not experts in traditional high-performance computing but
nevertheless need to tackle data analytics problems beyond the
capabilities of a single workstation. The
development of this interdisciplinary, general-purpose, and
open-source scientific Python library started in 2018 and is
based on a collaboration of three institutions of the Helmholtz
Association (German Aerospace Center DLR, Forschungszentrum
Jülich FZJ, Karlsruhe Institute of Technology KIT). The pillars
of its development are: to enable memory distribution of
n-dimensional arrays; to adopt PyTorch as the process-local
compute engine (hence supporting GPU acceleration); to provide
memory-distributed (i.e., multi-node, multi-GPU) array
operations and algorithms, optimizing asynchronous MPI
communication (based on mpi4py) under the hood; and to wrap
these functionalities in a NumPy- or scikit-learn-like API, so
that existing applications can be ported with minimal changes
and the library remains usable by non-experts in HPC. In this
talk we will give an illustrative overview of the current
features and capabilities of our
library. Moreover, we will discuss its role in the existing
ecosystem of distributed computing in Python, and we will
address technical and operational challenges in further
development.},
month = {Aug},
date = {2023-08-14},
organization = {EuroSciPy, Basel (Switzerland), 14 Aug
2023 - 17 Aug 2023},
subtyp = {After Call},
cin = {JSC},
cid = {I:(DE-Juel1)JSC-20090406},
pnm = {5111 - Domain-Specific Simulation $\&$ Data Life Cycle Labs
(SDLs) and Research Groups (POF4-511) / 5112 - Cross-Domain
Algorithms, Tools, Methods Labs (ATMLs) and Research Groups
(POF4-511) / SLNS - SimLab Neuroscience (Helmholtz-SLNS)},
pid = {G:(DE-HGF)POF4-5111 / G:(DE-HGF)POF4-5112 /
G:(DE-Juel1)Helmholtz-SLNS},
typ = {PUB:(DE-HGF)6},
url = {https://juser.fz-juelich.de/record/1019997},
}
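% Note (not part of the bibliographic record above): the abstract mentions that
% Heat wraps memory-distributed, PyTorch-backed array operations in a NumPy-like
% API. The following minimal Python sketch illustrates what such a script can
% look like; it assumes Heat's documented "split" semantics and an MPI launch
% such as "mpirun -n 4 python heat_demo.py", and exact signatures may differ
% between Heat versions.
%
%     import heat as ht
%
%     # Distributed array: split=0 partitions the first axis across the MPI
%     # processes; each process holds its chunk as a local PyTorch tensor.
%     x = ht.arange(1_000_000, dtype=ht.float32, split=0)
%
%     # NumPy-like operations act on the global array; any required MPI
%     # communication (e.g. for the reductions in mean/std) happens under
%     # the hood.
%     y = (x - x.mean()) / x.std()
%
%     print(y.shape, y.split)  # global shape and split axis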