% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Thnnien:1033590,
author = {Thönnißen, Julia and Dickscheid, Timo and Hanke, Michael},
title = {{S}calable {D}ata {M}anagement for {H}igh-{R}esolution
{M}icroscopy of the {H}uman {B}rain: {C}hallenges and
{F}uture {D}irections},
reportid = {FZJ-2024-06470},
year = {2024},
abstract = {In order to investigate the complex structural and
functional organization of the human brain, data must be
integrated across multiple modalities and resolutions. This
requires the implementation of scalable workflows for data
extraction, AI-driven analysis, and visualization. A key
challenge in this process is the storage and organization of
large image datasets in suitable repositories. Due to the
prohibitive cost of data duplication at this scale, storage
systems must adhere to community standards, enable
provenance tracking, and meet the performance demands of
high-throughput data ingestion, highly parallel processing
on HPC systems, and random access for interactive
visualization. In this context we address the case of
building a repository of high-resolution microscopy scans
for whole human brain sections in the order of multiple
petabytes. Digitizing a human brain using whole-slide imaging
of cell body stained tissue sections requires capturing
about 7,000-8,000 histological sections at 20 micrometer
thickness using high-throughput scanners. When aiming for an
isotropic resolution of 1 micrometer, each histological
section generates 29 TIFF images (a “z-stack”)
representing different focus levels. These images are
automatically transferred to a gateway server for initial
organization and automated quality control (QC). After QC,
the z-stack is moved to a parallel file system (GPFS) on a
supercomputer, generating approximately 2 petabytes of image
data across 200,000 files for a single brain. These data are
then accessed by various applications and pipelines, each
with distinct requirements. HPC applications, such as deep
learning-based cell segmentation and brain mapping, rely on
fast random access and parallel I/O to efficiently stream
image patches to GPUs. In contrast, remote visualization and
annotation require access via an HTTP service, along with
higher-capacity storage for serving diverse data
concurrently. A multi-tier HPC storage system addresses
these needs: the high-performance storage tier offers low
latency and high bandwidth for analysis, while the
capacity-optimized extended storage tier meets visualization
requirements. Controlled data staging across these tiers is
crucial and is managed using DataLad, which enables
well-defined staging, comprehensive tracking, and version
control of image datasets across distributed storage
systems. Each brain section is organized as a distinct
DataLad dataset to minimize the number of files per
repository. However, the current data management approach
presents two major challenges. First, the TIFF format lacks
support for parallel I/O, leading to data duplication when
converting to HDF5 for HPC workflows. Second, the existing
data organization is not aligned with community standards,
hindering collaboration. Therefore, a major objective is
standardization of both file formats and folder structures.
However, adopting standards such as the Brain Imaging Data
Structure (BIDS) poses significant challenges due to the
large number of files arising from its many folders and
sidecar files, as well as the small-file structure of
OME-ZARR, which conflicts with the inode restrictions on
GPFS file systems. To address these challenges,
optimizing the size of DataLad datasets and exploring ways
to reduce inode usage are essential. Questions remain about
whether file formats like ZARR v3 or HDF5, which minimize
inode consumption, should be integrated into the BIDS
standard. Community discussions may provide solutions to
these issues.},
month = {Nov},
date = {2024-11-19},
organization = {INM Retreat 2024, Jülich (Germany),
19 Nov 2024 - 20 Nov 2024},
subtyp = {After Call},
cin = {INM-1},
cid = {I:(DE-Juel1)INM-1-20090406},
pnm = {5251 - Multilevel Brain Organization and Variability
(POF4-525) / 5254 - Neuroscientific Data Analytics and AI
(POF4-525) / HIBALL - Helmholtz International BigBrain
Analytics and Learning Laboratory (HIBALL) (InterLabs-0015)
/ EBRAINS 2.0 - EBRAINS 2.0: A Research Infrastructure to
Advance Neuroscience and Brain Health (101147319) / DFG
project G:(GEPRIS)501864659 - NFDI4BIOIMAGE - Nationale
Forschungsdateninfrastruktur für Mikroskopie und
Bildanalyse (501864659)},
pid = {G:(DE-HGF)POF4-5251 / G:(DE-HGF)POF4-5254 /
G:(DE-HGF)InterLabs-0015 / G:(EU-Grant)101147319 /
G:(GEPRIS)501864659},
typ = {PUB:(DE-HGF)24},
url = {https://juser.fz-juelich.de/record/1033590},
}
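% The abstract above describes organizing each brain section as its own
% DataLad dataset and staging image data between a high-performance and a
% capacity-optimized storage tier. Below is a minimal, hypothetical Python
% sketch of that pattern using DataLad's Python API (datalad.api). The paths,
% the "zstack" subdirectory, and the "capacity-tier" sibling name are
% illustrative assumptions, not the authors' actual pipeline.
%
%   import datalad.api as dl
%   from pathlib import Path
%
%   # Hypothetical location of one histological section on the GPFS tier.
%   section = Path("/gpfs/scratch/brain1/section_0001")
%
%   # One DataLad dataset per section keeps the file count per repository low.
%   ds = dl.create(path=str(section))
%
%   # Record the z-stack TIFFs placed into the dataset after quality control.
%   ds.save(message="Add z-stack after QC")
%
%   # Publish annexed content to a sibling on the capacity-optimized tier
%   # (assumed to be configured beforehand, e.g. via `datalad siblings add`).
%   ds.push(to="capacity-tier")
%
%   # Stage content back onto the high-performance tier for HPC analysis,
%   # and drop the local copies once processing has finished.
%   ds.get(path=str(section / "zstack"))
%   ds.drop(path=str(section / "zstack"))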