% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
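% Example: a minimal sketch of citing this entry with biblatex and the biber
% backend (which handles UTF-8); the file name "references.bib" below is a
% hypothetical placeholder for wherever this entry is stored.
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % .bib file containing the entry below
%   \begin{document}
%   See the repository design discussed in \cite{Thnnien:1031461}.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex again.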
@INPROCEEDINGS{Thnnien:1031461,
author = {Thönnißen, Julia and Dickscheid, Timo},
title = {{B}alancing {E}fficiency and {S}tandardization for a
{M}icroscopic {I}mage {R}epository on an {HPC} {S}ystem},
school = {Heinrich-Heine-University Düsseldorf},
reportid = {FZJ-2024-05680},
year = {2024},
abstract = {Understanding the human brain is one of the greatest
challenges of modern science. In order to study its complex
structural and functional organization, data from different
modalities and resolutions must be linked together. This
requires scalable and reproducible workflows ranging from
the extraction of multimodal data from different
repositories to AI-driven analysis and visualization [1].
One fundamental challenge therein is to store and organize
big image datasets in appropriate repositories. Here we
address the case of building a repository of high-resolution
microscopy scans of whole human brain sections on the order
of multiple petabytes [1]. Since data duplication is
prohibitive for such volumes, images need to be stored in a
way that follows community standards, supports provenance
tracking, and meets performance requirements of
high-throughput ingestion, highly parallel processing on HPC
systems, as well as ad-hoc random access for interactive
visualization. To digitize an entire human brain,
high-throughput scanners need to capture over 7000
histological brain sections. For each tissue section, the
scanner acquires a z-stack of 30 TIFF images, each
representing a different focus level.
The images are automatically transferred from the scanner to
a gateway server, where they are pre-organized into
subfolders per brain section for detailed automated quality
control (QC). Once a z-stack passes QC, it is transferred to
the parallel file system (GPFS) on the supercomputer via an
NFS mount. For one human brain, this results in 7000 folders
with about 2 PByte of image data in about 210K files in
total. From there, the data are accessed simultaneously by
different applications and pipelines with very heterogeneous
requirements. HPC analyses based on deep learning, such as
cell segmentation or brain mapping, rely on
fast random access and parallel I/O to stream image patches
efficiently to GPUs. Remote visualization and annotation, on
the other hand, require exposing the data through an HTTP
service on a VM with access to higher-capacity storage to
serve different datasets at the same time. These demands can
be met by multi-tier HPC storage, which provides dedicated
partitions. The High Performance Storage Tier offers low
latency and high bandwidth for analysis, while the Extended
Capacity Storage Tier is capacity-optimized with higher
latency, meeting the needs of visualization. Exposing the
data on different tiers requires controlled staging and
unstaging. We organize the image data folders via DataLad
datasets, which allows well-defined staging across these
partitions for different applications, ensures that all data
are tracked and versioned across distributed storage throughout
the workflow, and enables provenance tracking. To reduce the
number of files in one DataLad repository, each section
folder has been designed as a subdataset of a superdataset
that contains all section folders. The current approach to
managing data has two deficiencies. Firstly, the TIFF format
is not optimized for HPC usage because it lacks parallel I/O
support, so images have to be converted to HDF5, resulting in
data duplication. Secondly, the current data organization is not
compatible with upcoming community standards, complicating
collaborative efforts. Therefore, standardization of the
file format and folder structure is a major objective for
the near future. The widely accepted community standard for
organizing neuroscience data is the Brain Imaging Data
Structure (BIDS). Its extension for microscopy proposes
splitting the data into subjects and samples, using either
(OME-)TIFF or OME-ZARR as the file format. In particular, the
NGFF file format OME-ZARR appears to be the most suitable
choice for the described workflow, as it is more performant
on HPC systems and, unlike TIFF, cloud-compatible. However,
restructuring the current data layout is a
complex task. Adopting the BIDS standard results in a large
number of inodes and files because (1) multiple folders and
sidecar files are created and (2) OME-ZARR datasets consist
of many small files. The DataLad annex grows with the number
of files, leading to high inode usage and reduced
performance. An effective
solution to this problem may involve optimizing the size of
DataLad subdatasets. However, the key consideration is that
GPFS file systems enforce a limit on the number of inodes,
which cannot be exceeded. This raises the following
questions: How can usage of inodes be minimized while
adhering to BIDS and utilizing DataLad? Should performant
file formats with minimal inode usage, such as ZARR v3 or
HDF5, be incorporated into the BIDS standard? What is a good
balance for DataLad subdataset sizes? Discussions with the
community may provide valuable perspectives for advancing
this issue. [1] Amunts K, Lippert T. Brain research
challenges supercomputing. Science 374, 1054-1055 (2021).
DOI:10.1126/science.abl8519},
month = {Apr},
date = {2024-04-04},
organization = {Distribits: technologies for
distributed data management,
Düsseldorf (Germany), 4 Apr 2024 - 6
Apr 2024},
subtyp = {After Call},
cin = {INM-1},
cid = {I:(DE-Juel1)INM-1-20090406},
pnm = {5254 - Neuroscientific Data Analytics and AI (POF4-525) /
EBRAINS 2.0 - EBRAINS 2.0: A Research Infrastructure to
Advance Neuroscience and Brain Health (101147319) / HIBALL -
Helmholtz International BigBrain Analytics and Learning
Laboratory (HIBALL) (InterLabs-0015) / DFG project 501864659
- NFDI4BIOIMAGE - Nationale Forschungsdateninfrastruktur
für Mikroskopie und Bildanalyse (501864659)},
pid = {G:(DE-HGF)POF4-5254 / G:(EU-Grant)101147319 /
G:(DE-HGF)InterLabs-0015 / G:(GEPRIS)501864659},
typ = {PUB:(DE-HGF)6},
url = {https://juser.fz-juelich.de/record/1031461},
}