% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@INPROCEEDINGS{Thnnien:1031461,
      author       = {Thönnißen, Julia and Dickscheid, Timo},
      title        = {{B}alancing {E}fficiency and {S}tandardization for a
                      {M}icroscopic {I}mage {R}epository on an {HPC} {S}ystem},
      school       = {Heinrich-Heine-University Düsseldorf},
      reportid     = {FZJ-2024-05680},
      year         = {2024},
      abstract     = {Understanding the human brain is one of the greatest
                      challenges of modern science. In order to study its complex
                      structural and functional organization, data from different
                      modalities and at different resolutions must be linked. This
                      requires scalable and reproducible workflows ranging from
                      the extraction of multimodal data from different
                      repositories to AI-driven analysis and visualization [1].
                      One fundamental challenge therein is to store and organize
                      big image datasets in appropriate repositories. Here we
                      address the case of building a repository of high-resolution
                      microscopy scans for whole human brain sections, on the order
                      of multiple petabytes [1]. Since data duplication is
                      prohibitive for such volumes, images need to be stored in a
                      way that follows community standards, supports provenance
                      tracking, and meets the performance requirements of
                      high-throughput ingestion, highly parallel processing on HPC
                      systems, and ad-hoc random access for interactive
                      visualization. To digitize an entire human brain,
                      high-throughput scanners need to capture over 7000
                      histological brain sections. For each tissue section, the
                      scanner acquires a z-stack of 30 TIFF images, each
                      representing a different focus level.
                      The images are automatically transferred from the scanner to
                      a gateway server, where they are pre-organized into
                      subfolders per brain section for detailed automated quality
                      control (QC). Once a z-stack passes QC, it is transferred to
                      the parallel file system (GPFS) on the supercomputer via
                      an NFS mount. For one human brain, this results in 7000
                      folders holding about 2 PB of image data in roughly 20K
                      files in total. From there, the data are accessed
                      simultaneously by different applications and pipelines with
                      very heterogeneous requirements. HPC analyses based on deep
                      learning, such as cell segmentation or brain mapping, rely on
                      fast random access and parallel I/O to stream image patches
                      efficiently to GPUs. Remote visualization and annotation, on
                      the other hand, require exposing the data through an HTTP
                      service on a VM with access to higher-capacity storage, so
                      that different datasets can be served at the same time.
                      These demands can be
                      covered by multi-tier HPC storage, which provides dedicated
                      partitions. The High Performance Storage Tier offers low
                      latency and high bandwidth for analysis, while the Extended
                      Capacity Storage Tier is capacity-optimized at the cost of
                      latency, meeting the needs of visualization. Exposing the
                      data on different tiers requires controlled staging and
                      unstaging. We organize the image data folders via DataLad
                      datasets, which allows well-defined staging across these
                      partitions for different applications, ensures that all data
                      are tracked and versioned across distributed storage
                      throughout the workflow, and enables provenance tracking. To
                      reduce the
                      number of files in one DataLad repository, each section
                      folder has been designed as a subdataset of a superdataset
                      that contains all section folders. The current approach to
                      managing data has two deficiencies. Firstly, the TIFF format
                      is not optimized for HPC usage due to its lack of parallel
                      I/O support, which forces conversion to HDF5 and thus data
                      duplication. Secondly, the current data organization is not
                      compatible with upcoming community standards, complicating
                      collaborative efforts. Therefore, standardization of the
                      file format and folder structure is a major objective for
                      the near future. The widely accepted community standard for
                      organizing neuroscience data is the Brain Imaging Data
                      Structure (BIDS). Its extension for microscopy proposes
                      splitting the data into subjects and samples, while using
                      either (OME-)TIFF or OME-ZARR as a file format.
                      In particular, the NGFF file format OME-ZARR appears to be
                      a suitable choice for the described workflow, as it offers
                      better performance on HPC systems and cloud compatibility
                      compared to TIFF. However, restructuring the current data
                      layout is a
                      complex task. Adopting the BIDS standard results in a large
                      number of inodes and files because (1) multiple folders and
                      sidecar files are created and (2) OME-ZARR images consist of
                      many small files. The DataLad annex grows with the number of
                      files, leading to high inode usage and reduced performance.
                      An effective
                      solution to this problem may involve optimizing the size of
                      DataLad subdatasets. However, the key constraint is that GPFS
                      file systems enforce a hard limit on the number of inodes,
                      which cannot be exceeded. This raises the following
                      questions: How can inode usage be minimized while adhering
                      to BIDS and using DataLad? Should performant
                      file formats with minimal inode usage, such as ZARR v3 or
                      HDF5, be incorporated into the BIDS standard? What is a good
                      balance for DataLad subdataset sizes? Discussions with the
                      community may provide valuable perspectives for advancing
                      this issue. [1] Amunts K, Lippert T. Brain research
                      challenges supercomputing. Science 374, 1054-1055 (2021).
                      DOI:10.1126/science.abl8519},
      month         = {Apr},
      date          = {2024-04-04},
      organization  = {Distribits: technologies for
                       distributed data management,
                       Düsseldorf (Germany), 4 Apr 2024 - 6
                       Apr 2024},
      subtyp        = {After Call},
      cin          = {INM-1},
      cid          = {I:(DE-Juel1)INM-1-20090406},
      pnm          = {5254 - Neuroscientific Data Analytics and AI (POF4-525) /
                      EBRAINS 2.0 - EBRAINS 2.0: A Research Infrastructure to
                      Advance Neuroscience and Brain Health (101147319) / HIBALL -
                      Helmholtz International BigBrain Analytics and Learning
                      Laboratory (HIBALL) (InterLabs-0015) / DFG project 501864659
                      - NFDI4BIOIMAGE - Nationale Forschungsdateninfrastruktur
                      für Mikroskopie und Bildanalyse (501864659)},
      pid          = {G:(DE-HGF)POF4-5254 / G:(EU-Grant)101147319 /
                      G:(DE-HGF)InterLabs-0015 / G:(GEPRIS)501864659},
      typ          = {PUB:(DE-HGF)6},
      url          = {https://juser.fz-juelich.de/record/1031461},
}
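
% A minimal sketch of the per-section DataLad layout described in the abstract
% above (one superdataset containing one subdataset per brain-section folder),
% using DataLad's Python API. All paths and the section count are hypothetical
% placeholders, not the authors' actual pipeline.
%
%   import datalad.api as dl
%
%   # Superdataset that will contain all section folders
%   dl.create(path="brain_repo")
%
%   # Register each section folder as its own subdataset, keeping the number
%   # of files tracked per DataLad repository (and its annex) small
%   for i in range(1, 7001):
%       dl.create(dataset="brain_repo", path=f"brain_repo/section_{i:04d}")
%
%   # Record the resulting hierarchy in the superdataset and all subdatasets
%   dl.save(dataset="brain_repo", recursive=True,
%           message="Initialize per-section subdatasets")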