% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Conference contribution (institutional repository export FZJ-2025-02904).
% Non-standard repository fields (reportid, subtyp, cin, cid, pnm, pid, typ)
% are ignored by standard styles; kept for provenance/round-tripping.
% Fixes vs. export: whole-word brace protection in title (single-letter
% braces like {C}on{T}ext break kerning), bare month macro (jun), and the
% required booktitle field for @inproceedings (taken from "organization").
@inproceedings{Upschulte:1043528,
  author       = {Upschulte, Eric and Amunts, Katrin and Dickscheid, Timo},
  title        = {{ConText} {Transformer}: Text-guided Instance Segmentation
                  in Scientific Imaging},
  booktitle    = {Helmholtz {AI} Conference 2025},
  reportid     = {FZJ-2025-02904},
  year         = {2025},
  month        = jun,
  date         = {2025-06-03},
  abstract     = {Scientific imaging gives rise to a multitude of different
                  segmentation tasks, many of which involve manually annotated
                  datasets. We have collected a large number of such
                  heterogeneous datasets, comprising over 10 million instance
                  annotations, and demonstrate that in a multi-task setting,
                  segmentation models at this scale cannot be effectively
                  trained using solely image-based supervised learning. A
                  major reason is that images from the same domain may be used
                  to address different research questions, with varying
                  annotation procedures and styles. For example, images of
                  biological tissues may be evaluated for nuclei or cell
                  bodies, despite using the same image modality. To overcome
                  these challenges, we propose using simple text-based task
                  descriptions to provide models with the necessary context
                  for solving a given objective. We introduce the ConText
                  Transformer, which implements a dual-stream architecture,
                  processing and fusing both image and text data. Based on the
                  provided textual descriptions, the model learns to adapt its
                  internal feature representations to effectively switch
                  between segmenting different classes and annotation styles
                  observed in the datasets. These descriptions can range from
                  simple class names (e.g., “white blood
                  cells”)—prompting the model to only segment the
                  referenced class—to more nuanced formulations such as
                  toggling the use of overlapping segmentations in model
                  predictions or segmenting a nucleus, even in the absence of
                  cytoplasm or membrane, as is common in datasets like
                  TissueNet but omitted in Cellpose. Since interpreting these
                  descriptions is part of the model training, it is also
                  possible to define dedicated terms abbreviating very complex
                  descriptions. ConText Transformer is designed for
                  compatibility. It can be used with existing segmentation
                  frameworks, including the Contour Proposal Network (CPN) or
                  Mask R-CNN. Our experiments on over 10 million instance
                  annotations show that ConText Transformer models achieve
                  competitive segmentation performance and outperform
                  specialized models in several benchmarks; confirming that a
                  single, unified model can effectively handle a wide spectrum
                  of segmentation tasks; and eventually may replace specialist
                  models in scientific image segmentation},
  organization = {Helmholtz AI Conference 2025,
                  Karlsruhe (Germany), 3 Jun 2025 - 5 Jun
                  2025},
  subtyp       = {After Call},
  cin          = {INM-1},
  cid          = {I:(DE-Juel1)INM-1-20090406},
  pnm          = {5254 - Neuroscientific Data Analytics and AI (POF4-525) /
                  DFG project G:(GEPRIS)313856816 - SPP 2041: Computational
                  Connectomics (313856816) / EBRAINS 2.0 - EBRAINS 2.0: A
                  Research Infrastructure to Advance Neuroscience and Brain
                  Health (101147319) / HIBALL - Helmholtz International
                  BigBrain Analytics and Learning Laboratory (HIBALL)
                  (InterLabs-0015) / Helmholtz AI - Helmholtz Artificial
                  Intelligence Coordination Unit – Local Unit FZJ
                  (E.40401.62)},
  pid          = {G:(DE-HGF)POF4-5254 / G:(GEPRIS)313856816 /
                  G:(EU-Grant)101147319 / G:(DE-HGF)InterLabs-0015 /
                  G:(DE-Juel-1)E.40401.62},
  typ          = {PUB:(DE-HGF)24},
  url          = {https://juser.fz-juelich.de/record/1043528},
}