% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article on OneProt (PLoS Computational Biology, vol. 21, no. 11, 2025).
% Title: case protection braces whole words (OneProt, Towards) instead of
% single letters, so sentence-casing styles keep the capitals without a brace
% group splitting a word (which can disturb kerning and hyphenation).
% The fields reportid, cin, ddc, cid, pnm, pid and typ are not standard BibTeX
% fields; standard styles silently ignore them. They look like bookkeeping
% identifiers from the exporting repository -- confirm before removing.
% The citation key is kept as exported so existing citations keep resolving.
@article{Flge:1048464,
  author    = {Flöge, Klemens and Udayakumar, Srisruthi and Sommer,
               Johanna and Piraud, Marie and Kesselheim, Stefan and
               Fortuin, Vincent and Günnemann, Stephan and van der Weg,
               Karel J. and Gohlke, Holger and Merdivan, Erinc and
               Bazarova, Alina},
  title     = {{OneProt}: {Towards} multi-modal protein foundation
               models via latent space alignment of sequence, structure,
               binding sites and text encoders},
  journal   = {PLoS Computational Biology},
  volume    = {21},
  number    = {11},
  issn      = {1553-734X},
  address   = {San Francisco, Calif.},
  publisher = {Public Library of Science},
  reportid  = {FZJ-2025-04662},
  pages     = {e1013679},
  year      = {2025},
  abstract  = {Recent advances in Artificial Intelligence have enabled
               multi-modal systems to model and translate diverse
               information spaces. Extending beyond text and vision, we
               introduce OneProt, a multi-modal Deep Learning model for
               proteins that integrates structural, sequence, text, and
               binding site data. Using the ImageBind framework, OneProt
               aligns the latent spaces of protein modality encoders in a
               lightweight fine-tuning scheme that focuses on pairwise
               alignment with sequence data, rather than requiring full
               matches. This novel approach comprises a mix of Graph Neural
               Networks and transformer architectures. It demonstrates good
               performance in retrieval tasks and showcases the efficacy of
               multi-modal systems in Protein Machine Learning through a
               broad spectrum of downstream baselines, including enzyme
               function prediction and binding site analysis. Furthermore,
               OneProt enables the transfer of representational information
               from specialized encoders to the sequence encoder, enhancing
               capabilities for distinguishing evolutionarily related and
               unrelated sequences and exhibiting representational
               properties where evolutionarily related proteins align in
               similar directions within the latent space. In addition, we
               extensively investigate modality ablations to identify the
               encoders that contribute the most to predictive performance,
               highlighting the significance of the binding site encoder,
               which has not been used in similar models previously. This
               work expands the horizons of multi-modal protein models,
               paving the way for transformative applications in drug
               discovery, biocatalytic reaction planning, and protein
               engineering.},
  cin       = {IBG-4 / JSC},
  ddc       = {610},
  cid       = {I:(DE-Juel1)IBG-4-20200403 / I:(DE-Juel1)JSC-20090406},
  pnm       = {2171 - Biological and environmental resources for
               sustainable use (POF4-217) / 5112 - Cross-Domain Algorithms,
               Tools, Methods Labs (ATMLs) and Research Groups (POF4-511) /
               Helmholtz AI Consultant Team FB Information (E54.303.11)},
  pid       = {G:(DE-HGF)POF4-2171 / G:(DE-HGF)POF4-5112 /
               G:(DE-Juel-1)E54.303.11},
  typ       = {PUB:(DE-HGF)16},
  doi       = {10.1371/journal.pcbi.1013679},
  url       = {https://juser.fz-juelich.de/record/1048464},
}