% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article record for the NucleoSeeker paper (NAR Genomics and
% Bioinformatics 7(1), 2025). The fields reportid, cin, ddc, cid, pnm, pid,
% typ, pubmed and UT are not standard BibTeX fields; they appear to be
% institutional-repository metadata (TODO confirm) and are ignored by
% standard bibliography styles. The abstract is kept verbatim.
@article{Upadhyay:1043532,
  author    = {Upadhyay, Utkarsh and Pucci, Fabrizio and Herold, Julian
               and Schug, Alexander},
  title     = {{NucleoSeeker}—precision filtering of {RNA} databases
               to curate high-quality datasets},
  journal   = {{NAR} Genomics and Bioinformatics},
  volume    = {7},
  number    = {1},
  issn      = {2631-9268},
  address   = {Oxford},
  publisher = {Oxford University Press},
  reportid  = {FZJ-2025-02908},
  pages     = {lqaf021},
  year      = {2025},
  abstract  = {The structural prediction of biomolecules via computational
               methods complements the often involved wet-lab experiments.
               Unlike protein structure prediction, RNA structure
               prediction remains a significant challenge in
               bioinformatics, primarily due to the scarcity of annotated
               RNA structure data and its varying quality. Many methods
               have used this limited data to train deep learning models
               but redundancy, data leakage and bad data quality hampers
               their performance. In this work, we present NucleoSeeker, a
               tool designed to curate high-quality, tailored datasets from
               the Protein Data Bank (PDB) database. It is a unified
               framework that combines multiple tools and streamlines an
               otherwise complicated process of data curation. It offers
               multiple filters at structure, sequence, and annotation
               levels, giving researchers full control over data curation.
               Further, we present several use cases. In particular, we
               demonstrate how NucleoSeeker allows the creation of a
               nonredundant RNA structure dataset to assess AlphaFold3’s
               performance for RNA structure prediction. This demonstrates
               NucleoSeeker’s effectiveness in curating valuable
               nonredundant tailored datasets to both train novel and judge
               existing methods. NucleoSeeker is very easy to use, highly
               flexible, and can significantly increase the quality of RNA
               structure datasets.},
  cin       = {JSC},
  ddc       = {570},
  cid       = {I:(DE-Juel1)JSC-20090406},
  pnm       = {5111 - Domain-Specific Simulation \& Data Life Cycle Labs
               (SDLs) and Research Groups (POF4-511) / Helmholtz AI -
               Helmholtz Artificial Intelligence Coordination Unit –
               Local Unit FZJ (E.40401.62)},
  pid       = {G:(DE-HGF)POF4-5111 / G:(DE-Juel-1)E.40401.62},
  typ       = {PUB:(DE-HGF)16},
  pubmed    = {40104673},
  UT        = {WOS:001446715300001},
  doi       = {10.1093/nargab/lqaf021},
  url       = {https://juser.fz-juelich.de/record/1043532},
}