MISATO: machine learning dataset of protein–ligand complexes for structure-based drug discovery

Siebenmorgen, Till; Merdivan, Erinc; Piraud, Marie; Popowicz, Grzegorz M.; Benassou, Sabrina; Menezes, Filipe; Liò, Pietro; Kitel, Radosław; Theis, Fabian J.; Kesselheim, Stefan; Sattler, Michael; Mourão, André Santos Dias; Didi, Kieran

doi:10.1038/s43588-024-00627-2

% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article introducing the MISATO protein-ligand dataset
% (Nature Computational Science 4(5), 2024); see the doi and url fields.
% NOTE(review): reportid, cin, ddc, cid, pnm, pid, typ, pubmed and UT are not
% standard BibTeX fields; they look like repository bookkeeping from the
% record linked in url (TODO confirm). Unknown fields are silently ignored by
% bibliography styles, so they are kept unchanged.
% In the abstract, "approximately 20,000" is written with $\sim$ because a
% bare tilde is a non-breaking space in LaTeX and would not be printed.
@article{Siebenmorgen:1037654,
      author       = {Siebenmorgen, Till and Menezes, Filipe and Benassou,
                      Sabrina and Merdivan, Erinc and Didi, Kieran and Mourão,
                      André Santos Dias and Kitel, Radosław and Liò, Pietro and
                      Kesselheim, Stefan and Piraud, Marie and Theis, Fabian J.
                      and Sattler, Michael and Popowicz, Grzegorz M.},
      title        = {{MISATO}: machine learning dataset of protein–ligand
                      complexes for structure-based drug discovery},
      journal      = {Nature Computational Science},
      volume       = {4},
      number       = {5},
      issn         = {2662-8457},
      address      = {London},
      publisher    = {Nature Research},
      reportid     = {FZJ-2025-00819},
      pages        = {367--378},
      year         = {2024},
      abstract     = {Large language models have greatly enhanced our ability to
                      understand biology and chemistry, yet robust methods for
                      structure-based drug discovery, quantum chemistry and
                      structural biology are still sparse. Precise
                      biomolecule–ligand interaction datasets are urgently
                      needed for large language models. To address this, we
                      present MISATO, a dataset that combines quantum mechanical
                      properties of small molecules and associated molecular
                      dynamics simulations of $\sim$20,000 experimental
                      protein–ligand complexes with extensive validation of
                      experimental data. Starting from the existing experimental
                      structures, semi-empirical quantum mechanics was used to
                      systematically refine these structures. A large collection
                      of molecular dynamics traces of protein–ligand complexes
                      in explicit water is included, accumulating over 170 μs.
                      We give examples of machine learning (ML) baseline models
                      proving an improvement of accuracy by employing our data. An
                      easy entry point for ML experts is provided to enable the
                      next generation of drug discovery artificial intelligence
                      models.},
      cin          = {JSC},
      ddc          = {004},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {38730184},
      UT           = {WOS:001220857400002},
      doi          = {10.1038/s43588-024-00627-2},
      url          = {https://juser.fz-juelich.de/record/1037654},
}

guest :: login JuSER
		Search		Submit		Personalize Your alerts Your baskets Your searches		Help