% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Flint:892632,
      author       = {Flint, Claas and Cearns, Micah and Opel, Nils and Redlich,
                      Ronny and Mehler, David M. A. and Emden, Daniel and Winter,
                      Nils R. and Leenings, Ramona and Eickhoff, Simon B. and
                      Kircher, Tilo and Krug, Axel and Nenadic, Igor and Arolt,
                      Volker and Clark, Scott and Baune, Bernhard T. and Jiang,
                      Xiaoyi and Dannlowski, Udo and Hahn, Tim},
      title        = {Systematic misestimation of machine learning performance
                      in neuroimaging studies of depression},
      journal      = {Neuropsychopharmacology},
      volume       = {46},
      number       = {8},
      issn         = {0893-133X},
      address      = {Basingstoke},
      publisher    = {Nature Publishing Group},
      reportid     = {FZJ-2021-02221},
      pages        = {1510--1517},
      year         = {2021},
      abstract     = {We currently observe a disconcerting phenomenon in machine
                      learning studies in psychiatry: While we would expect larger
                      samples to yield better results due to the availability of
                      more data, larger machine learning studies consistently show
                      much weaker performance than the numerous small-scale
                      studies. Here, we systematically investigated this effect
                      focusing on one of the most heavily studied questions in the
                      field, namely the classification of patients suffering from
                      Major Depressive Disorder (MDD) and healthy controls based
                      on neuroimaging data. Drawing upon structural MRI data from
                      a balanced sample of N = 1868 MDD patients and healthy
                      controls from our recent international Predictive Analytics
                      Competition (PAC), we first trained and tested a
                      classification model on the full dataset which yielded an
                      accuracy of $61\%$. Next, we mimicked the process by which
                      researchers would draw samples of various sizes (N = 4 to N
                      = 150) from the population and showed a strong risk of
                      misestimation. Specifically, for small sample sizes (N =
                      20), we observe accuracies of up to $95\%$. For medium
                      sample sizes (N = 100) accuracies up to $75\%$ were found.
                      Importantly, further investigation showed that sufficiently
                      large test sets effectively protect against performance
                      misestimation whereas larger datasets per se do not. While
                      these results question the validity of a substantial part of
                      the current literature, we outline the relatively low-cost
                      remedy of larger test sets, which is readily available in
                      most cases.},
      cin          = {INM-7},
      ddc          = {610},
      cid          = {I:(DE-Juel1)INM-7-20090406},
      pnm          = {525 - Decoding Brain Organization and Dysfunction
                      (POF4-525)},
      pid          = {G:(DE-HGF)POF4-525},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {33958703},
      UT           = {WOS:000647877800001},
      doi          = {10.1038/s41386-021-01020-7},
      url          = {https://juser.fz-juelich.de/record/892632},
}