% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Flint:892632,
      author       = {Flint, Claas and Cearns, Micah and Opel, Nils and Redlich,
                      Ronny and Mehler, David M. A. and Emden, Daniel and Winter,
                      Nils R. and Leenings, Ramona and Eickhoff, Simon B. and
                      Kircher, Tilo and Krug, Axel and Nenadic, Igor and Arolt,
                      Volker and Clark, Scott and Baune, Bernhard T. and Jiang,
                      Xiaoyi and Dannlowski, Udo and Hahn, Tim},
      title        = {Systematic misestimation of machine learning performance
                      in neuroimaging studies of depression},
      journal      = {Neuropsychopharmacology},
      volume       = {46},
      number       = {8},
      issn         = {0893-133X},
      address      = {Basingstoke},
      publisher    = {Nature Publishing Group},
      reportid     = {FZJ-2021-02221},
      pages        = {1510--1517},
      year         = {2021},
      abstract     = {We currently observe a disconcerting phenomenon in machine
                      learning studies in psychiatry: While we would expect larger
                      samples to yield better results due to the availability of
                      more data, larger machine learning studies consistently show
                      much weaker performance than the numerous small-scale
                      studies. Here, we systematically investigated this effect
                      focusing on one of the most heavily studied questions in the
                      field, namely the classification of patients suffering from
                      Major Depressive Disorder (MDD) and healthy controls based
                      on neuroimaging data. Drawing upon structural MRI data from
                      a balanced sample of N = 1868 MDD patients and healthy
                      controls from our recent international Predictive Analytics
                      Competition (PAC), we first trained and tested a
                      classification model on the full dataset which yielded an
                      accuracy of $61\%.$ Next, we mimicked the process by which
                      researchers would draw samples of various sizes (N = 4 to N
                      = 150) from the population and showed a strong risk of
                      misestimation. Specifically, for small sample sizes (N =
                      20), we observe accuracies of up to $95\%.$ For medium
                      sample sizes (N = 100) accuracies up to $75\%$ were found.
                      Importantly, further investigation showed that sufficiently
                      large test sets effectively protect against performance
                      misestimation whereas larger datasets per se do not. While
                      these results question the validity of a substantial part of
                      the current literature, we outline the relatively low-cost
                      remedy of larger test sets, which is readily available in
                      most cases.},
      cin          = {INM-7},
      ddc          = {610},
      cid          = {I:(DE-Juel1)INM-7-20090406},
      pnm          = {525 - Decoding Brain Organization and Dysfunction
                      (POF4-525)},
      pid          = {G:(DE-HGF)POF4-525},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {33958703},
      UT           = {WOS:000647877800001},
      doi          = {10.1038/s41386-021-01020-7},
      url          = {https://juser.fz-juelich.de/record/892632},
}