% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Dukart:891733,
author = {Dukart, Juergen and Weis, Susanne and Genon, Sarah and
Eickhoff, Simon B.},
title = {{T}owards increasing the clinical applicability of machine
learning biomarkers in psychiatry},
journal = {Nature Human Behaviour},
volume = {5},
issn = {2397-3374},
address = {London},
publisher = {Nature Research},
reportid = {FZJ-2021-01703},
pages = {431–432},
year = {2021},
abstract = {Due to a lack of objective biomarkers, psychiatric
diagnoses still rely strongly on patient reporting and
clinician judgement. The ensuing subjectivity negatively
affects the definition and reliability of psychiatric
diagnoses [1,2]. Recent research has suggested that a
combination of advanced neuroimaging and machine learning
may provide a solution to this predicament by establishing
such objective biomarkers for psychiatric conditions,
improving the diagnostic accuracy, prognosis and development
of novel treatments3.These promises led to widespread
interest in machine learning applications for mental
health4, including a recent paper that reports a biological
marker for one of the most difficult yet momentous questions
in psychiatry—the assessment of suicidal behaviour5. Just
et al. compared a group of 17 participants with suicidal
ideation with 17 healthy controls, reporting high
discrimination accuracy using task-based functional magnetic
resonance imaging signatures of life- and death-related
concepts [3]. The authors further reported high discrimination
between nine ideators who had attempted suicide versus eight
ideators who had not. While a laudable effort on a
difficult topic, this study unfortunately illustrates some
common conceptual and technical issues in the field that
limit translation into clinical practice and raise
unrealistic hopes when the results are communicated to the
general public. From a conceptual point of view, machine
learning studies aimed at clinical applications need to
carefully consider any decisions that might hamper the
interpretation or generalizability of their results.
Restricting the analysis to an arbitrary setting may be
detrimental for machine learning applications, providing
overly optimistic results that are unlikely to generalize.
As an example, Just et al. excluded more than half of the
patients and healthy controls initially enrolled in the
study from the main analysis because they lacked the desired
functional magnetic resonance imaging effects (a rank
accuracy of at least 0.6 based on all 30 concepts). This
exclusion introduces a non-assessable bias to the
interpretation of the results, in particular when
considering that only six of the 30 concepts were selected
for the final classification procedure. While Just et al.
attempt to address this question by applying the trained
classifier to the initially excluded 21 suicidal ideators,
they explicitly omit the excluded 24 controls from this
analysis, preventing any interpretation of the extent to
which the classifier decision is dependent on this initial
choice. From a technical point of view, machine
learning-based predictions based on neuroimaging data in
small samples are intrinsically highly variable, as stable
accuracy estimates and high generalizability are only
achieved with several hundred participants [6,7]. The study
by Just et al. falls into this category of studies with a
small sample size. To estimate the impact of uncertainty on
the results by Just et al., we adapted a simulation approach
with the code and data kindly provided by the authors,
randomly permuting (800 times) the labels across the groups
using their default settings and computing the accuracies.
These results showed that the $95\%$ confidence interval for
classification accuracy obtained using this dataset spans
about $20\%$, leaving large uncertainty with respect to any
potential findings. Special care is also required with
respect to any subjective choices in feature and classifier
settings or group selection. While ad-hoc selection of a
specific setting is merely subjective, testing different
settings and justifying the chosen one post hoc by its outcome leads to
overfitting, thus limiting the generalizability of any
classification. Such overfitting may occur when multiple
models or parameter choices are tested with respect to their
ability to predict the testing data and only those that
perform best are reported. To illustrate this issue, we
performed an additional analysis with the code and data
kindly provided by Just et al. More specifically, in the
code and the manuscript, we identified the following
non-exhaustive list of prespecified settings: (1) removal
of occipital cortex data; (2) subdivision of clusters larger
than 11 mm; (3) selection of voxels with at least four
contributing participants in each group; (4) selection of
stable clusters containing at least five voxels; (5)
selection of the 1,200 most stable features; and (6) manual
copying and replacing of a cluster for one control
participant. Importantly, according to the publication or
code documentation, all of these parameters were chosen ad
hoc and for none of these settings was a parameter search
performed. We systematically evaluated the effect of each of
these choices on the accuracy for differentiation between
suicide ideators and controls in the original dataset
provided by Just et al. As shown in Fig. 1, each of the six
parameters represents an optimum choice for differentiation
accuracy in this dataset, with any (even minor) change often
resulting in substantially lower accuracy estimates.
Similarly, data leakage may also contribute to optimistic
results when information outside the training set is used to
build a prediction model. More generally, whenever human
interventions guide the development of machine learning
models for the prediction of clinical conditions, a careful
evaluation and reporting of any researcher degrees of
freedom is essential to avoid data leakage and overfitting.
Subsequent sharing of data processing and analysis
pipelines, as well as collected data, is a further key step
to increase reproducibility and facilitate replication of
potential findings.},
cin = {INM-7},
ddc = {150},
cid = {I:(DE-Juel1)INM-7-20090406},
pnm = {5254 - Neuroscientific Data Analytics and AI (POF4-525)},
pid = {G:(DE-HGF)POF4-5254},
typ = {PUB:(DE-HGF)16},
pubmed = {33820977},
UT = {WOS:000636920500001},
doi = {10.1038/s41562-021-01085-w},
url = {https://juser.fz-juelich.de/record/891733},
}
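
% The abstract above describes gauging the stability of accuracy estimates
% in a 17-vs-17 sample by randomly permuting the group labels 800 times and
% recomputing the accuracies. The Python sketch below illustrates that idea
% on synthetic data with a simple leave-one-out nearest-centroid classifier;
% it is not the authors' pipeline, and the data, features and classifier are
% all stand-ins. (BibTeX treats free text between entries as a comment; the
% code avoids the "at" character so no parser mistakes it for a new entry.)

import numpy as np

rng = np.random.default_rng(0)
n_per_group, n_features, n_perms = 17, 6, 800  # 17 vs 17, 800 permutations

X = rng.standard_normal((2 * n_per_group, n_features))  # stand-in features
y = np.repeat([0, 1], n_per_group)                      # group labels

def loo_accuracy(X, y):
    """Leave-one-out accuracy of a nearest-centroid classifier."""
    idx = np.arange(len(y))
    hits = 0
    for i in idx:
        mask = idx != i  # train on everyone except participant i
        c0 = X[mask & (y == 0)].mean(axis=0)
        c1 = X[mask & (y == 1)].mean(axis=0)
        pred = int(np.linalg.norm(X[i] - c1) < np.linalg.norm(X[i] - c0))
        hits += pred == y[i]
    return hits / len(y)

# Distribution of accuracies under randomly permuted (meaningless) labels.
accs = np.array([loo_accuracy(X, rng.permutation(y)) for _ in range(n_perms)])
lo, hi = np.percentile(accs, [2.5, 97.5])
print(f"95% of chance-level accuracies fall in [{lo:.2f}, {hi:.2f}], "
      f"a span of {hi - lo:.2f}")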
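
% The abstract further argues that trying many ad-hoc analysis settings and
% reporting only the best-performing one overfits the test data. The sketch
% below uses hypothetical settings on pure-noise data (not the six settings
% of Just et al.) to show the mechanism: the best of 64 arbitrary "settings"
% scores well above chance even though none carries any real signal.

import numpy as np

rng = np.random.default_rng(1)
n, n_features, n_settings, n_repeats = 34, 6, 64, 200
y = np.repeat([0, 1], n // 2)

single, best_of_many = [], []
for _ in range(n_repeats):
    X = rng.standard_normal((n, n_features))  # pure noise, no group signal
    accs = []
    for _ in range(n_settings):
        w = rng.standard_normal(n_features)   # one arbitrary "setting"
        score = X.dot(w)
        pred = score > np.median(score)
        # allow the label/sign flip, as an outcome-based justification would
        accs.append(max((pred == y).mean(), (pred != y).mean()))
    single.append(accs[0])          # honest: one prespecified setting
    best_of_many.append(max(accs))  # reported: whichever performed best

print(f"one prespecified setting: mean accuracy {np.mean(single):.2f}")
print(f"best of {n_settings} settings:     mean accuracy {np.mean(best_of_many):.2f}")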
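
% Finally, the abstract notes that data leakage (using information from
% outside the training set to build the model) also inflates accuracy. A
% classic instance is selecting features on the full dataset before
% cross-validation. The sketch below contrasts that with selection inside
% each fold, again on pure-noise stand-in data rather than the original
% fMRI features.

import numpy as np

rng = np.random.default_rng(2)
n, n_features, k_keep = 34, 2000, 10
y = np.repeat([0, 1], n // 2)
X = rng.standard_normal((n, n_features))  # pure noise

def loo_with_selection(X, y, select_inside_fold):
    """Leave-one-out nearest-centroid accuracy with feature selection."""
    idx = np.arange(len(y))
    hits = 0
    for i in idx:
        tr = idx != i
        # Score features by correlation with the labels, either on the
        # training fold only (no leakage) or on all data (leakage).
        Xs, ys = (X[tr], y[tr]) if select_inside_fold else (X, y)
        corr = np.abs((Xs - Xs.mean(0)).T.dot(ys - ys.mean()))
        keep = np.argsort(corr)[-k_keep:]  # k most label-correlated features
        c0 = X[tr & (y == 0)][:, keep].mean(axis=0)
        c1 = X[tr & (y == 1)][:, keep].mean(axis=0)
        pred = int(np.linalg.norm(X[i, keep] - c1) < np.linalg.norm(X[i, keep] - c0))
        hits += pred == y[i]
    return hits / len(y)

print(f"selection inside each fold (no leakage): {loo_with_selection(X, y, True):.2f}")
print(f"selection on the full data (leakage):    {loo_with_selection(X, y, False):.2f}")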