% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

@ARTICLE{Dukart:891733,
      author       = {Dukart, Juergen and Weis, Susanne and Genon, Sarah and
                      Eickhoff, Simon B.},
      title        = {{T}owards increasing the clinical applicability of machine
                      learning biomarkers in psychiatry},
      journal      = {Nature Human Behaviour},
      volume       = {5},
      issn         = {2397-3374},
      address      = {London},
      publisher    = {Nature Research},
      reportid     = {FZJ-2021-01703},
      pages        = {431–432},
      year         = {2021},
      abstract     = {Due to a lack of objective biomarkers, psychiatric
                      diagnoses still rely strongly on patient reporting and
                      clinician judgement. The ensuing subjectivity negatively
                      affects the definition and reliability of psychiatric
                      diagnoses1,2. Recent research has suggested that a
                      combination of advanced neuroimaging and machine learning
                      may provide a solution to this predicament by establishing
                      such objective biomarkers for psychiatric conditions,
                      improving the diagnostic accuracy, prognosis and development
                      of novel treatments3. These promises led to widespread
                      interest in machine learning applications for mental
                      health4, including a recent paper that reports a biological
                      marker for one of the most difficult yet momentous questions
                      in psychiatry—the assessment of suicidal behaviour5. Just
                      et al. compared a group of 17 participants with suicidal
                      ideation with 17 healthy controls, reporting high
                      discrimination accuracy using task-based functional magnetic
                      resonance imaging signatures of life- and death-related
                      concepts3. The authors further reported high discrimination
                      between nine ideators who had attempted suicide versus eight
                      ideators who had not. While a laudable effort on a
                      difficult topic, this study unfortunately illustrates some
                      common conceptual and technical issues in the field that
                      limit translation into clinical practice and raise
                      unrealistic hopes when the results are communicated to the
                      general public. From a conceptual point of view, machine
                      learning studies aimed at clinical applications need to
                      carefully consider any decisions that might hamper the
                      interpretation or generalizability of their results.
                      Restriction to an arbitrary setting may be detrimental
                      for machine learning applications by providing overly
                      optimistic results that are unlikely to generalize. As an
                      example, Just et al. excluded more than half of the
                      patients and healthy controls initially enrolled in the
                      study from the main analysis because they lacked the
                      desired functional magnetic resonance imaging effects (a
                      rank accuracy of at least 0.6 based on all 30 concepts).
                      This exclusion introduces a non-assessable bias into the
                      interpretation of the results, in particular when
                      considering that only six of the 30 concepts were selected
                      for the final classification procedure. While Just et al.
                      attempt to address this question by applying the trained
                      classifier to the initially excluded 21 suicidal ideators,
                      they explicitly omit the excluded 24 controls from this
                      analysis, preventing any interpretation of the extent to
                      which the classifier decision is dependent on this initial
                      choice. From a technical point of view, machine
                      learning-based predictions based on neuroimaging data in
                      small samples are intrinsically highly variable, as stable
                      accuracy estimates and high generalizability are only
                      achieved with several hundred participants6,7. The study
                      by Just et al. falls into this category of studies with a
                      small sample size. To estimate the impact of uncertainty on
                      the results by Just et al., we adapted a simulation approach
                      with the code and data kindly provided by the authors,
                      randomly permuting (800 times) the labels across the groups
                      using their default settings and computing the accuracies.
                      These results showed that the $95\%$ confidence interval for
                      classification accuracy obtained using this dataset is about
                      $20\%$ wide, leaving large uncertainty with respect to any
                      potential findings. Special care is also required with
                      respect to any subjective choices in feature and classifier
                      settings or group selection. While ad-hoc selection of a
                      specific setting is subjective, testing different settings
                      and justifying the best-performing one post hoc leads to
                      overfitting, thus limiting the generalizability of any
                      classification. Such overfitting may occur when multiple
                      models or parameter choices are tested with respect to their
                      ability to predict the testing data and only those that
                      perform best are reported. To illustrate this issue, we
                      performed an additional analysis with the code and data
                      kindly provided by Just et al. More specifically, in the
                      code and the manuscript, we identified the following
                      non-exhaustive list of prespecified settings: (1) removal
                      of occipital cortex data; (2) subdivision of clusters larger
                      than 11 mm; (3) selection of voxels with at least four
                      contributing participants in each group; (4) selection of
                      stable clusters containing at least five voxels; (5)
                      selection of the 1,200 most stable features; and (6) manual
                      copying and replacing of a cluster for one control
                      participant. Importantly, according to the publication or
                      code documentation, all of these parameters were chosen ad
                      hoc and for none of these settings was a parameter search
                      performed. We systematically evaluated the effect of each of
                      these choices on the accuracy for differentiation between
                      suicide ideators and controls in the original dataset
                      provided by Just et al. As shown in Fig. 1, each of the six
                      parameters sits at an optimum for differentiation
                      accuracy in this dataset, with any (even minor) change often
                      resulting in substantially lower accuracy estimates.
                      Similarly, data leakage may also contribute to optimistic
                      results when information outside the training set is used to
                      build a prediction model. More generally, whenever human
                      interventions guide the development of machine learning
                      models for the prediction of clinical conditions, a careful
                      evaluation and reporting of the researcher degrees of
                      freedom is essential to avoid data leakage and overfitting.
                      Subsequent sharing of data processing and analysis
                      pipelines, as well as collected data, is a further key step
                      to increase reproducibility and facilitate replication of
                      potential findings.},
      cin          = {INM-7},
      ddc          = {150},
      cid          = {I:(DE-Juel1)INM-7-20090406},
      pnm          = {5254 - Neuroscientific Data Analytics and AI (POF4-525)},
      pid          = {G:(DE-HGF)POF4-5254},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {33820977},
      UT           = {WOS:000636920500001},
      doi          = {10.1038/s41562-021-01085-w},
      url          = {https://juser.fz-juelich.de/record/891733},
}
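
% The abstract describes gauging uncertainty by randomly permuting the group
% labels 800 times and recomputing classification accuracy. Below is a minimal
% sketch of that idea, not the authors' released code: the sample sizes (17 vs
% 17), the Gaussian placeholder features, and the logistic-regression
% classifier are illustrative assumptions only.
%
%   import numpy as np
%   from sklearn.linear_model import LogisticRegression
%   from sklearn.model_selection import LeaveOneOut, cross_val_score
%
%   rng = np.random.default_rng(0)
%   n_per_group, n_features, n_perms = 17, 30, 800
%   X = rng.standard_normal((2 * n_per_group, n_features))  # placeholder data
%   y = np.repeat([0, 1], n_per_group)                      # two groups of 17
%
%   null_acc = np.empty(n_perms)
%   for i in range(n_perms):
%       y_perm = rng.permutation(y)            # break any true group structure
%       clf = LogisticRegression(max_iter=1000)
%       null_acc[i] = cross_val_score(clf, X, y_perm, cv=LeaveOneOut()).mean()
%
%   lo, hi = np.percentile(null_acc, [2.5, 97.5])
%   print(f"95% interval of permuted accuracies: [{lo:.2f}, {hi:.2f}], "
%         f"width {hi - lo:.2f}")              # wide intervals in small samples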
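% The abstract further reports that each of six ad-hoc analysis settings sat
% at an accuracy optimum, with even minor changes lowering accuracy. A minimal
% sketch of such a one-setting-at-a-time sensitivity sweep, under the same
% illustrative assumptions as above; the number of selected features k stands
% in for a prespecified choice such as the "1,200 most stable features":
%
%   import numpy as np
%   from sklearn.feature_selection import SelectKBest, f_classif
%   from sklearn.linear_model import LogisticRegression
%   from sklearn.model_selection import cross_val_score
%   from sklearn.pipeline import Pipeline
%
%   rng = np.random.default_rng(0)
%   X = rng.standard_normal((34, 30))          # placeholder features
%   y = np.repeat([0, 1], 17)
%
%   for k in (3, 6, 9, 12, 15):                # vary one setting, fix the rest
%       pipe = Pipeline([("select", SelectKBest(f_classif, k=k)),
%                        ("clf", LogisticRegression(max_iter=1000))])
%       acc = cross_val_score(pipe, X, y, cv=5).mean()
%       print(f"k={k:2d}: mean CV accuracy {acc:.2f}")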
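% Finally, a minimal sketch of the data-leakage pitfall named in the abstract:
% selecting features on the full dataset before cross-validation lets
% test-fold information into model building and inflates accuracy, whereas
% selection refit inside each training fold does not. Purely illustrative; the
% random features carry no signal, so a leaky estimate above ~0.5 is spurious.
%
%   import numpy as np
%   from sklearn.feature_selection import SelectKBest, f_classif
%   from sklearn.linear_model import LogisticRegression
%   from sklearn.model_selection import cross_val_score
%   from sklearn.pipeline import Pipeline
%
%   rng = np.random.default_rng(1)
%   X = rng.standard_normal((34, 5000))        # many pure-noise features
%   y = np.repeat([0, 1], 17)
%
%   # Leaky: feature selection sees the whole dataset, test folds included.
%   X_sel = SelectKBest(f_classif, k=6).fit_transform(X, y)
%   leaky = cross_val_score(LogisticRegression(max_iter=1000),
%                           X_sel, y, cv=5).mean()
%
%   # Safe: selection is refit on each training fold only.
%   pipe = Pipeline([("select", SelectKBest(f_classif, k=6)),
%                    ("clf", LogisticRegression(max_iter=1000))])
%   safe = cross_val_score(pipe, X, y, cv=5).mean()
%   print(f"leaky {leaky:.2f} vs safe {safe:.2f}")  # leaky is inflated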