% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Chen et al. (2022): record 915909 of the repository given in the url field.
% The fields reportid, cin, cid, pnm, pid and typ are repository metadata;
% bibliography styles silently ignore field names they do not know.
% NOTE(review): journal is set to bioRxiv because the DOI prefix 10.1101 and
% the date-based suffix look like a bioRxiv preprint DOI -- confirm, and
% replace with the final venue once the peer-reviewed version is cited.
@ARTICLE{Chen:915909,
      author       = {Chen, Jianzhong and Ooi, Leon Qi Rong and Li, Jingwei and
                      Asplund, Christopher L. and Eickhoff, Simon B. and Bzdok,
                      Danilo and Holmes, Avram J. and Yeo, B. T. Thomas},
      title        = {There is no fundamental trade-off between prediction
                      accuracy and feature importance reliability},
      journal      = {bioRxiv},
      reportid     = {FZJ-2022-05778},
      year         = {2022},
      abstract     = {There is significant interest in using neuroimaging data to
                      predict behavior. The predictive models are often
                      interpreted by the computation of feature importance, which
                      quantifies the predictive relevance of an imaging feature.
                      Tian and Zalesky (2021) suggest that feature importance
                      estimates exhibit low test-retest reliability, pointing to a
                      potential trade-off between prediction accuracy and feature
                      importance reliability. This trade-off is counter-intuitive
                      because both prediction accuracy and test-retest reliability
                      reflect the reliability of brain-behavior relationships
                      across independent samples. Here, we revisit the
                      relationship between prediction accuracy and feature
                      importance reliability in a large well-powered dataset
                      across a wide range of behavioral measures. We demonstrate
                      that, with a sufficient sample size, feature importance
                      (operationalized as Haufe-transformed weights) can achieve
                      fair to excellent test-retest reliability. More
                      specifically, with a sample size of about 2600 participants,
                      Haufe-transformed weights achieve average intra-class
                      correlation coefficients of 0.75, 0.57 and 0.53 for
                      cognitive, personality and mental health measures
                      respectively. Haufe-transformed weights are much more
                      reliable than original regression weights and univariate
                      FC-behavior correlations. Intriguingly, feature importance
                      reliability is strongly positively correlated with
                      prediction accuracy across phenotypes. Within a particular
                      behavioral domain, there was no clear relationship between
                      prediction performance and feature importance reliability
                      across regression algorithms. Finally, we show
                      mathematically that feature importance reliability is
                      necessary, but not sufficient, for low feature importance
                      error. In the case of linear models, lower feature
                      importance error leads to lower prediction error (up to a
                      scaling by the feature covariance matrix). Overall, we find
                      no fundamental trade-off between feature importance
                      reliability and prediction accuracy.},
      cin          = {INM-7},
      cid          = {I:(DE-Juel1)INM-7-20090406},
      pnm          = {5251 - Multilevel Brain Organization and Variability
                      (POF4-525)},
      pid          = {G:(DE-HGF)POF4-5251},
      typ          = {PUB:(DE-HGF)25},
      doi          = {10.1101/2022.08.08.503167},
      url          = {https://juser.fz-juelich.de/record/915909},
}