% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Journal article: Kambeitz et al. (2025), Nature Mental Health 3(12).
% The fields reportid, cin, ddc, cid, pnm, pid and typ are not standard BibTeX
% fields; they appear to be repository bookkeeping (presumably from the record
% given in the url field) and unknown fields are ignored by bibliography styles.
@ARTICLE{Kambeitz:1049995,
  author       = {Kambeitz, Joseph and Schiffman, Jason and
                  Kambeitz-Ilankovic, Lana and Mittal, Vijay A. and Ettinger,
                  Ulrich and Vogeley, Kai},
  title        = {The empirical structure of psychopathology is represented
                  in large language models},
  journal      = {Nature Mental Health},
  volume       = {3},
  number       = {12},
  issn         = {2731-6076},
  address      = {London},
  publisher    = {Nature Publishing Group UK},
  reportid     = {FZJ-2025-05709},
  pages        = {1482--1492},
  year         = {2025},
  note         = {The original studies analyzed in this work were supported
                  by the National Institute of Mental Health (Grant
                  R01MH112612) to J.S. and the Deutsche Forschungsgemeinschaft
                  (DFG) ET 31/7-1 to U.E. K.V. was supported within the
                  project SIMSUB (Grant 01GP2215) of the German Ministry of
                  Research, Technology and Space (BMFTR). The funders had no
                  role in study design, data collection and analysis, decision
                  to publish or preparation of the manuscript.},
  abstract     = {Clinical assessment and scientific research in psychiatry
                  are largely based on questionnaires that are used to assess
                  psychopathology. The development of large language models
                  (LLMs) offers a new perspective for analysis of the language
                  and terminology on which these questionnaires are based. We
                  used state-of-the-art LLMs to derive numerical
                  representations (‘text embeddings’) of the semantic and
                  sentiment content of items from established questionnaires
                  for the assessment of psychopathology. We compared the
                  pairwise associations between empirical data from
                  cross-sectional studies and text embeddings to test whether
                  the empirical structure of psychopathology can be
                  reconstructed by LLMs. Across four large-scale datasets
                  (n = 1,555, n = 1,099, n = 11,807 and
                  n = 39,755), we found a range of significant
                  correlations between empirical item-pair associations and
                  associations derived from text embeddings (r = 0.18 to
                  r = 0.57, all P < 0.05). Random forest regression
                  models based on semantic or sentiment embeddings predicted
                  empirical item-pair associations with moderate to high
                  accuracy (r = 0.33 to r = 0.81, all P < 0.05).
                  Similarly, empirical clustering of items and grouping to
                  established subdomain scores could be partly reconstructed
                  by text embeddings. Our results demonstrate that LLMs are
                  able to represent substantial components of the empirical
                  structure of psychopathology. Consequently, the integration
                  of LLMs into mental health research has the potential to
                  unlock numerous promising avenues. These may encompass
                  improving the process of developing questionnaires,
                  optimizing generalizability and reducing the redundancy of
                  existing questionnaires or facilitating the development of
                  new conceptualizations of mental disorders.},
  cin          = {INM-3},
  ddc          = {610},
  cid          = {I:(DE-Juel1)INM-3-20090406},
  pnm          = {5251 - Multilevel Brain Organization and Variability
                  (POF4-525)},
  pid          = {G:(DE-HGF)POF4-5251},
  typ          = {PUB:(DE-HGF)16},
  doi          = {10.1038/s44220-025-00527-y},
  url          = {https://juser.fz-juelich.de/record/1049995},
}