% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% NOTE(review): repository-specific fields (reportid, cin, ddc, cid, pnm, pid,
% typ) are non-standard and silently ignored by BibTeX styles; kept for the
% originating repository (juser.fz-juelich.de).
@article{Bke:1049992,
  author    = {Böke, Annkathrin and Hacker, Hannah and Chakraborty, Millennia
               and Baumeister-Lingens, Luise and Vöckel, Jasper and Koenig,
               Julian and Vogel, David H. V. and Lichtenstein, Theresa
               Katharina and Vogeley, Kai and Kambeitz-Ilankovic, Lana and
               Kambeitz, Joseph},
  title     = {Observer-Independent Assessment of Content Overlap in Mental
               Health Questionnaires: Large Language Model–Based Study},
  journal   = {JMIR AI},
  volume    = {4},
  issn      = {2817-1705},
  address   = {Toronto, Ont.},
  publisher = {JMIR Publications},
  reportid  = {FZJ-2025-05706},
  pages     = {e79868},
  year      = {2025},
  abstract  = {Background: Mental disorders are frequently evaluated using
               questionnaires, which have been developed over the past decades
               for the assessment of different conditions. Despite the rigorous
               validation of these tools, high levels of content divergence
               have been reported for questionnaires measuring the same
               construct of psychopathology. Previous studies that examined
               the content overlap required manual symptom labeling, which is
               observer-dependent and time-consuming. Objective: In this
               study, we used large language models (LLMs) to analyze content
               overlap of mental health questionnaires in an
               observer-independent way and compare our results with clinical
               expertise. Methods: We analyzed questionnaires from a range of
               mental health conditions, including adult depression (n=7),
               childhood depression (n=15), clinical high risk for psychosis
               (CHR-P; n=11), mania (n=7), obsessive-compulsive disorder
               (n=7), and sleep disorder (n=12). Two different LLM-based
               approaches were tested. First, we used sentence Bidirectional
               Encoder Representations from Transformers (sBERT) to derive
               numerical representations (embeddings) for each questionnaire
               item, which were then clustered using k-means to group
               semantically similar symptoms. Second, questionnaire items were
               prompted to a Generative Pretrained Transformer to identify
               underlying symptom clusters. Clustering results were compared
               to a manual categorization by experts using the adjusted Rand
               index. Further, we assessed the content overlap within each
               diagnostic domain based on LLM-derived clusters. Results: We
               observed varying degrees of similarity between expert-based and
               LLM-based clustering across diagnostic domains. Overall,
               agreement between experts was higher than between experts and
               LLMs. Among the 2 LLM approaches, GPT showed greater alignment
               with expert ratings than sBERT, ranging from weak to strong
               similarity depending on the diagnostic domain. Using GPT-based
               clustering of questionnaire items to assess the content overlap
               within each diagnostic domain revealed a weak (CHR-P: 0.344) to
               moderate (adult depression: 0.574; childhood depression: 0.433;
               mania: 0.419; obsessive-compulsive disorder [OCD]: 0.450; sleep
               disorder: 0.445) content overlap of questionnaires. Compared to
               the studies that manually investigated content overlap among
               these scales, the results of this study exhibited variations,
               though these were not substantial. Conclusions: These findings
               demonstrate the feasibility of using LLMs to objectively assess
               content overlap in diagnostic questionnaires. Notably, the
               GPT-based approach showed particular promise in aligning with
               expert-derived symptom structures. Keywords: GPT; content
               overlap; large language models; questionnaires; sBERT; scales;
               sentence Bidirectional Encoder Representations from
               Transformers; symptom overlap.},
  cin       = {INM-3},
  ddc       = {610},
  cid       = {I:(DE-Juel1)INM-3-20090406},
  pnm       = {5251 - Multilevel Brain Organization and Variability
               (POF4-525)},
  pid       = {G:(DE-HGF)POF4-5251},
  typ       = {PUB:(DE-HGF)16},
  doi       = {10.2196/79868},
  url       = {https://juser.fz-juelich.de/record/1049992},
}