% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% NOTE(review): title de-braced (single-letter {X}brace export artifact removed;
% no word here needs case protection), pages collapsed to the single article
% number, initials spaced in "Vogel, David H. V.", and missing inter-sentence
% spaces restored in the structured abstract.
@article{Bke:1049992,
      author       = {Böke, Annkathrin and Hacker, Hannah and Chakraborty,
                      Millennia and Baumeister-Lingens, Luise and Vöckel, Jasper
                      and Koenig, Julian and Vogel, David H. V. and Lichtenstein,
                      Theresa Katharina and Vogeley, Kai and Kambeitz-Ilankovic,
                      Lana and Kambeitz, Joseph},
      title        = {Observer-Independent Assessment of Content Overlap in
                      Mental Health Questionnaires: Large Language Model–Based
                      Study},
      journal      = {JMIR AI},
      volume       = {4},
      issn         = {2817-1705},
      address      = {Toronto, Ont.},
      publisher    = {JMIR Publications},
      reportid     = {FZJ-2025-05706},
      pages        = {e79868},
      year         = {2025},
      abstract     = {Background: Mental disorders are frequently evaluated using
                      questionnaires, which have been developed over the past
                      decades for the assessment of different conditions. Despite
                      the rigorous validation of these tools, high levels of
                      content divergence have been reported for questionnaires
                      measuring the same construct of psychopathology. Previous
                      studies that examined the content overlap required manual
                      symptom labeling, which is observer-dependent and
                      time-consuming. Objective: In this study, we used large
                      language models (LLMs) to analyze content overlap of mental
                      health questionnaires in an observer-independent way and
                      compare our results with clinical expertise. Methods: We
                      analyzed questionnaires from a range of mental health
                      conditions, including adult depression (n=7), childhood
                      depression (n=15), clinical high risk for psychosis (CHR-P;
                      n=11), mania (n=7), obsessive-compulsive disorder (n=7), and
                      sleep disorder (n=12). Two different LLM-based approaches
                      were tested. First, we used sentence Bidirectional Encoder
                      Representations from Transformers (sBERT) to derive
                      numerical representations (embeddings) for each
                      questionnaire item, which were then clustered using k-means
                      to group semantically similar symptoms. Second,
                      questionnaire items were prompted to a Generative Pretrained
                      Transformer to identify underlying symptom clusters.
                      Clustering results were compared to a manual categorization
                      by experts using the adjusted rand index. Further, we
                      assessed the content overlap within each diagnostic domain
                      based on LLM-derived clusters. Results: We observed varying
                      degrees of similarity between expert-based and LLM-based
                      clustering across diagnostic domains. Overall, agreement
                      between experts was higher than between experts and LLMs.
                      Among the 2 LLM approaches, GPT showed greater alignment
                      with expert ratings than sBERT, ranging from weak to strong
                      similarity depending on the diagnostic domain. Using
                      GPT-based clustering of questionnaire items to assess the
                      content overlap within each diagnostic domain revealed a
                      weak (CHR-P: 0.344) to moderate (adult depression: 0.574;
                      childhood depression: 0.433; mania: 0.419;
                      obsessive-compulsive disorder [OCD]: 0.450; sleep disorder:
                      0.445) content overlap of questionnaires. Compared to the
                      studies that manually investigated content overlap among
                      these scales, the results of this study exhibited
                      variations, though these were not substantial. Conclusions:
                      These findings demonstrate the feasibility of using LLMs to
                      objectively assess content overlap in diagnostic
                      questionnaires. Notably, the GPT-based approach showed
                      particular promise in aligning with expert-derived symptom
                      structures. Keywords: GPT; content overlap; large language
                      models; questionnaires; sBERT; scales; sentence
                      Bidirectional Encoder Representations from Transformers;
                      symptom overlap.},
      cin          = {INM-3},
      ddc          = {610},
      cid          = {I:(DE-Juel1)INM-3-20090406},
      pnm          = {5251 - Multilevel Brain Organization and Variability
                      (POF4-525)},
      pid          = {G:(DE-HGF)POF4-5251},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.2196/79868},
      url          = {https://juser.fz-juelich.de/record/1049992},
}