% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Preprint record for Nieto et al. (2024); the url field points to repository
% record 1038496 on juser.fz-juelich.de.
% The title is stored in plain Title Case: it contains no proper nouns or
% acronyms, so no brace protection is needed and the style decides the casing.
% reportid, cin, cid, pnm, pid and typ are not standard BibTeX fields; they
% are presumably repository bookkeeping and are ignored by standard styles.
% NOTE(review): no arXiv identifier is present in this record; add
% eprint / eprinttype fields once the identifier has been confirmed.
@article{Nieto:1038496,
  author   = {Nieto, Nicolas and Eickhoff, Simon and Jung, Christian and
              Reuter, Martin and Diers, Kersten and Kelm, Malte and
              Lichtenberg, Artur and Raimondo, Federico and Patil,
              Kaustubh},
  title    = {Impact of Leakage on Data Harmonization in
              Machine Learning Pipelines in Class Imbalance
              Across Sites},
  journal  = {arXiv},
  reportid = {FZJ-2025-01491},
  year     = {2024},
  abstract = {Machine learning (ML) models benefit from large datasets.
              Collecting data in biomedical domains is costly and
              challenging, hence, combining datasets has become a common
              practice. However, datasets obtained under different
              conditions could present undesired site-specific
              variability. Data harmonization methods aim to remove
              site-specific variance while retaining biologically relevant
              information. This study evaluates the effectiveness of
              popularly used ComBat-based methods for harmonizing data in
              scenarios where the class balance is not equal across sites.
              We find that these methods struggle with data leakage
              issues. To overcome this problem, we propose a novel
              approach PrettYharmonize, designed to harmonize data by
              pretending the target labels. We validate our approach using
              controlled datasets designed to benchmark the utility of
              harmonization. Finally, using real-world MRI and clinical
              data, we compare leakage-prone methods with PrettYharmonize
              and show that it achieves comparable performance while
              avoiding data leakage, particularly in
              site-target-dependence scenarios.},
  cin      = {INM-7},
  cid      = {I:(DE-Juel1)INM-7-20090406},
  pnm      = {5254 - Neuroscientific Data Analytics and AI (POF4-525)},
  pid      = {G:(DE-HGF)POF4-5254},
  typ      = {PUB:(DE-HGF)25},
  doi      = {10.34734/FZJ-2025-01491},
  url      = {https://juser.fz-juelich.de/record/1038496},
}