Overview of leakage scenarios in supervised machine learning

Sasse, L.; Kulkarni, A.; Götz, M.; Komeyer, V.; Patil, Kaustubh R.; Nicolaisen, Eliana; Eickhoff, S. B.; Love, B. C.; Hamdan, S.; Lahnakoski, J. M.; Dukart, Jürgen; Raimondo, F.

doi:10.1186/s40537-025-01193-8

% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article: "Overview of leakage scenarios in supervised machine
% learning", Journal of Big Data 12(1), article no. 135, 2025.
% Accented letters are written as BibTeX special characters (brace group
% starting with an accent command), so the entry itself is pure ASCII and
% sorts/labels correctly under classic BibTeX as well as bibtex8 and biber.
% The fields reportid, cin, ddc, cid, pnm, pid, typ and ut are not standard
% BibTeX fields; standard styles ignore them (presumably repository
% catalogue metadata from the record given in the url field).
@article{Sasse:1043079,
      author       = {Sasse, L. and Nicolaisen, Eliana and Dukart, J{\"u}rgen and
                      Eickhoff, S. B. and G{\"o}tz, M. and Hamdan, S. and Komeyer, V.
                      and Kulkarni, A. and Lahnakoski, J. M. and Love, B. C. and
                      Raimondo, F. and Patil, Kaustubh R.},
      title        = {Overview of leakage scenarios in supervised machine
                      learning},
      journal      = {Journal of Big Data},
      volume       = {12},
      number       = {1},
      issn         = {2196-1115},
      address      = {Heidelberg},
      publisher    = {SpringerOpen},
      reportid     = {FZJ-2025-02765},
      pages        = {135},
      year         = {2025},
      abstract     = {Machine learning (ML) provides powerful tools for
                      predictive modeling. ML's popularity stems from the
                      promise of sample-level prediction with applications across
                      a variety of fields from physics and marketing to
                      healthcare. However, if not properly implemented and
                      evaluated, ML pipelines may contain leakage typically
                      resulting in overoptimistic performance estimates and
                      failure to generalize to new data. This can have severe
                      negative financial and societal implications. Our aim is to
                      expand understanding associated with causes leading to
                      leakage when designing, implementing, and evaluating ML
                      pipelines. Illustrated by concrete examples, we provide a
                      comprehensive overview and discussion of various types of
                      leakage that may arise in ML pipelines.},
      cin          = {INM-7},
      ddc          = {004},
      cid          = {I:(DE-Juel1)INM-7-20090406},
      pnm          = {5254 - Neuroscientific Data Analytics and AI (POF4-525)},
      pid          = {G:(DE-HGF)POF4-5254},
      typ          = {PUB:(DE-HGF)16},
      ut           = {WOS:001498691400001},
      doi          = {10.1186/s40537-025-01193-8},
      url          = {https://juser.fz-juelich.de/record/1043079},
}

guest :: login JuSER
		Search		Submit		Personalize Your alerts Your baskets Your searches		Help