% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like "bibtex8" or
% "biber".

% arXiv preprint: modeled as @misc with eprint/archiveprefix/primaryclass
% (there is no journal, so @article would raise a missing-field warning).
% Institution-specific fields (reportid, cin, cid, pnm, pid, typ) are
% preserved verbatim; standard styles silently ignore unknown fields.
@misc{Sasse:1018240,
  author        = {Sasse, Leonard and Nicolaisen-Sobesky, Eliana and Dukart, Jürgen and Eickhoff, Simon B. and Götz, Michael and Hamdan, Sami and Komeyer, Vera and Kulkarni, Abhijit and Lahnakoski, Juha and Love, Bradley C. and Raimondo, Federico and Patil, Kaustubh R.},
  title         = {On Leakage in {Machine Learning} Pipelines},
  year          = {2023},
  eprint        = {2311.04179},
  archiveprefix = {arXiv},
  primaryclass  = {cs.LG},
  publisher     = {arXiv},
  reportid      = {FZJ-2023-04636},
  abstract      = {Machine learning (ML) provides powerful tools for predictive modeling. ML's popularity stems from the promise of sample-level prediction with applications across a variety of fields from physics and marketing to healthcare. However, if not properly implemented and evaluated, ML pipelines may contain leakage typically resulting in overoptimistic performance estimates and failure to generalize to new data. This can have severe negative financial and societal implications. Our aim is to expand understanding associated with causes leading to leakage when designing, implementing, and evaluating ML pipelines. Illustrated by concrete examples, we provide a comprehensive overview and discussion of various types of leakage that may arise in ML pipelines.},
  keywords      = {Machine Learning (cs.LG) (Other) / Artificial Intelligence (cs.AI) (Other) / FOS: Computer and information sciences (Other)},
  cin           = {INM-7},
  cid           = {I:(DE-Juel1)INM-7-20090406},
  pnm           = {5254 - Neuroscientific Data Analytics and AI (POF4-525)},
  pid           = {G:(DE-HGF)POF4-5254},
  typ           = {PUB:(DE-HGF)25},
  doi           = {10.48550/ARXIV.2311.04179},
  url           = {https://juser.fz-juelich.de/record/1018240},
}