% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.

% Journal article: Dagaev et al. (2023), Pattern Recognition Letters, vol. 166.
% Fields after `abstract` that are not standard BibTeX (reportid, cin, ddc,
% cid, pnm, pid, typ, pubmed, UT) are ignored by standard styles and are kept
% verbatim; they look like repository bookkeeping identifiers -- confirm
% before relying on them.
@ARTICLE{Dagaev:1024830,
      author       = {Dagaev, Nikolay and Roads, Brett D. and Luo, Xiaoliang and
                      Barry, Daniel N. and Patil, Kaustubh R. and Love, Bradley
                      C.},
      title        = {A too-good-to-be-true prior to reduce shortcut reliance},
      journal      = {Pattern Recognition Letters},
      volume       = {166},
      issn         = {0167-8655},
      address      = {Amsterdam},
      publisher    = {Elsevier},
      reportid     = {FZJ-2024-02496},
      pages        = {164--171},
      year         = {2023},
      abstract     = {Despite their impressive performance in object recognition
                      and other tasks under standard testing conditions, deep
                      networks often fail to generalize to out-of-distribution
                      (o.o.d.) samples. One cause for this shortcoming is that
                      modern architectures tend to rely on “shortcuts” --
                      superficial features that correlate with categories without
                      capturing deeper invariants that hold across contexts.
                      Real-world concepts often possess a complex structure that
                      can vary superficially across contexts, which can make the
                      most intuitive and promising solutions in one context not
                      generalize to others. One potential way to improve o.o.d.
                      generalization is to assume simple solutions are unlikely to
                      be valid across contexts and avoid them, which we refer to
                      as the too-good-to-be-true prior. A low-capacity network
                      (LCN) with a shallow architecture should only be able to
                      learn surface relationships, including shortcuts. We find
                      that LCNs can serve as shortcut detectors. Furthermore, an
                      LCN’s predictions can be used in a two-stage approach to
                      encourage a high-capacity network (HCN) to rely on deeper
                      invariant features that should generalize broadly. In
                      particular, items that the LCN can master are downweighted
                      when training the HCN. Using a modified version of the
                      CIFAR-10 dataset in which we introduced shortcuts, we found
                      that the two-stage LCN-HCN approach reduced reliance on
                      shortcuts and facilitated o.o.d. generalization.},
      cin          = {INM-7},
      ddc          = {004},
      cid          = {I:(DE-Juel1)INM-7-20090406},
      pnm          = {5251 - Multilevel Brain Organization and Variability
                      (POF4-525) / 5254 - Neuroscientific Data Analytics and AI
                      (POF4-525)},
      pid          = {G:(DE-HGF)POF4-5251 / G:(DE-HGF)POF4-5254},
      typ          = {PUB:(DE-HGF)16},
      pubmed       = {37915616},
      UT           = {WOS:000935348300001},
      doi          = {10.1016/j.patrec.2022.12.010},
      url          = {https://juser.fz-juelich.de/record/1024830},
}