% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@INPROCEEDINGS{Wang:1048764,
author = {Wang, Qin and Krajsek, Kai and Scharr, Hanno},
title = {{R}escuing {E}asy {S}amples in {S}elf-{S}upervised
{P}retraining},
publisher = {SCITEPRESS - Science and Technology Publications},
reportid = {FZJ-2025-04879},
isbn = {978-989-758-728-3},
pages = {400--409},
year = {2025},
booktitle = {Proceedings of the 20th International
             Joint Conference on Computer Vision,
             Imaging and Computer Graphics Theory
             and Applications},
abstract = {Many recent self-supervised pretraining methods use
            augmented versions of the same image as samples for their
            learning schemes. We observe that ‘easy’ samples, i.e.
            samples that are too similar to each other after augmentation,
            have only limited value as a learning signal. We therefore
            propose to rescue easy samples and make them harder. To do
            so, we select the top-k easiest samples using cosine
            similarity, strongly augment them, forward-pass them through
            the model, calculate the cosine similarity of the output as a
            loss, and add it to the original loss in a weighted fashion.
            This method can be applied to any contrastive or other
            augmented-pair-based learning method, whether it involves
            negative pairs or not, as it only changes the handling of easy
            positives. This simple but effective approach
            introduces greater variability into such self-supervised
            pretraining processes, significantly increasing
            performance on various downstream tasks, as observed in our
            experiments. We pretrain models of different sizes, i.e.
            ResNet-50, ViT-S, ViT-B, and ViT-L, on ImageNet with
            SimCLR, MoCo v3, or DINOv2 training schemes. Here, e.g., we
            consistently improve ImageNet top-1 accuracy with a linear
            classifier, establishing a new SOTA for this task.},
month = {Feb},
date = {2025-02-26},
organization = {20th International Conference on
Computer Vision Theory and
Applications, Porto (Portugal), 26 Feb
2025 - 28 Feb 2025},
cin = {IAS-8 / JSC},
cid = {I:(DE-Juel1)IAS-8-20210421 / I:(DE-Juel1)JSC-20090406},
pnm = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
and Research Groups (POF4-511) / 5111 - Domain-Specific
Simulation \& Data Life Cycle Labs (SDLs) and Research
Groups (POF4-511) / SLNS - SimLab Neuroscience
(Helmholtz-SLNS)},
pid = {G:(DE-HGF)POF4-5112 / G:(DE-HGF)POF4-5111 /
G:(DE-Juel1)Helmholtz-SLNS},
typ = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
doi = {10.5220/0013167900003912},
url = {https://juser.fz-juelich.de/record/1048764},
}
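% The abstract above outlines the core procedure: pick the top-k easiest
% (most similar) positive pairs by cosine similarity, strongly re-augment
% them, forward-pass them through the model again, and add a weighted
% cosine-similarity term to the original loss. The commented-out Python
% (PyTorch) sketch below illustrates that idea under our own assumptions;
% the names (encoder, strong_augment, k, weight) and the sign convention of
% the extra term are hypothetical and not taken from the authors' code.
%
% import torch
% import torch.nn.functional as F
%
% def rescue_loss(encoder, x1, x2, z1, z2, strong_augment, k=8, weight=0.5):
%     """Extra loss term for the k easiest (most similar) positive pairs."""
%     with torch.no_grad():
%         # 'Easiness' per pair: cosine similarity of the two views' embeddings.
%         sim = F.cosine_similarity(z1, z2, dim=-1)   # shape: (batch,)
%         easiest = sim.topk(k).indices               # indices of the k easiest pairs
%     # Strongly re-augment the easiest samples and forward-pass them again.
%     h1 = encoder(strong_augment(x1[easiest]))
%     h2 = encoder(strong_augment(x2[easiest]))
%     # Cosine similarity of the new outputs forms the extra term; it is negated
%     # here so that minimizing the loss pulls the re-augmented views together,
%     # as is common for positive-pair objectives (an assumption on our part).
%     extra = -F.cosine_similarity(h1, h2, dim=-1).mean()
%     return weight * extra
%
% # Usage inside a generic SimCLR / MoCo v3 / DINOv2-style training step:
% #   loss = base_loss + rescue_loss(encoder, x1, x2, z1, z2, strong_augment)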