% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
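%
% A minimal usage sketch (the file name "references.bib" and the surrounding
% document are assumptions, not part of this record): with biblatex, select
% biber as the backend so UTF-8 input is handled correctly.
%
%   \documentclass{article}
%   \usepackage[backend=biber]{biblatex}
%   \addbibresource{references.bib}  % this file, under a hypothetical name
%   \begin{document}
%   LAION-5B~\cite{Schuhmann:1020896} is an openly available dataset.
%   \printbibliography
%   \end{document}
%
% Compile with pdflatex, then biber, then pdflatex twice to resolve citations.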
@INPROCEEDINGS{Schuhmann:1020896,
      author       = {Schuhmann, Christoph and Beaumont, Romain and Vencu,
                      Richard and Gordon, Cade and Wightman, Ross and Cherti,
                      Mehdi and Coombes, Theo and Katta, Aarush and Mullis,
                      Clayton and Wortsman, Mitchell and Schramowski, Patrick and
                      Kundurthy, Srivatsa and Crowson, Katherine and Schmidt,
                      Ludwig and Kaczmarczyk, Robert and Jitsev, Jenia},
      title        = {{LAION}-5{B}: {A}n open large-scale dataset for training
                      next generation image-text models},
      volume       = {35},
      address      = {Red Hook, NY},
      publisher    = {Curran Associates, Inc.},
      reportid     = {FZJ-2024-00372},
      isbn         = {9781713871088},
      series       = {Advances in Neural Information Processing Systems},
      pages        = {25278--25294},
      year         = {2022},
      note         = {Also on arXiv: https://doi.org/10.48550/arXiv.2210.08402},
      abstract     = {Groundbreaking language-vision architectures like CLIP and
                      DALL-E proved the utility of training on large amounts of
                      noisy image-text data, without relying on expensive accurate
                      labels used in standard vision unimodal supervised learning.
                      The resulting models showed capabilities of strong
                      text-guided image generation and transfer to downstream
                      tasks, while performing remarkably at zero-shot
                      classification with noteworthy out-of-distribution
                      robustness. Since then, large-scale language-vision models
                      like ALIGN, BASIC, GLIDE, Flamingo and Imagen made further
                      improvements. Studying the training and capabilities of such
                      models requires datasets containing billions of image-text
                      pairs. Until now, no datasets of this size have been made
                      openly available for the broader research community. To
                      address this problem and democratize research on large-scale
                      multi-modal models, we present LAION-5B - a dataset
                      consisting of 5.85 billion CLIP-filtered image-text pairs, of
                      which 2.32B contain English language. We show successful
                      replication and fine-tuning of foundational models like
                      CLIP, GLIDE and Stable Diffusion using the dataset, and
                      discuss further experiments enabled with an openly available
                      dataset of this scale. Additionally, we provide several
                      nearest neighbor indices, an improved web-interface for
                      dataset exploration and subset generation, and detection
                      scores for watermark, NSFW, and toxic content detection.},
      month         = {Nov},
      date          = {2022-11-28},
      organization  = {Conference on Neural Information Processing Systems
                       (NeurIPS 2022), New Orleans, Louisiana (USA),
                       28 Nov 2022 - 9 Dec 2022},
      cin          = {JSC},
      cid          = {I:(DE-Juel1)JSC-20090406},
      pnm          = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                      and Research Groups (POF4-511)},
      pid          = {G:(DE-HGF)POF4-5112},
      typ          = {PUB:(DE-HGF)8 / PUB:(DE-HGF)7},
      doi          = {10.34734/FZJ-2024-00372},
      url          = {https://juser.fz-juelich.de/record/1020896},
}