% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@mastersthesis{Rahmdel:1041549,
  author        = {Rahmdel, Sahand},
  title         = {Exploring Linguistic Proximity in {C4} Multilingual Data
                   through Efficient Embedding Model Analysis and
                   Visualization on {HPC}},
  school        = {Rheinisch-Westfälische Technische Hochschule Aachen},
  type          = {Bachelorarbeit},
  reportid      = {FZJ-2025-02306},
  pages         = {77},
  year          = {2025},
  note          = {Bachelorarbeit, Rheinisch-Westfälische Technische
                   Hochschule Aachen, 2025},
  abstract      = {This thesis investigates the proximity of different
                   languages and different language families by analysing how
                   multilingual text data are represented in a shared latent
                   space, focusing on the Colossal Clean Crawled Corpus (C4)
                   with a multilingual extension (mC4). The main focus is to
                   determine whether embeddings of different languages group
                   together based on their linguistic families, topical
                   content, or both. This is achieved through a
                   high-performance computing (HPC) system to embed 6.1TB of
                   textual data from 24 diverse languages. The BAAI bge-m3
                   embedding model served to create embeddings of dimension
                   1,024, which were stored in a vector database using ChromaDB
                   to facilitate scalable analysis and querying. Subsequent
                   dimensionality reduction with t-distributed Stochastic
                   Neighbor Embedding (t-SNE) allowed for the visualization of
                   language clusters in two-dimensional space for a simpler and
                   better understanding. Results reveal that similar thematic
                   or topical content often drives the embedding model to
                   generate vectors that lie close together, even from
                   different languages. However, certain clusters reflect
                   linguistic closeness—especially among languages from the
                   same family—indicating that the model also recognizes
                   linguistic features. Overall, the thesis uses multilingual
                   embeddings to check the existence of any relation between
                   the semantic representation of texts as vectors (embeddings)
                   and the linguistic structure of the origin languages,
                   demonstrating how HPC resources, combined with advanced
                   embedding models, can efficiently handle large datasets and
                   offer deeper insights into language proximity and topic
                   similarity analysis.},
  cin           = {JSC},
  cid           = {I:(DE-Juel1)JSC-20090406},
  pnm           = {5112 - Cross-Domain Algorithms, Tools, Methods Labs (ATMLs)
                   and Research Groups (POF4-511)},
  pid           = {G:(DE-HGF)POF4-5112},
  typ           = {PUB:(DE-HGF)2},
  doi           = {10.34734/FZJ-2025-02306},
  url           = {https://juser.fz-juelich.de/record/1041549},
  internal-note = {@mastersthesis used because classic BibTeX has no
                   bachelor's-thesis entry type; the "type" field overrides
                   the printed label with "Bachelorarbeit". UTF-8 content
                   requires bibtex8 or biber (see file header).},
}