Compare commits

...

179 Commits

Author SHA1 Message Date
GassiGiuseppe
1d23b9cc8b little snippet to trim big dictionaries 2025-10-07 16:05:32 +02:00
GassiGiuseppe
165290162c added tokenano to the init 2025-10-04 19:03:56 +02:00
GassiGiuseppe
502016f843 a new exasperated way to train the BPE, just a wild experiment that could be useful later 2025-10-04 19:03:07 +02:00
GassiGiuseppe
845c63dbef updated tokenano to be easier to read 2025-10-04 19:01:21 +02:00
GassiGiuseppe
bbadd4c521 update cleaning pipeline with a new method to also filter by number of films,
also updated the signature of the pipeline
2025-10-04 19:00:05 +02:00
GassiGiuseppe
c2f9344c82 little test file 2025-10-04 18:58:20 +02:00
GassiGiuseppe
25f3a5d221 Logic to test BPE 2025-10-04 18:58:04 +02:00
Christian Risi
149deb407d added cache directories 2025-10-03 18:01:05 +02:00
Christian Risi
8a21cb1b73 added python analysis 2025-10-03 18:00:52 +02:00
Christian Risi
d2a3dfe90f Fixed bug 2025-10-03 17:59:46 +02:00
GassiGiuseppe
0f95aeb122 toy dictionary for BPE implemented 2025-10-03 16:26:01 +02:00
Christian Risi
0ee6e48004 Fixed the same bug as before, but this time it is correct 2025-10-03 16:09:53 +02:00
Christian Risi
55e0d2ac23 Fixed an encoding bug 2025-10-03 16:08:11 +02:00
Christian Risi
9c5f42153f fixed typos 2025-10-03 15:17:44 +02:00
Christian Risi
c74689d01d Fixed tests to reflect new version of tokenizer 2025-10-03 13:27:38 +02:00
Christian Risi
51f491d033 fixed typos 2025-10-03 13:27:17 +02:00
Christian Risi
c5c0c61f79 Fix of bugs and semantics 2025-10-03 13:26:58 +02:00
Christian Risi
6b9cb7cd35 Modified imports 2025-10-03 13:26:42 +02:00
Christian Risi
e8894504c6 Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00
GassiGiuseppe
845d645348 added some stubs on special_regex_maker 2025-10-03 10:38:35 +02:00
GassiGiuseppe
09f7b39512 test files updated 2025-10-03 01:04:47 +02:00
GassiGiuseppe
070dc1b744 implemented token nano for the BPE encoding/decoding 2025-10-03 01:04:06 +02:00
GassiGiuseppe
8121c75a09 Updated NanoSocratesSplitter to also split tokens in the decode phase 2025-10-03 01:00:36 +02:00
GassiGiuseppe
a5b8692a77 Updated NanoSocratesSpecial to work with TokeNano 2025-10-03 00:59:15 +02:00
GassiGiuseppe
7c935d2700 Update NanoSocratesBPE: corrected a minor bug about dictionary length,
added some comments to make the code clearer
2025-10-03 00:57:19 +02:00
Christian Risi
a1d143187d corrected test to reflect changes in BPE trainer 2025-10-02 20:11:43 +02:00
GassiGiuseppe
0eef2148a9 in NanoSocratesBPE: encode() method rewritten and tested 2025-10-02 12:12:44 +02:00
Christian Risi
856bd8909c Added threshold 2025-10-02 11:02:03 +02:00
Christian Risi
2e595a3a23 Changed training phase to take data directly instead of its encoding 2025-10-02 09:56:44 +02:00
Christian Risi
2194cc7b4f Changed test to use pool trainer 2025-10-02 09:56:05 +02:00
Christian Risi
1eae8582b2 Fixed decoding phase 2025-10-02 09:33:58 +02:00
Christian Risi
eadba1fb82 Corrected test to reflect changes in NanoSocratesBPE 2025-10-02 09:33:47 +02:00
Christian Risi
aa765b4555 Added time checking 2025-10-02 08:48:45 +02:00
Christian Risi
17d82f0a4e Added support to resume workload 2025-10-02 08:48:28 +02:00
Christian Risi
0975c19e69 added new method to encode from a list of tokens 2025-10-02 08:48:13 +02:00
Christian Risi
3fe4e45ceb Fixed a bug while joining frequencies 2025-10-02 01:50:37 +02:00
Christian Risi
d19426fa62 added multithreaded training to package 2025-10-02 01:31:05 +02:00
Christian Risi
63baf29805 Added multithreaded training 2025-10-02 01:30:24 +02:00
Christian Risi
b80b4e4112 Fixed returning type hints 2025-10-02 01:29:57 +02:00
Christian Risi
7cfaf601b4 Refactored to remove tokens that can't be compressed anymore 2025-10-01 19:42:22 +02:00
Christian Risi
fbbe6226bb Finished uploading stubs for TokeNano 2025-10-01 18:56:53 +02:00
Christian Risi
b3d444979f Added flag to resume work correctly 2025-10-01 12:22:09 +02:00
Christian Risi
66bcf6e55f Added a way to recover iteration work 2025-10-01 12:21:42 +02:00
Christian Risi
dbf1d99408 Added json utils to save and load json files 2025-10-01 12:20:59 +02:00
Christian Risi
97bac464f3 Fixed JSON incompatibility 2025-10-01 00:32:43 +02:00
Christian Risi
9a8e726d74 Added debug configuration 2025-10-01 00:22:22 +02:00
Christian Risi
7ab9b0358e Added script to run BPE 2025-09-30 23:59:09 +02:00
Christian Risi
30c2938d29 Fixed typing 2025-09-30 23:58:54 +02:00
Christian Risi
76f24d4eb0 Renamed file 2025-09-30 23:58:43 +02:00
Christian Risi
89a0a1f4bb Fixed bug for utf-8 conversion 2025-09-30 23:58:31 +02:00
Christian Risi
ccacea18d8 Created files to test BPE training 2025-09-30 13:33:54 +02:00
Christian Risi
b09bd4acba Created trainer to train BPE 2025-09-30 13:33:40 +02:00
Christian Risi
c9032cab09 Added fit method 2025-09-30 13:33:28 +02:00
Christian Risi
7020c9e683 Added utils to make regexps and iterators that check for last element 2025-09-30 13:33:12 +02:00
Christian Risi
2fe1ce9e9a Updated Inits 2025-09-30 13:32:37 +02:00
Christian Risi
18fc2ba9d8 Added Exceptions 2025-09-30 13:32:24 +02:00
Christian Risi
5acee1d1a5 Merge branch 'dev' into dev.bpe 2025-09-30 11:35:27 +02:00
2e36753da4 Merge pull request 'dev.etl' (#5) from dev.etl into dev
Reviewed-on: #5
2025-09-30 11:28:57 +02:00
GassiGiuseppe
007f1e9554 minor updates 2025-09-29 18:53:33 +02:00
GassiGiuseppe
c319398ca0 little update to UML pipeline 2025-09-29 17:03:31 +02:00
GassiGiuseppe
255d8a072d First implementation of the cleaning pipeline UML 2025-09-29 16:59:52 +02:00
GassiGiuseppe
8167c9d435 Added Toy Dataset entry point into the Pipeline class
Before it was forced into the sql_endpoint,
now the whole pipeline can be managed in the Pipeline class
2025-09-29 16:03:49 +02:00
GassiGiuseppe
bd72ad3571 Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00
GassiGiuseppe
6ddb7de9da Added sqlAlchemy to requirements 2025-09-29 15:19:19 +02:00
Christian Risi
564b0d712e Modified UML diagram 2025-09-28 18:05:03 +02:00
Christian Risi
e433941405 Added BPE
TODO:
- complete the fit method
2025-09-28 18:04:44 +02:00
Christian Risi
b46df4f91a Added Special Encoder 2025-09-28 18:03:47 +02:00
Christian Risi
d179e01971 Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
Christian Risi
b071145f6e Added Chunker 2025-09-28 18:02:06 +02:00
Christian Risi
ed0255e99b Updated imports 2025-09-28 18:01:35 +02:00
Christian Risi
3e8b5c5579 Added test for chunker 2025-09-26 18:50:32 +02:00
Christian Risi
8db35732f9 Added Chunker to restrict our domains 2025-09-26 18:50:23 +02:00
Christian Risi
9552d61f8d Added Exception for when we don't find a delimiter 2025-09-26 18:49:56 +02:00
Christian Risi
be8a87ce01 Modified the architecture for BPE 2025-09-26 18:49:29 +02:00
Christian Risi
5801a819e9 Added vars to make it easier to work here 2025-09-26 18:49:06 +02:00
Christian Risi
3f48b5c428 Added text files to test a chunker 2025-09-26 18:48:44 +02:00
Christian Risi
9972ab8a51 Added imports 2025-09-26 18:48:23 +02:00
GassiGiuseppe
650b37c586 Added vscode setting to execute jupyternotebook from root dir 2025-09-26 11:24:34 +02:00
Christian Risi
90012285b5 UML Diagram to explain bpe workflows 2025-09-25 20:18:21 +02:00
Christian Risi
1bbb4a0999 Added new paper 2025-09-25 20:17:48 +02:00
GassiGiuseppe
e521b0704e deleted TODO in path_splitter_tree, as it was already resolved 2025-09-25 19:19:11 +02:00
Christian Risi
ee0aa583d5 Added Docs for BPE research 2025-09-25 19:10:45 +02:00
Christian Risi
0a698e9837 Added schema to extract from DB for BPE 2025-09-25 19:09:52 +02:00
GassiGiuseppe
9440a562f2 Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl 2025-09-25 18:33:51 +02:00
Christian Risi
5eda131aac Fixed creation query to be unique even with movieID in RDFs 2025-09-25 17:58:09 +02:00
GassiGiuseppe
57884eaf2e CSV support added to path_splitter_tree
Also resolved a minor bug to also print leaf nodes
2025-09-25 17:57:46 +02:00
Christian Risi
4548a683c2 Fixed DB 2025-09-25 17:57:45 +02:00
GassiGiuseppe
3eec49ffa5 WIP: added test file: clean_relationship.jupyter
to create a first cleaning pipeline
2025-09-25 16:28:24 +02:00
Christian Risi
0bc7f4b227 Fixed Typos 2025-09-25 12:37:52 +02:00
Christian Risi
f28952b0a2 Added todo 2025-09-25 12:00:26 +02:00
Christian Risi
0b626a8e09 Modified query to take all data 2025-09-25 11:53:12 +02:00
Christian Risi
b254098532 Added views to count for subjects and objects 2025-09-25 11:40:44 +02:00
Christian Risi
ee88ffe4cf Added View to filter over relationship counts 2025-09-25 11:32:03 +02:00
Christian Risi
70b4bd8645 Added Complex query 2025-09-25 11:31:34 +02:00
Christian Risi
6316d2bfc4 Added queries to take data from SQL for dataset 2025-09-25 11:27:19 +02:00
Christian Risi
87ca748f45 Updated DB to reflect new changes 2025-09-24 19:29:57 +02:00
Christian Risi
4315d70109 Merged abbreviation_datawarehouse into datawarehouse 2025-09-24 19:29:43 +02:00
Christian Risi
9a5d633b5e Fixed Typos 2025-09-24 19:29:07 +02:00
Christian Risi
a6760cd52d Updated SQL Queries to support parsing in DB 2025-09-24 19:28:55 +02:00
GassiGiuseppe
a7eb92227d Moved all db queries file in their own folder 2025-09-24 16:44:55 +02:00
GassiGiuseppe
9f221e31cd Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl 2025-09-24 16:32:52 +02:00
GassiGiuseppe
47197194d5 WIP abbreviation_datawarehouse to create an abbreviation system 2025-09-24 16:32:09 +02:00
Christian Risi
0cdbf6f624 Added query to retrieve a dirty dataset from SQLite DB 2025-09-24 16:15:47 +02:00
Christian Risi
3e30489f86 Updated Queries for DB 2025-09-24 14:44:53 +02:00
Christian Risi
8a22e453e4 Fixed csv 2025-09-24 14:44:25 +02:00
Christian Risi
7feb4eb857 Fixed URI generation 2025-09-24 14:44:07 +02:00
Christian Risi
70af19d356 Removed unused imports and added trailing slashes 2025-09-24 14:04:48 +02:00
Christian Risi
a4b44ab2ee Fixed Typos 2025-09-24 14:04:27 +02:00
Christian Risi
74b6b609dd Fixed typos 2025-09-24 13:59:19 +02:00
Christian Risi
59796c37cb Added script to take dbpedia uris 2025-09-24 13:49:29 +02:00
Christian Risi
f696f5950b Added uri-abbreviations 2025-09-24 13:48:53 +02:00
Christian Risi
605b496da7 Added barebone UML diagram for a Cleaning Pipeline 2025-09-23 19:49:01 +02:00
Christian Risi
7d693964dd Added new directories to tree structure 2025-09-23 19:47:56 +02:00
Christian Risi
25f401b577 Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00
Christian Risi
14c5ade230 Added CLI functionalities 2025-09-23 17:57:38 +02:00
4c9c51f902 Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
GassiGiuseppe
63c1a4a160 added little snippet to rebuild db from db_creation.sql 2025-09-22 17:52:23 +02:00
GassiGiuseppe
51114af853 DataRetrivial deleted since it does the same thing as datawarehouse.py 2025-09-22 17:51:35 +02:00
GassiGiuseppe
3a6dca0681 Info about Dataset construction from csv moved
from Python file to Markdown
2025-09-22 17:39:44 +02:00
GassiGiuseppe
346098d2b7 Added query.sql , file with the query used to populate the Dataset 2025-09-22 17:21:32 +02:00
GassiGiuseppe
64f9b41378 Built datawarehouse.py which populates the dataset 2025-09-22 17:17:22 +02:00
GassiGiuseppe
ac1ed42c49 Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything
and instead builds the dataset
2025-09-22 17:11:49 +02:00
GassiGiuseppe
edd01a2c83 Dataset updated, the new one is built with the new method
(50 new rows found ... out of 13 million)
2025-09-22 16:57:06 +02:00
GassiGiuseppe
5aa9e3fcf3 Added in DBPEDIA the query to get Film \ wiki page ID
plus some editing
2025-09-22 15:42:57 +02:00
GassiGiuseppe
0970cabf92 reverse.csv: grammar correction of the header;
it seemed to have misplaced the header also in the middle of the csv
2025-09-22 13:47:20 +02:00
GassiGiuseppe
a26d92750f Update movie-pageid.csv : grammar correction of the header 2025-09-22 12:59:35 +02:00
GassiGiuseppe
34c4782232 Dataset.db update. It seems to be correct 2025-09-20 23:33:56 +02:00
GassiGiuseppe
c5439533e6 DataRetrivial update, without df 2025-09-20 23:32:08 +02:00
GassiGiuseppe
8819b8e87f DataRetrivial populates the db from csv 2025-09-20 19:56:24 +02:00
Christian Risi
1076dc8aa6 Run /Scripts/DataCleaning/SQL_Queries/db_creation.sql 2025-09-20 16:39:16 +02:00
Christian Risi
3d15e03b09 Renamed file to fix spelling 2025-09-20 16:38:38 +02:00
Christian Risi
0ee2ec6fcd Spelling corrections 2025-09-20 16:37:57 +02:00
Christian Risi
95cfa5486c Added instructions to create database schema 2025-09-20 16:30:08 +02:00
GassiGiuseppe
0d30e90ee0 Created file for the db DatawareHouse
Also decided the first schema models in DBMerger
2025-09-20 15:53:32 +02:00
GassiGiuseppe
faaba17a98 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-20 14:34:25 +02:00
Christian Risi
854e5f1d98 Updated file to gather data from wikipedia 2025-09-20 14:32:30 +02:00
GassiGiuseppe
242d7f674f wikipedia summary file uploaded
Dataset composed of PageId and wikipedia Summary
2025-09-20 14:32:25 +02:00
Christian Risi
de8c2afceb Added reconciliation 2025-09-19 22:22:09 +02:00
Christian Risi
f89dffff75 Created script to gather wikipedia abstracts 2025-09-19 19:01:38 +02:00
GassiGiuseppe
e39bad8348 Added Troubleshooting section to README
which corrects some potential issues with git and big files
2025-09-19 13:39:56 +02:00
GassiGiuseppe
7a1a221017 update of the movie-pageid database,
which has the film URI as subject and the wikipage ID as object
2025-09-19 13:37:56 +02:00
Christian Risi
fafe6ae0f9 Modified tree structure with more TMP directories 2025-09-19 12:46:31 +02:00
Christian Risi
e32444df75 Updated fetchdata to be used in terminal
Changes:
  - now you can use it as if it were a cli command

Missing:
  - documentation
2025-09-19 12:35:15 +02:00
Christian Risi
b74b7ac4f0 Added new directories to make experiments and updated .gitignore
Changes:
  - Added /Scripts/Experiments/Queries to keep track
      of important queries, once set
  - Added /Scripts/Experiments/Tmp to run quick experiments
      when still unsure while exploring datasets
2025-09-19 08:43:54 +02:00
Christian Risi
22134391d9 Added Scripts/Experiment directory
This directory is to place files to make experiments
2025-09-19 08:41:46 +02:00
Christian Risi
82c9023849 Ignoring Scripts/Experiments files and always tracking .gitkeep files 2025-09-19 08:39:47 +02:00
Christian Risi
00b87e01ea Moved fetchdata.py to reflect working tree
old - ${Proj}/Scripts/fetchdata.py
new - ${Proj}/Scripts/DataGathering/fetchdata.py
2025-09-19 08:37:04 +02:00
Christian Risi
ce3d4bf6c5 Renamed dir from Script to Scripts 2025-09-19 08:31:00 +02:00
GassiGiuseppe
c415b175a0 added reverse.csv with the relations incoming to films 2025-09-18 20:26:51 +02:00
GassiGiuseppe
ec81ea7930 Added file to gather wikipedia abstract from url 2025-09-18 20:26:11 +02:00
GassiGiuseppe
4bb03f86b3 Added file to study the most frequent relationship into a csv triplet 2025-09-18 20:25:25 +02:00
GassiGiuseppe
e5f201f3db DEVELOPMENT markdown file created 2025-09-18 20:24:54 +02:00
GassiGiuseppe
1c715dc569 Typo correction in the markdown 2025-09-18 20:24:11 +02:00
GassiGiuseppe
6686b47328 Added SQL to obtain wikipedia url with movies 2025-09-18 20:23:10 +02:00
GassiGiuseppe
9a5a7d84fd Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-18 19:20:26 +02:00
GassiGiuseppe
9678ece9c0 Requirements changed
added Pandas and some others
2025-09-18 19:07:38 +02:00
Christian Risi
67bcd732b5 Updated movies 2025-09-18 18:36:52 +02:00
Christian Risi
1a4f900500 Updated git attributes 2025-09-18 18:36:42 +02:00
Christian Risi
ca8729b67c Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-18 18:36:30 +02:00
GassiGiuseppe
9dbffc52ed Added dataset of movies and their wikipedia's page link 2025-09-18 18:16:51 +02:00
Christian Risi
b7f504942a Created Dataset 2025-09-18 17:24:08 +02:00
Christian Risi
7f0c5ce8d3 Updated File for fetching 2025-09-18 17:23:56 +02:00
Christian Risi
9838e287a4 Updated file 2025-09-18 12:03:09 +02:00
Christian Risi
ca6143ea3c Updated Query histories 2025-09-18 11:46:32 +02:00
Christian Risi
16e7ab4d9f Modified Datasets 2025-09-17 17:30:51 +02:00
Christian Risi
28723ab662 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-17 17:06:16 +02:00
Christian Risi
3e59efcf33 Generated datasets 2025-09-17 17:06:14 +02:00
Christian Risi
7c04309cc1 Added script to fetch data from DBPedia 2025-09-17 17:05:27 +02:00
Christian Risi
db87295890 Added history of queries 2025-09-17 17:04:58 +02:00
GassiGiuseppe
61568200a8 README update with setup chapter
which scripts the commands to manage conda and pip
2025-09-17 16:50:50 +02:00
Christian Risi
8df2736b97 Added environments 2025-09-17 16:16:58 +02:00
Christian Risi
eb5b7f629a Conda env 2025-09-17 15:53:17 +02:00
Christian Risi
79232b391e First SparQL query 2025-09-17 14:26:37 +02:00
Christian Risi
72eb937b47 Fixed Markdown violations 2025-09-17 12:51:14 +02:00
Christian Risi
cececa14ce Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-17 12:48:34 +02:00
Christian Risi
2487d44abd Added SparQL 2025-09-17 12:48:33 +02:00
GassiGiuseppe
553b86cac2 Resources file updated with Byte-Pair Encoding
a technique we will use to tokenize the ingress words
2025-09-17 12:06:01 +02:00
Christian Risi
12bd781fd3 Added workspace recommendations 2025-09-17 11:38:23 +02:00
Christian Risi
463f4907b8 Added Resources documentation 2025-09-17 11:36:02 +02:00
96 changed files with 8218 additions and 2 deletions

1
.gitattributes vendored
View File

@ -1,2 +1,3 @@
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
Assets/** filter=lfs diff=lfs merge=lfs -text
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text

7
.gitignore vendored
View File

@ -189,7 +189,8 @@ ipython_config.py
.LSOverride
# Icon must end with two \r
Icon
Icon
# Thumbnails
._*
@ -251,3 +252,7 @@ $RECYCLE.BIN/
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# ---> Custom
**/Tmp/**
**/cache/**
!**/.gitkeep

14
.vscode/extensions.json vendored Normal file
View File

@ -0,0 +1,14 @@
{
"recommendations": [
"bierner.github-markdown-preview",
"bierner.markdown-checkbox",
"bierner.markdown-emoji",
"bierner.markdown-footnotes",
"bierner.markdown-mermaid",
"bierner.markdown-preview-github-styles",
"bierner.markdown-yaml-preamble",
"davidanson.vscode-markdownlint",
"kejun.markdown-alert",
"yzhang.markdown-all-in-one"
]
}

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": "${command:pickArgs}"
}
]
}

55
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,55 @@
{
// Always treat the project root as the working dir for Jupyter
"jupyter.notebookFileRoot": "${workspaceFolder}",
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
"python.terminal.executeInFileDir": false,
// Start new integrated terminals at the project root
"terminal.integrated.cwd": "${workspaceFolder}",
// Make pytest run from the root without needing a pytest.ini
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}",
"python.testing.pytestArgs": [
"src/test"
],
// Help Pylance resolve imports like `from src...` without red squiggles
"python.analysis.extraPaths": [
"${workspaceFolder}"
],
// For linux
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
// For OSX
"terminal.integrated.env.osx": {
"PYTHONPATH": "${workspaceFolder}"
},
// For Windows
"terminal.integrated.env.windows": {
"PYTHONPATH": "${workspaceFolder}"
},
"python.analysis.typeCheckingMode": "standard"
}
// {
// // Always treat the project root as the working dir for Jupyter
// "jupyter.notebookFileRoot": "${workspaceFolder}",
//
// // When you click "Run Python File in Terminal", DON'T cd into the file's folder
// "python.terminal.executeInFileDir": false,
//
// // Start new integrated terminals at the project root
// "terminal.integrated.cwd": "${workspaceFolder}",
//
// // Ensure Python can import from the project root no matter which file you run
// // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
// "terminal.integrated.env.windows": {
// "PYTHONPATH": "${workspaceFolder}"
// },
//
// // Make pytest run from the root without needing a pytest.ini
// "python.testing.pytestEnabled": true,
// "python.testing.cwd": "${workspaceFolder}",
// "python.testing.pytestArgs": ["src/test"],
//
// // Help Pylance resolve imports like `from src...` without red squiggles
// "python.analysis.extraPaths": ["${workspaceFolder}"]
// }

BIN
Assets/Dataset/1-hop/dataset.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:331d8ef4e99c5200f1323e7149bd8aade39dc17ee5778b553bb32c593ff601cf
size 2443211793

BIN
Assets/Dataset/1-hop/movie-pageid.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:88e387ed1338bdfd34ded22f3f8bebb2be5127857bf36fcffc266b35c534587c
size 10148507

BIN
Assets/Dataset/1-hop/movies.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:8d81c8801ea79bd46747769a288cd0c507b3b94b2fb4bbb9605e282776ca5efb
size 8808636

BIN
Assets/Dataset/1-hop/reverse.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:b4878aed66c382e73982b19fa02129d5b3c3e3e8690c28e4dd662257e1d9b119
size 32343972

BIN
Assets/Dataset/1-hop/uri-abbreviations.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:c1fcb1ad61a69145145c45c639ab42b36ffc63caa0ef9832eb81491197883ff4
size 8086

BIN
Assets/Dataset/1-hop/wikipedia-movie.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:1730dc111c0290b16d094a4b6a6577d966978d97ee9ef4202e86148cc9d8e8e8
size 17445736

BIN
Assets/Dataset/1-hop/wikipedia-summary.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:ef7b680257f16b193a9b4ea2914564b58c676955809e6b9d58058adaab7855c1
size 73089553

BIN
Assets/Dataset/DatawareHouse/dataset.db (Stored with Git LFS) Normal file

Binary file not shown.

View File

BIN
Assets/Model/toy_10/README.md (Stored with Git LFS) Normal file

Binary file not shown.

BIN
Assets/Model/toy_10/toy_dictionary.json (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,4 @@
from abc import ABC
class Encoder(ABC):
pass

View File

@ -0,0 +1,164 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTraineRam:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
def trainBPE(
self,
path: Path,
bpe: NanoSocratesBPE | None = None,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
current_iteration = 0
data = self.__gather_data_from_file(path)
while not exit:
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
last_memory = None
_, data, last_memory = self.__round_train(BPE, data)
NEW_VOC_SIZE = BPE.vocabulary_size
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size}\n",
f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None)
f"\tvocabulary:\n{BPE.vocabulary}",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
DATA_LEN = len(data)
NEW_DATA = []
counter = 0
memory = NanoSocratesBatchMemoryBPE({}, 0)
while len(data) > 0:
counter += 1
last_batch = len(data) == 1
piece = data.pop()
bpe, memory, output = bpe.fit(piece, memory, last_batch)
if counter % int(1E6) == 0:
print(f"Fitted: {counter}/{DATA_LEN}")
if len(output) < 2:
continue
NEW_DATA.append(output)
return (bpe, NEW_DATA, memory)
def __gather_data_from_file(self, path: Path) -> list[list[int]]:
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
DATA: list[list[int]] = []
FILE = open(path, "r", encoding="utf-8")
file_string = FILE.read()
FILE.close()
for piece, type in SPLITTER.split_text(file_string):
if type != TokenType.BPE:
continue
int_list = self.__make_list_ids(piece)
DATA.append(int_list)
return DATA
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str):
return list(corpus.encode("utf-8"))

View File

@ -0,0 +1,248 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTrainer:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
chunk_size: int,
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__chunk_size = chunk_size
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
def trainBPE(
self,
path: Path,
cache_dir: Path,
bpe: NanoSocratesBPE | None = None,
resume_from_iter: int = 0,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if not cache_dir.is_dir():
raise NotADirectoryError()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
cached = False
current_iteration = 0
input_path = path
NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0
PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)
if resume_from_iter != 0:
cached = True
current_iteration = resume_from_iter
input_path = next(PATH_GEN)
# UGLY: fixes a bug immediately, unfortunately
_, _ = next(MEMORY_PATH_GEN)
_, voc_cache_path = next(MEMORY_PATH_GEN)
vocabulary = load_nanos_vocabulary(voc_cache_path)
BPE = NanoSocratesBPE(vocabulary)
while not exit:
out_path = next(PATH_GEN)
internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
FILE = open(out_path, "w")
last_memory = None
for _, memory, output in self.__round_train(input_path, BPE, cached):
last_memory = memory
FILE.write(output)
FILE.close()
internal_cache = {
"finished_iter": current_iteration,
"read_from": f"{input_path}",
"wrote_to": f"{out_path}",
"at": datetime.datetime.now(datetime.timezone.utc).strftime(
"%Y-%m-%d %H:%M:%S.%f"
)[:-3],
}
VOCABULARY = BPE.vocabulary
save_json(internal_cache, internal_cache_path)
save_nanos_vocabulary(VOCABULARY, vocabulary_cache)
cached = True
input_path = out_path
NEW_VOC_SIZE = BPE.vocabulary_size
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size}\n",
f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None)
f"\tvocabulary:\n{BPE.vocabulary}",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):
CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
BPE = bpe
memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)
CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))
for chunk, last_chunk in CHUNKER_GENERATOR:
PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))
for piece, last_piece in PIECE_GENERATOR:
LAST_BATCH = last_chunk and last_piece
PIECE, TOKEN_TYPE = piece
if TOKEN_TYPE != TokenType.BPE:
_, _, out = BPE.fit([], memory, LAST_BATCH)
yield (BPE, memory, PIECE)
continue
PIECE_DATA = self.__make_list_ids(PIECE, cached)
_, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)
OUT_STRING = f"{out}"
yield (BPE, memory, OUT_STRING)
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str, cached: bool):
if not cached:
return list(corpus.encode("utf-8"))
REDUCED_CORPUS_LEN = len(corpus) - 1
# Skip these chars "[" "]"
INTS = corpus[1:REDUCED_CORPUS_LEN]
INT_LIST = list(map(int, INTS.split(",")))
return INT_LIST
def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):
CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"
switch = True
if initial_iteration % 2 == 1:
switch = False
del initial_iteration
while True:
if switch:
yield CORPUS_TMP_1
else:
yield CORPUS_TMP_2
switch = not switch
def __switch_memory(self, cache_path: Path, initial_iteration: int):
INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"
VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
VOCAB_TMP_2 = cache_path / "voc-tmp2.json"
switch = False
if initial_iteration % 2 == 1:
switch = True
del initial_iteration
while True:
if switch:
yield (INTERNAL_TMP_1, VOCAB_TMP_1)
else:
yield (INTERNAL_TMP_2, VOCAB_TMP_2)
switch = not switch

View File

@ -0,0 +1,280 @@
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
def split(a, n):
k, m = divmod(len(a), n)
return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):
bpe, data = object
NEW_DATA: list[list[int]] = []
memory = NanoSocratesBatchMemoryBPE({}, 0)
while len(data) > 0:
piece = data.pop()
bpe, memory, output = bpe.fit(piece, memory, False)
if len(output) < 2:
continue
# We are sure of its type
NEW_DATA.append(piece) # type: ignore
return (bpe, NEW_DATA, memory)
def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):
bpe, data = object
NEW_DATA: list[list[int]] = []
for index, piece in zip(range(0, len(data)), data):
output = bpe.encode_intermediate(piece)
if len(output) < 2:
continue
# We are sure of its type
NEW_DATA.append(data[index]) # type: ignore
return NEW_DATA
class NanoSocraTrainerPool:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
# TODO: add a resume function
def trainBPE(
self,
path: Path,
cache_file: Path,
bpe: NanoSocratesBPE | None = None,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if not cache_file.is_file():
file = cache_file.open("w")
file.close()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
current_iteration = 0
data = self.__gather_data_from_file(path)
data = self.__encode_from_cache(BPE, data)
while not exit:
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
last_memory = None
start = time.time_ns()
_, data, last_memory = self.__round_train(BPE, data)
end = time.time_ns()
NEW_VOC_SIZE = BPE.vocabulary_size
VOCABULARY = BPE.vocabulary
save_nanos_vocabulary(VOCABULARY, cache_file)
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
f"\tTime elapsed: {(end - start)/1E9}s",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
NEW_DATA: list[list[int]] = []
MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)
fit_funct = split_fit
CPU_COUNT = os.process_cpu_count()
if CPU_COUNT is None:
raise Exception()
VOCABULARY = bpe.vocabulary
data_chunks = split(data, CPU_COUNT)
JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]
JOB_RESULTS: list[
tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
]
with Pool() as pool:
JOB_RESULTS = pool.map(fit_funct, JOBS)
for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
_, job_output, job_memory = res
NEW_DATA.extend(job_output)
for key, value in job_memory.frequencies.items():
frequency = MEMORY.frequencies.get(key)
if frequency is None:
frequency = 0
MEMORY.frequencies[key] = 0
frequency += value
MEMORY.frequencies[key] = frequency
del job_output
del job_memory
print(f"Joined {i + 1} out of {CPU_COUNT}")
# Get new token
bpe.fit([], MEMORY, True)
print(f"Sentences from {len(data)} to {len(NEW_DATA)}")
return (bpe, NEW_DATA, MEMORY)
def __gather_data_from_file(self, path: Path) -> list[list[int]]:
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
DATA: list[list[int]] = []
FILE = open(path, "r", encoding="utf-8")
file_string = FILE.read()
FILE.close()
for piece, type in SPLITTER.split_text(file_string):
if type != TokenType.BPE:
continue
int_list = self.__make_list_ids(piece)
DATA.append(int_list)
return DATA
def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):
NEW_DATA : list[list[int]]= []
CPU_COUNT = os.process_cpu_count()
if CPU_COUNT is None:
raise Exception()
VOCABULARY = bpe.vocabulary
data_chunks = split(data, CPU_COUNT)
JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]
JOB_RESULTS: list[list[list[int]]]
with Pool() as pool:
JOB_RESULTS = pool.map(split_encode, JOBS)
for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
job_output = res
NEW_DATA.extend(job_output)
del job_output
print(f"Joined {i + 1} out of {CPU_COUNT}")
print(f"Sentences from {len(data)} to {len(NEW_DATA)}")
return NEW_DATA
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str):
return list(corpus.encode("utf-8"))

View File

@ -0,0 +1,219 @@
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException
# ABOUT THE DICTIONARY:
# the string is converted into UTF-8 bytes, that is: each char is represented by 1 to 4 bytes.
# each byte gets cast into an integer; if an integer has a value lower than 256,
# then it represents a UTF-8 char byte, otherwise it is a token-ID.
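# A tiny illustration of the scheme above: the string "ab" becomes the UTF-8
# byte IDs [97, 98]; once the pair (97, 98) is learned as token 256, the same
# string encodes to [256]. IDs below 256 are raw bytes, IDs from 256 upwards
# are learned merges (the same toy vocabulary is used by the tests further down).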
class NanoSocratesBatchMemoryBPE:
"""Memory to batch training. Keeps token couple frequencies, and merge_treshold"""
def __init__(
self, frequencies: dict[tuple[int, int], int], merge_treshold: int
) -> None:
self.frequencies = frequencies
self.merge_treshold = merge_treshold
class NanoSocratesBPE(Encoder):
def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
super().__init__()
self.__vocabulary: dict[tuple[int, int], int] = {}
self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}
if vocabulary is None:
return
for key, value in vocabulary.items():
if value < 256:
raise OutOfDictionaryException()
# values under 256 are used for unpaired chars (raw bytes)
# TODO: check if they are in order
self.__vocabulary[key] = value
self.__reverse_vocabulary[value] = key
@property
def vocabulary_size(self):
return len(self.__vocabulary) + 256
@property
def vocabulary(self):
return self.__vocabulary
@property
def __next_id(self) -> int:
"""
Gets the next token ID
Returns:
int:
"""
return self.vocabulary_size
# TODO: implement fit
def fit(
self,
chunk_data: list[int],
memory: NanoSocratesBatchMemoryBPE,
last_batch: bool,
):
ENCODED_CHUNK = self.encode_intermediate(chunk_data)
DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
# update the frequency of each couple of elements
for i in range(0, DATA_LEN_BEFORE_LAST):
CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])
frequency = memory.frequencies.get(CANDIDATE_COUPLE)
# Initialize frequency
if frequency is None:
frequency = 0
memory.frequencies[CANDIDATE_COUPLE] = 0
frequency += 1
memory.frequencies[CANDIDATE_COUPLE] = frequency
if not last_batch:
return (self, memory, ENCODED_CHUNK)
if len(memory.frequencies) < 1:
return (self, memory, ENCODED_CHUNK)
FREQUENCIES = memory.frequencies
MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
FREQUENCY = FREQUENCIES[MAX_COUPLE]
if FREQUENCY < memory.merge_treshold:
return (self, memory, ENCODED_CHUNK)
self.__learn_word(MAX_COUPLE)
return (self, memory, ENCODED_CHUNK)
def encode(self, piece: str) -> list[int]:
"""Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate()
Args:
piece (str):
Returns:
list[int]:
"""
converted_piece = list(piece.encode("utf-8"))
return self.encode_intermediate(converted_piece)
def encode_intermediate(self, piece: list[int]) -> list[int]:
"""Encode a piece (as list of integer) till its maximum
Args:
piece (list[int]): piece to encode
Returns:
list[int]: piece encoded
"""
current_piece = piece
new_piece = self.__round_encode(current_piece)
# keep encoding while each round still shrinks the piece
while len(current_piece) != len(new_piece):
current_piece = new_piece
new_piece = self.__round_encode(current_piece)
return current_piece
def __round_encode(self, piece: list[int]):
"""A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n
1) "ABAB" -> "XX"
2) "XX" -> "Y"
Args:
piece (list[int]): the object to encode as a list of integer
Returns:
(list[int]): the piece after one encoding round
"""
if len(piece) == 1:
return piece
PIECE_LENGTH = len(piece) - 1
NEW_PIECE: list[int] = []
index = 0
while index < PIECE_LENGTH:
CANDIDATE_WORD = (
piece[index],
piece[index + 1],
) # take a tuple of consecutive element [int]
CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)
# if no token to substitute the tuple, append the first element
if CANDIDATE_TOKEN is None:
NEW_PIECE.append(piece[index])
index += 1
# if the latter element of the tuple is the last element of the piece, append it
if index == PIECE_LENGTH:
NEW_PIECE.append(piece[index])
continue
# in this case there was a candidate token to substitute the couple of elements
NEW_PIECE.append(CANDIDATE_TOKEN)
index += 2
if index == PIECE_LENGTH:
NEW_PIECE.append(piece[index])
return NEW_PIECE
# TODO: Remake decode to take a list of token IDs
def decode(self, token_ids: list[int]) -> str:
# deque: double ended queue
token_stack: deque[int] = deque(token_ids)
UTF_8_STRING_ARR: bytearray = bytearray()
while len(token_stack) > 0:
TOKEN_ID = token_stack.popleft()
if TOKEN_ID < 256:
UTF_8_STRING_ARR.append(TOKEN_ID)
continue
left_token, right_token = self.__token_decode(TOKEN_ID)
token_stack.appendleft(right_token)
token_stack.appendleft(left_token)
return UTF_8_STRING_ARR.decode("utf-8")
def __token_decode(self, token_id: int) -> tuple[int, int]:
CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)
if CANDIDATE_DECODED is None:
raise OutOfDictionaryException()
return CANDIDATE_DECODED
def __learn_word(self, words: tuple[int, int]):
"""learn a new couple of object in the vocabulary
Args:
words (tuple[int, int]): the Pair of element to substitute with a new tokenID
Raises:
DuplicateWordException: it launch if there is a duplicate of the new tokenID in the dictionary
"""
ID = self.__next_id
DUPLICATE = self.__vocabulary.get(words)
if DUPLICATE is not None:
raise DuplicateWordException()
self.__vocabulary[words] = ID
self.__reverse_vocabulary[ID] = words
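A minimal usage sketch for NanoSocratesBPE, reusing the toy vocabulary and the import style of the tests further down in this diff:

import Project_Model.Libs.BPE as BPE

# toy merges: "ab" -> 256, (256, 256) -> 257, (257, 257) -> 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
encoder = BPE.NanoSocratesBPE(VOCABULARY)

encoded = encoder.encode("abababab")  # [258]
assert encoder.decode(encoded) == "abababab"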

View File

@ -0,0 +1,70 @@
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException
class NanoSocratesChunker:
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
self.__max_size: int = max_size
self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = ""
# max theoretical size of chars
# between special tokens:
# - min: size - len(longest_token)
# - MAX: size - len(shortest_token)
def chunk(self, file_path: Path):
# read_file
FILE = open(file_path, "r", encoding="utf-8")
exit = False
while not exit:
REMAINING_SIZE = self.__max_size - len(self.__residual)
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
FILE_CHUNK = FILE.read(READ_SIZE)
if len(FILE_CHUNK) == 0:
exit = True
continue
CHUNK = self.__append_residuals(FILE_CHUNK)
boundaries = self.__identify_boudaries(CHUNK)
if boundaries is None:
# boundaries not found in 2 chunks,
if len(CHUNK) > self.__max_size - 1:
raise DelimiterNotFoundException()
if exit:
yield CHUNK
self.__set_residual(0, CHUNK)
continue
start, end = boundaries
self.__set_residual(end, CHUNK)
yield CHUNK[start:end]
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
end = 0
for match in self.__special_token_regex.finditer(corpus):
# print(match)
end = match.end()
if end == 0:
return None
return (0, end)
def __append_residuals(self, corpus: str) -> str:
RESIDUAL = self.__residual
self.__residual = ""
return RESIDUAL + corpus
def __set_residual(self, index: int, corpus: str):
self.__residual = corpus[index:]

View File

@ -0,0 +1,64 @@
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException
class NanoSocratesSpecial(Encoder):
def __init__(
self, bpe_vocabulary_size: int, special_tokens: list[str] = []
) -> None:
super().__init__()
self.__bpe_offset = bpe_vocabulary_size
self.__vocabulary: dict[str, int] = {}
self.__reverse_vocabulary: dict[int, str] = {}
if len(special_tokens) == 0:
return
for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):
CANDIDATE_ID = self.__bpe_offset + index + 1
self.__vocabulary[TOKEN] = CANDIDATE_ID
self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN
@property
def __next_id(self):
BPE_OFFSET = self.__bpe_offset
VOC_LENGTH = len(self.__vocabulary)
return BPE_OFFSET + VOC_LENGTH + 1
@property
def vocabulary(self) -> dict[str, int]:
return self.__vocabulary
@property
def reverse_vocabulary(self) -> dict[int, str]:
return self.__reverse_vocabulary
def add_special_word_to_vocabulary(self, word: str):
CANDIDATE_INDEX = self.__next_id
self.__vocabulary[word] = CANDIDATE_INDEX
self.__reverse_vocabulary[CANDIDATE_INDEX] = word
def encode(self, word: str) -> list[int]:
ID = self.__vocabulary.get(word)
if ID is None:
raise OutOfDictionaryException()
return [ID]
def decode(self, token_id: list[int]) -> str:
if len(token_id) != 1:
raise OutOfDictionaryException()
ID = token_id[0]
WORD = self.__reverse_vocabulary.get(ID)
if WORD is None:
raise OutOfDictionaryException()
return WORD
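A short sketch of how the special encoder assigns IDs just above the BPE range; the module path follows the import in the Classes __init__ later in this diff:

from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial

# with a BPE vocabulary of size 259 (256 bytes + 3 merges), special IDs start at 260
special = NanoSocratesSpecial(259, ["<SOT>", "<EOT>"])
assert special.encode("<SOT>") == [260]
assert special.decode([261]) == "<EOT>"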

View File

@ -0,0 +1,98 @@
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType
class NanoSocratesSplitter:
def __init__(
self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
) -> None:
# note: the regex is already compiled
self.__special_token_regex = special_token_regex
self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding
def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
"""Split a text using a regex given
Args:
corpus (str): all the corpus string to split
Yields:
Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
"""
bpe_start = 0
bpe_end = len(corpus) # this can be deleted!
for special_token_start, special_token_end in self.__find_boundaries(corpus):
# FIND BPE
bpe_end = special_token_start
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
if BPE_TOKEN_TEXT != "":
for WORD in self.__split_words(BPE_TOKEN_TEXT):
yield (WORD, TokenType.BPE)
# FIND SPECIAL TOKEN
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
if SPECIAL_TOKEN_TEXT != "":
yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
# now save the new bpe start point
# it will be used in the next iteration
bpe_start = special_token_end
def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
"""
Find each time the start and end (not included) of the special token
Args:
corpus (str): the string where the special token will be searched
Yields:
Generator[tuple[int, int]]: Note the end is not included
"""
for match in self.__special_token_regex.finditer(corpus):
start = match.start()
end = match.end()
yield (start, end)
# make the last boundary be the end of corpus
# eof = len(corpus)
# yield(eof,eof)
def __split_words(self, bpe_piece: str) -> Generator[str]:
END_OF_STRING = len(bpe_piece)
bound_start = 0
bound_end = END_OF_STRING + 1
for i in range(0, END_OF_STRING):
CANDIDATE_CHAR = bpe_piece[i]
if CANDIDATE_CHAR != " ":
continue
bound_end = i
yield bpe_piece[bound_start:bound_end]
bound_start = bound_end
bound_end = END_OF_STRING + 1
yield bpe_piece[bound_start:bound_end]
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
not_special_token_list: list[int] = []
for token in corpus:
if token > self.__max_bpe_token_id:
if len(not_special_token_list) > 0:
yield (not_special_token_list, TokenType.BPE)
not_special_token_list = []
yield ([token], TokenType.SPECIAL)
continue
not_special_token_list.append(token)
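A brief sketch of split_text on its own, with a hand-compiled regex standing in for special_regex_maker (defined later in this diff):

import re
import Project_Model.Libs.BPE as BPE

splitter = BPE.NanoSocratesSplitter(re.compile("<SOT>|<EOT>"))
for piece, token_type in splitter.split_text("<SOT>hello world<EOT>"):
    print(piece, token_type)
# prints: "<SOT>" SPECIAL, "hello" BPE, " world" BPE, "<EOT>" SPECIAL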

View File

@ -0,0 +1,8 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TokeNano:
def __init__(self):
pass

View File

@ -0,0 +1,62 @@
from pathlib import Path
from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType
class TokeNanoCore:
def __init__(
self,
bpe_vocabulary: dict[tuple[int, int], int],
special_token_list: list[str],
# special_vocabulary: dict[str, int]
):
self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
SPECIAL_REGEX = special_regex_maker(special_token_list)
BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
self.__special_encoder = NanoSocratesSpecial(
BPE_VOCABULARY_SIZE, special_token_list
)
def encode(self, corpus: str) -> list[int]:
output: list[int] = []
for piece, token_type in self.__splitter.split_text(corpus):
if token_type == TokenType.SPECIAL:
ENCODED_PIECE = self.__special_encoder.encode(piece)
output.extend(ENCODED_PIECE)
continue
# slow but clear
if token_type == TokenType.BPE:
ENCODED_PIECE = self.__bpe_encoder.encode(piece)
output.extend(ENCODED_PIECE)
continue
return output
def decode(self, corpus: list[int]) -> str:
output_str = ""
for token, token_type in self.__splitter.split_tokens(corpus):
# token is an integer if special, a list of integer otherwise
if token_type == TokenType.SPECIAL:
output_str += self.__special_encoder.decode(
token
)
continue
# slow but clear
if token_type == TokenType.BPE:
output_str += self.__bpe_encoder.decode(
token
)
continue
return output_str
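An end-to-end sketch of TokeNanoCore tying the splitter, the BPE encoder and the special encoder together, mirroring the round-trip test further down in this diff:

import Project_Model.Libs.BPE as BPE

VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
SPECIAL_LIST = ["<SOT>", "<EOT>"]

tokenizer = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
encoded = tokenizer.encode("<SOT>abababab<EOT>")  # [260, 258, 261]
assert tokenizer.decode(encoded) == "<SOT>abababab<EOT>"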

View File

@ -0,0 +1,18 @@
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
from .TokeNanoCore import TokeNanoCore
__all__ = [
"NanoSocratesChunker",
"NanoSocratesSplitter",
"NanoSocratesBPE",
"NanoSocraTrainer",
"NanoSocraTraineRam",
"NanoSocraTrainerPool",
"TokeNanoCore"
]

View File

@ -0,0 +1,21 @@
from enum import Enum
class SpecialToken(Enum):
# (Enum, str) -> throws an error
START_TRIPLE_LIST = "<SOTL>"
START_TRIPLE = "<SOT>"
END_TRIPLE = "<EOT>"
SUBJECT = "<SUBJ>"
RELATIONSHIP = "<PRED>"
OBJECT = "<OBJ>"
ABSTRACT = "<ABS>"
CORPUS_END = "<END>"
## Tasks' Token
RDF_TO_TEXT = "<RDF2TXT>"
TEXT_TO_RDF = "<TEXT2RDF>"
CONTINUE_RDF = "<CONTINUERDF>"
MASK = "<MASK>"
# BPE Training:
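One possible wiring (hypothetical, not shown in this diff): the special-token list handed to the trainers and to TokeNanoCore could be built from the enum values; the module path below is assumed.

from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken  # path assumed

SPECIAL_VOCABULARY = [token.value for token in SpecialToken]
# ['<SOTL>', '<SOT>', '<EOT>', '<SUBJ>', '<PRED>', '<OBJ>', '<ABS>', '<END>',
#  '<RDF2TXT>', '<TEXT2RDF>', '<CONTINUERDF>', '<MASK>']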

View File

@ -0,0 +1,6 @@
from enum import Enum, auto
class TokenType(Enum):
SPECIAL = auto()
BPE = auto()

View File

@ -0,0 +1 @@
from .TokenType import TokenType

View File

@ -0,0 +1,4 @@
class DelimiterNotFoundException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class DuplicateWordException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class OutOfDictionaryException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class SentenceTooLongException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,11 @@
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException
__all__ = [
"DelimiterNotFoundException",
"OutOfDictionaryException",
"DuplicateWordException",
"SentenceTooLongException"
]

View File

@ -0,0 +1,13 @@
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
from .special_regex_maker import special_regex_maker
__all__ = [
"special_regex_maker",
"iterator_with_checks",
"save_nanos_vocabulary",
"load_nanos_vocabulary",
"save_json", "load_json"
]

View File

@ -0,0 +1,18 @@
import json
from pathlib import Path
def save_json(dictionary: dict, path: Path):
json_string = json.dumps(dictionary)
FILE = open(path, "w")
FILE.write(json_string)
FILE.close()
def load_json(path: Path) -> dict:
FILE = open(path, "r")
json_string = FILE.read()
FILE.close()
return json.loads(json_string)

View File

@ -0,0 +1,27 @@
from collections import deque
from typing import Generator, TypeVar
T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")
def iterator_with_checks(
generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:
# Here we can ignore to catch stop iteration
# we will propagate it
last_element = next(generator)
while True:
RETURN_ELEMENT = last_element
try:
element = next(generator)
last_element = element
yield (RETURN_ELEMENT, False)
except StopIteration:
yield (RETURN_ELEMENT, True)
break
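A quick sketch of iterator_with_checks: each element is paired with a flag that is True only for the last one (the function is re-exported by the Utils __init__ above):

from Project_Model.Libs.BPE.Utils import iterator_with_checks

pieces = (x for x in ["a", "b", "c"])
print(list(iterator_with_checks(pieces)))
# [('a', False), ('b', False), ('c', True)]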

View File

@ -0,0 +1,15 @@
import re
def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
"""compile a regex for the special token
Args:
special_tokens (list[str]): the list of special token
Returns:
re.Pattern:
"""
REGEX_STR = "|".join(special_tokens)
return re.compile(REGEX_STR)
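A small sketch of special_regex_maker in use; the compiled pattern matches any of the given special tokens and drives both the chunker and the splitter:

from Project_Model.Libs.BPE.Utils import special_regex_maker

PATTERN = special_regex_maker(["<SOT>", "<EOT>"])
print([m.group() for m in PATTERN.finditer("<SOT>abc<EOT>")])
# ['<SOT>', '<EOT>']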

View File

@ -0,0 +1,49 @@
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException
def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:
JSON: dict[str, int] = {}
for key, item in vocabulary.items():
TUPLE_STR = f"{key}"
JSON[TUPLE_STR] = item
return json.dumps(JSON)
def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:
JSON: dict[str, int] = json.loads(json_string)
VOCABULARY: dict[tuple[int, int], int] = {}
for key, item in JSON.items():
REDUCED_KEY = len(key) - 1
KEY_STR = key[1:REDUCED_KEY]
VOC_KEY = tuple(map(int, KEY_STR.split(",")))
if len(VOC_KEY) != 2:
raise OutOfDictionaryException()
# Checked for weird things above
VOCABULARY[VOC_KEY] = item # type: ignore
return VOCABULARY
def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):
json_string = nanos_vocabulary2json_str(vocabulary)
FILE = open(path, "w")
FILE.write(json_string)
FILE.close()
def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
FILE = open(path, "r")
json_string = FILE.read()
FILE.close()
return nanos_json_str2vocabulary(json_string)
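A round-trip sketch for the vocabulary (de)serialization above: tuple keys are stored as their string form and parsed back; the module path follows the Utils __init__ above:

from Project_Model.Libs.BPE.Utils.vocabulary import (
    nanos_vocabulary2json_str,
    nanos_json_str2vocabulary,
)

VOCABULARY = {(97, 98): 256, (256, 256): 257}
JSON_STRING = nanos_vocabulary2json_str(VOCABULARY)
# '{"(97, 98)": 256, "(256, 256)": 257}'
assert nanos_json_str2vocabulary(JSON_STRING) == VOCABULARY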

View File

@ -0,0 +1,9 @@
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *
from . import Classes
from . import Enums
from . import Errors
from . import Utils

View File

@ -0,0 +1 @@
from . import BPE

View File

@ -0,0 +1,74 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
class TestBPE:
def test_bpe_encoding_simple(self):
TEXT = "abababab"
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = [258]
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
ENCODED = BPE_ENCODER.encode(TEXT)
assert len(ENCODED) == len(EXPECTED)
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
def test_bpe_decoding_simple(self):
INPUT = [258]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = "abababab"
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
DECODED = BPE_ENCODER.decode(INPUT)
assert len(DECODED) == len(EXPECTED)
for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected
def test_bpe_decoding_edge_1(self):
INPUT = [258, ord("c")]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = "ababababc"
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
DECODED = BPE_ENCODER.decode(INPUT)
assert len(DECODED) == len(EXPECTED)
for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
# TestBPE().test_bpe_decoding_simple()
TestBPE().test_bpe_encoding_simple()

View File

@ -0,0 +1,77 @@
from pathlib import Path
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")
class TestTrainBPE:
def test_bpe_train_encoding_simple(self):
TRAINER = BPE.NanoSocraTrainerPool(
int(32E3),
["<SOT>", "<EOT>"]
)
TEXT = "abababab"
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt")
EXPECTED = [258]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
BPE_ENCODER = TRAINER.trainBPE(
TEXT_PATH,
CACHE_DIR_PATH
)
ENCODED = BPE_ENCODER.encode(TEXT)
assert len(ENCODED) == len(EXPECTED)
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
def test_bpe_train_encoding_and_decoding(self):
SPECIAL_LIST = ["<ABS>", "<SOTL>"]
TRAINER = BPE.NanoSocraTrainerPool(
int(32E3),
SPECIAL_LIST
)
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
FILE = open(TEXT_PATH)
TEXT = FILE.read()
FILE.close()
EXPECTED = TEXT
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
BPE_ENCODER = TRAINER.trainBPE(
TEXT_PATH,
CACHE_DIR_PATH
)
VOCABULARY = BPE_ENCODER.vocabulary
TOKENANO = BPE.TokeNanoCore(VOCABULARY,SPECIAL_LIST)
ENCODED = TOKENANO.encode(TEXT)
DECODED = TOKENANO.decode(ENCODED)
assert len(DECODED) == len(EXPECTED)
for decoded, expected in zip(DECODED, EXPECTED):
assert decoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
# TestTrainBPE().test_bpe_train_encoding_simple()
TestTrainBPE().test_bpe_train_encoding_and_decoding()

View File

@ -0,0 +1,4 @@
<SOT>Lorem <SEP>ipsu<SEP>m d<SEP>olor s<SEP>it ame<SEP>t,
<SEP>conse<SEP>cte<SEP>tur adip<SEP>iscin<SEP>g elit.
<SEP>Aenean a<SEP>t dui he<SEP>ndrer<SEP>it an<SEP>te soll<SEP>icitud
<SEP>in sce<SEP>lerisque<EOT>

View File

@ -0,0 +1,2 @@
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
<SEP>Aenean at dui <SEP>hendrerit ante <SEP>sollicitudin <SEP>scelerisque<EOT>

View File

@ -0,0 +1,3 @@
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
<SEP>Aenean at dui <SEP>hendrerit an te <SEP>sollicitudin <SEP>scelerisque
<SEP>dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf<EOT>

View File

@ -0,0 +1,89 @@
from pathlib import Path
import re
import pytest
import Project_Model.Libs.BPE as BPE
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)
class TestChunker:
def test_correct_simple(self):
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
LEAST_EXPECTED_CHUNKS = 3
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
CHUNKS = []
for chunk in CHUNKER.chunk(FILE_PATH):
print(chunk)
CHUNKS.append(
chunk
)
NANO_TEXT = "".join(CHUNKS)
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
assert NANO_TEXT == ORIG_TEXT
def test_correct_edge_1(self):
FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt")
LEAST_EXPECTED_CHUNKS = 3
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX)
CHUNKS = []
for chunk in CHUNKER.chunk(FILE_PATH):
print(chunk)
CHUNKS.append(
chunk
)
NANO_TEXT = "".join(CHUNKS)
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
assert NANO_TEXT == ORIG_TEXT
def test_throwing(self):
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)
with pytest.raises(BPE.DelimiterNotFoundException):
for chunk in CHUNKER.chunk(FILE_PATH):
print(chunk)
if __name__ == "__main__":
FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
LEAST_EXPECTED_CHUNKS = 3
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
CHUNKS = []
try:
for chunk in CHUNKER.chunk(FILE_PATH):
print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
CHUNKS.append(
chunk
)
except Exception:
# debug helper: bail out instead of crashing on an expected DelimiterNotFoundException
exit(0)
NANO_TEXT = "".join(CHUNKS)
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
assert NANO_TEXT == ORIG_TEXT

View File

@ -0,0 +1,182 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)
class TestSplitter:
def test_split(self):
TEXT = "<SOT>Lorem <SEP>"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("<SOT>", TokenType.SPECIAL),
("Lorem", TokenType.BPE),
(" ", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_trailing_text(self):
TEXT = "ipsu<SEP>m d<SEP>olor"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("ipsu", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
("m", TokenType.BPE),
(" d", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
# ("olor", TokenType.BPE)
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_multi_token(self):
TEXT = "ipsu<SEP>m d<SEP><SEP><SEP>dsg<SEP>olor"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("ipsu", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
("m", TokenType.BPE),
(" d", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
("<SEP>", TokenType.SPECIAL),
("<SEP>", TokenType.SPECIAL),
("dsg", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_malformed_1(self):
TEXT = "<SEP>lerisque"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("<SEP>", TokenType.SPECIAL),
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_malformed_2(self):
TEXT = "lerisque"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = []
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_token_decode_simple(self):
# test splitting a token id list into SPECIAL and BPE runs
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
token_list = [100, 101, 1477]
CHUNKS = list(SPLITTER.split_tokens(token_list))
EXPECTED_CHUNKS = [
([100, 101], TokenType.BPE),
([1477], TokenType.SPECIAL),
]
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_token_decode_simple_malformed(self):
# test splitting a token id list into SPECIAL and BPE runs
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
token_list = [100, 101, 1477, 100]
CHUNKS = list(SPLITTER.split_tokens(token_list))
EXPECTED_CHUNKS = [
([100, 101], TokenType.BPE),
([1477], TokenType.SPECIAL),
]
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
# Useful to debug weird cases
if __name__ == "__main__":
TestSplitter().test_split_trailing_text()

View File

@ -0,0 +1,21 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TestTokeNano:
def test_decode_encode_simple(self):
TEXT = "<SOT>abababab<EOT>"
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
# EXPECTED = [258]
TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])
ENCODED = TOKE_NANO.encode(TEXT)
DECODED = TOKE_NANO.decode(ENCODED)
assert TEXT == DECODED

View File

View File

@ -0,0 +1 @@
<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>

View File

@ -0,0 +1 @@
<SOT>abababab<EOT>

View File

@ -0,0 +1,695 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "EcT-dGsjmfW571ov8Gg4F",
"type": "text",
"x": 425.5,
"y": 132,
"width": 506,
"height": 425,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"4rCC2-N1thmII8_dwNhe1"
],
"frameId": null,
"index": "a3V",
"roundness": null,
"seed": 523521109,
"version": 883,
"versionNonce": 1590682729,
"isDeleted": false,
"boundElements": [
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow"
}
],
"updated": 1758881654155,
"link": null,
"locked": false,
"text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "74i4oK-JpcM4CgAqhz_x_",
"type": "rectangle",
"x": 382.5,
"y": 104.5,
"width": 592.5,
"height": 421,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"4rCC2-N1thmII8_dwNhe1"
],
"frameId": null,
"index": "a4",
"roundness": {
"type": 3
},
"seed": 50827893,
"version": 319,
"versionNonce": 704459557,
"isDeleted": false,
"boundElements": [],
"updated": 1758878226277,
"link": null,
"locked": false
},
{
"id": "s8I1JoKulE3Vnti9a374p",
"type": "text",
"x": 1113.5,
"y": 127,
"width": 517,
"height": 325,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"M6w9efVFwOZHkJGgwkyEw"
],
"frameId": null,
"index": "a5",
"roundness": null,
"seed": 2091174261,
"version": 480,
"versionNonce": 1964948039,
"isDeleted": false,
"boundElements": [],
"updated": 1758881941367,
"link": null,
"locked": false,
"text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "BY_Why7XDNftdMzPcwjVZ",
"type": "rectangle",
"x": 1086.5,
"y": 105.5,
"width": 593.0000000000001,
"height": 325.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"M6w9efVFwOZHkJGgwkyEw"
],
"frameId": null,
"index": "a6",
"roundness": {
"type": 3
},
"seed": 153939611,
"version": 234,
"versionNonce": 2068149129,
"isDeleted": false,
"boundElements": [
{
"id": "WcDks9DR8UqeZEaxAcRf9",
"type": "arrow"
}
],
"updated": 1758881945661,
"link": null,
"locked": false
},
{
"id": "JCPDhuTKRx4MN950Q3jL-",
"type": "text",
"x": 1116.411067193676,
"y": 477.3809288774704,
"width": 416.74578857421875,
"height": 99.70355731225297,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"DbtlKVF_9SjH2-9iMq9zy"
],
"frameId": null,
"index": "a7",
"roundness": null,
"seed": 1326854235,
"version": 479,
"versionNonce": 595084597,
"isDeleted": false,
"boundElements": [],
"updated": 1758902358518,
"link": null,
"locked": false,
"text": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
"fontSize": 19.940711462450594,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "l-O0rMS3SruV22_MPX9Jz",
"type": "rectangle",
"x": 1086.5,
"y": 451.4580039762846,
"width": 593,
"height": 208.0419960474308,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"DbtlKVF_9SjH2-9iMq9zy"
],
"frameId": null,
"index": "a8",
"roundness": {
"type": 3
},
"seed": 1490898171,
"version": 305,
"versionNonce": 587306139,
"isDeleted": false,
"boundElements": [
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow"
}
],
"updated": 1758902358518,
"link": null,
"locked": false
},
{
"id": "WcDks9DR8UqeZEaxAcRf9",
"type": "arrow",
"x": 773.5,
"y": 167,
"width": 297.17936724485867,
"height": 30,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": {
"type": 2
},
"seed": 1681364149,
"version": 303,
"versionNonce": 1262492265,
"isDeleted": false,
"boundElements": [],
"updated": 1758881945661,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
144.5,
-1.5
],
[
177.5,
-30
],
[
297.17936724485867,
-29.020420978562214
]
],
"lastCommittedPoint": null,
"startBinding": null,
"endBinding": {
"elementId": "BY_Why7XDNftdMzPcwjVZ",
"focus": 0.77319587628866,
"gap": 18.25
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow",
"x": 946.0000000000002,
"y": 274.95951048200493,
"width": 130.016707976343,
"height": 209.36808480159067,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aD",
"roundness": {
"type": 2
},
"seed": 1871768059,
"version": 1039,
"versionNonce": 213535035,
"isDeleted": false,
"boundElements": [],
"updated": 1758902358519,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
54.99999999999977,
12.54048951799507
],
[
69.49999999999977,
188.54048951799507
],
[
130.016707976343,
209.36808480159067
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "EcT-dGsjmfW571ov8Gg4F",
"focus": -0.48312180762055096,
"gap": 14.500000000000114
},
"endBinding": {
"elementId": "l-O0rMS3SruV22_MPX9Jz",
"focus": -0.16742658425737647,
"gap": 11.194126334166185
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "snZ__VDsIlri6NTp8M2Gf",
"type": "text",
"x": -245.25,
"y": 103,
"width": 330,
"height": 125,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aE",
"roundness": null,
"seed": 1758461093,
"version": 265,
"versionNonce": 1069481861,
"isDeleted": false,
"boundElements": [],
"updated": 1758879566916,
"link": null,
"locked": false,
"text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "PnbmqwEWYkP8oXElKFyTp",
"type": "text",
"x": -237.75,
"y": 544,
"width": 561,
"height": 125,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 501304683,
"version": 241,
"versionNonce": 1306401003,
"isDeleted": false,
"boundElements": [],
"updated": 1758878748210,
"link": null,
"locked": false,
"text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "xR_11IzgXX5O-m6WoRfCL",
"type": "text",
"x": -233.25,
"y": 366.5,
"width": 165,
"height": 75,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aI",
"roundness": null,
"seed": 2025585125,
"version": 395,
"versionNonce": 1799178985,
"isDeleted": false,
"boundElements": [],
"updated": 1758883940168,
"link": null,
"locked": false,
"text": "enum TokenType:\n + SPECIAL\n + BPE",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "enum TokenType:\n + SPECIAL\n + BPE",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "lgKSd9qCb94-5e8rd9I3r",
"type": "text",
"x": -219.75,
"y": 764.5,
"width": 462,
"height": 275,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aJ",
"roundness": null,
"seed": 1963214021,
"version": 464,
"versionNonce": 1104453739,
"isDeleted": false,
"boundElements": [],
"updated": 1759053302739,
"link": null,
"locked": false,
"text": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "DwFJoUpVT2YAEe9qPYAXa",
"type": "text",
"x": 496.75,
"y": 666,
"width": 440,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 1317596203,
"version": 152,
"versionNonce": 1840679687,
"isDeleted": false,
"boundElements": [],
"updated": 1758880107704,
"link": null,
"locked": false,
"text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "78gC46xatoO1_cRtaN8EC",
"type": "text",
"x": 396.375,
"y": -107.75,
"width": 396,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1187595241,
"version": 130,
"versionNonce": 1273030504,
"isDeleted": false,
"boundElements": [],
"updated": 1759070012771,
"link": null,
"locked": false,
"text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "3j50Ds74uU7oXoJ9kMOYJ",
"type": "text",
"x": 457.375,
"y": 903.75,
"width": 949.7594604492188,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aN",
"roundness": null,
"seed": 1994335529,
"version": 198,
"versionNonce": 1492696519,
"isDeleted": false,
"boundElements": [],
"updated": 1758882694747,
"link": null,
"locked": false,
"text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "yg-TvQvz4MwJZ0y8K7Ix0",
"type": "text",
"x": 435.375,
"y": 1026.25,
"width": 352,
"height": 250,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aP",
"roundness": null,
"seed": 1877486407,
"version": 344,
"versionNonce": 25830153,
"isDeleted": false,
"boundElements": [],
"updated": 1758883468886,
"link": null,
"locked": false,
"text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "2UXjWdE_jMcsCE2oQgTXn",
"type": "text",
"x": -334.75,
"y": 1112.5,
"width": 165,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aQ",
"roundness": null,
"seed": 700532363,
"version": 76,
"versionNonce": 1671597672,
"isDeleted": false,
"boundElements": [],
"updated": 1759070020002,
"link": null,
"locked": false,
"text": "class TokeNano:",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class TokeNano:",
"autoResize": true,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}

View File

@ -1,3 +1,47 @@
# NanoSocrates
This is the work project for the DeepLearning exam of 16th September 2025
## Index
- [Resources](./docs/RESOURCES.md)
## Setup
Create and activate your Conda environment with:
```bash
conda env create -f environment.yaml
conda activate deep_learning
```
Now install the pip dependencies:
```bash
pip install -r requirements.txt
```
Add the following to .vscode/settings.json:
```json
{
// For linux
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
// For OSX
"terminal.integrated.env.osx": {
"PYTHONPATH": "${workspaceFolder}"
},
// For Windows
"terminal.integrated.env.windows": {
"PYTHONPATH": "${workspaceFolder}"
}
}
```
## Troubleshooting
Sometimes, when uploading a really large batch of data, Git LFS can abort the upload because of a timeout.
The solution is to change its settings locally:
```bash
git config lfs.dialtimeout 3600
git config lfs.activitytimeout 3600
```
For more details, see: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory

View File

@ -0,0 +1,30 @@
-- To pass to Pandas
SELECT *
FROM RDFs
INNER JOIN Subjects USING (SubjectID)
INNER JOIN Relationships USING (RelationshipID)
INNER JOIN Objects USING (ObjectID);
-- To pass to Pandas for abstracts
SELECT *
FROM RDFs
INNER JOIN WikipediaAbstracts USING (MovieID);
-- To pass to Pandas for abbreviations
SELECT *
FROM Abbreviations;
-- More complex query to obtain a clean dataset
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
FROM RDFs
INNER JOIN SubjectsCountInRDFs USING (SubjectID)
INNER JOIN RelationshipsCountInRDFs USING(RelationshipID)
INNER JOIN ObjectsCountInRDFs USING (ObjectID)
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
-- insert WHERE here
-- WHERE SubjectID = 134626
GROUP BY MovieID;
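As the comments say, these queries are meant to be handed to Pandas; a minimal sketch of how that could look (the database path mirrors the one used in the cleaning notebook, treat it as an assumption):

```python
import sqlite3
import pandas as pd

CONN = sqlite3.connect("Assets/Dataset/Tmp/dataset2.db")  # assumed path

QUERY = (
    "SELECT * FROM RDFs "
    "INNER JOIN Subjects USING (SubjectID) "
    "INNER JOIN Relationships USING (RelationshipID) "
    "INNER JOIN Objects USING (ObjectID);"
)
RDF = pd.read_sql_query(QUERY, CONN)
print(RDF[["SubjectURI", "RelationshipURI", "ObjectURI"]].head())
```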

View File

@ -0,0 +1,174 @@
CREATE TABLE IF NOT EXISTS Movies (
MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
MovieURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS WikiPageIDs (
MovieID INTEGER PRIMARY KEY,
PageID INTEGER UNIQUE NOT NULL,
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
MovieID INTEGER PRIMARY KEY,
Abstract TEXT NOT NULL,
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);
CREATE TABLE IF NOT EXISTS Origins (
OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
OriginName TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Subjects (
SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
SubjectURI TEXT UNIQUE NOT NULL,
OriginID BIGINT NOT NULL,
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);
CREATE TABLE IF NOT EXISTS Relationships (
RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
RelationshipURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Objects (
ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
ObjectURI TEXT UNIQUE NOT NULL,
OriginID BIGINT NOT NULL,
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);
CREATE TABLE IF NOT EXISTS RDFs (
RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
MovieID INTEGER NOT NULL,
SubjectID INTEGER NOT NULL,
RelationshipID INTEGER NOT NULL,
ObjectID INTEGER NOT NULL,
UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
CREATE TABLE IF NOT EXISTS Abbreviations (
AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
URI TEXT UNIQUE NOT NULL,
Abbreviation TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
SubjectID INTEGER NOT NULL,
AbbreviationID INTEGER NOT NULL,
PRIMARY KEY(SubjectID, AbbreviationID),
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);
CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
RelationshipID INTEGER NOT NULL,
AbbreviationID INTEGER NOT NULL,
PRIMARY KEY(RelationshipID, AbbreviationID),
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);
CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
ObjectID INTEGER NOT NULL,
AbbreviationID INTEGER NOT NULL,
PRIMARY KEY(ObjectID, AbbreviationID),
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);
CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
-- Views
-- Subjects
CREATE VIEW IF NOT EXISTS ParsedSubjects
AS
SELECT
SubjectID,
CASE WHEN Abbreviation IS NULL
THEN SubjectURI
ELSE Abbreviation || ':' || replace(SubjectURI, URI, '') END
AS SubjectURI
FROM Subjects
LEFT JOIN Subjects_Abbreviations USING (SubjectID)
LEFT JOIN Abbreviations USING (AbbreviationID);
-- Relationships
CREATE VIEW IF NOT EXISTS ParsedRelationships
AS
SELECT
RelationshipID,
CASE WHEN Abbreviation IS NULL
THEN RelationshipURI
ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '') END
AS RelationshipURI
FROM Relationships
LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
LEFT JOIN Abbreviations USING (AbbreviationID);
-- Objects
CREATE VIEW IF NOT EXISTS ParsedObjects
AS
SELECT
ObjectID,
CASE WHEN Abbreviation IS NULL
THEN ObjectURI
ELSE Abbreviation || ':' || replace(ObjectURI, URI, '') END
AS ObjectURI
FROM Objects
LEFT JOIN Objects_Abbreviations USING (ObjectID)
LEFT JOIN Abbreviations USING (AbbreviationID);
-- Subject Count
CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs
AS
SELECT SubjectID, count(SubjectID) as Sub_Count
FROM RDFs
GROUP BY SubjectID;
-- Relationship Count
CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs
AS
SELECT RelationshipID, count(RelationshipID) as Rel_Count
FROM RDFs
GROUP BY RelationshipID;
-- Object Count
CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs
AS
SELECT ObjectID, count(ObjectID) as Obj_Count
FROM RDFs
GROUP BY ObjectID;

View File

@ -0,0 +1,55 @@
-- Insert MovieURI into Movies ; MovieID is auto incremental
INSERT INTO Movies (MovieURI) VALUES (?);
-- Get MovieID where MovieURI equal given value
SELECT MovieID FROM Movies WHERE MovieURI = ?;
-- SetPageId
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);
-- Get MovieId by PageID ... ( to create WikipediaAbstract)
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;
-- SetAbstract ...
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);
-- SetOrigin
---
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");
-- GetOrigin
SELECT OriginID FROM Origins WHERE OriginName = ?;
-- Subject, Relationship, Object, RDF
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
INSERT INTO Relationships (RelationshipURI) VALUES (?);
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);
SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
-- Prefixes
INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
-- Make sure the value is a URI before running this query,
-- and include at least the domain and the first path segment
SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
-- Query to retrieve data
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
-- insert WHERE here
GROUP BY MovieID;
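A sketch of how these parameterized statements could be driven from Python's sqlite3 module (the URI and the database path are invented examples):

```python
import sqlite3

CONN = sqlite3.connect("Assets/Dataset/Tmp/dataset2.db")  # assumed path
CUR = CONN.cursor()

MOVIE_URI = "http://dbpedia.org/resource/Inception"  # example value
CUR.execute("INSERT INTO Movies (MovieURI) VALUES (?);", (MOVIE_URI,))
CUR.execute("SELECT MovieID FROM Movies WHERE MovieURI = ?;", (MOVIE_URI,))
MOVIE_ID = CUR.fetchone()[0]
CONN.commit()
```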

View File

@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b9081b7c",
"metadata": {},
"outputs": [],
"source": [
"# This file deletes in the pipeline the unwanted relationship by different rules\n",
"import pandas as pd\n",
"import sqlite3\n",
"import numpy as np\n",
"\n",
"\n",
"CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
"\n",
"def get_RDF() -> pd.DataFrame:\n",
" \"\"\"\n",
" QUERY = \"SELECT * FROM RDFs \" \\\n",
" \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
" \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
" \"INNER JOIN Objects USING (ObjectID);\"\n",
" RDF = pd.read_sql_query(QUERY, CONN)\n",
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
" RDF = RDF.dropna()\n",
" \"\"\"\n",
" Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
" Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
" Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
" RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
"\n",
" # drop '' values \n",
" Subjects = Subjects.replace('', np.nan)# .dropna()\n",
" Relationships = Relationships.replace('', np.nan)# .dropna()\n",
" Objects = Objects.replace('', np.nan)# .dropna()\n",
"\n",
" # join RDF with its components\n",
" RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
" RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
" RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
" return RDF\n",
"\n",
"\n",
"#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
"\n",
"def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
" return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
"\n",
"\n",
"\n",
"RDF = get_RDF()\n",
"# RDF = RDF.dropna()\n",
"# print(RDF)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "644690bb",
"metadata": {},
"outputs": [],
"source": [
"def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
" counts = RDF[\"RelationshipURI\"].value_counts() \n",
" RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
" RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
" # counts is a series as key: relationship, value: count\n",
" # counts = counts[counts > count_treshold]\n",
" # relationships = counts.index\n",
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
" return RDF\n",
"\n",
"RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
"# print(new_RDF)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34525be6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" SubjectURI \\\n",
"0 http://dbpedia.org/resource/Nights_of_Cabiria \n",
"1 http://dbpedia.org/resource/California_Science... \n",
"2 http://dbpedia.org/resource/China_Captain \n",
"3 http://dbpedia.org/resource/Caravan_of_Courage... \n",
"4 http://dbpedia.org/resource/WHIH_Newsfront \n",
"... ... \n",
"12725500 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
"12725501 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
"12725502 http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
"12725503 http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
"12725504 http://dbpedia.org/resource/I_Won't_Play \n",
"\n",
" RelationshipURI \\\n",
"0 http://www.w3.org/2002/07/owl#differentFrom \n",
"1 http://www.w3.org/2002/07/owl#differentFrom \n",
"2 http://www.w3.org/2002/07/owl#differentFrom \n",
"3 http://www.w3.org/2002/07/owl#differentFrom \n",
"4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
"... ... \n",
"12725500 http://dbpedia.org/ontology/producer \n",
"12725501 http://dbpedia.org/ontology/producer \n",
"12725502 http://dbpedia.org/ontology/producer \n",
"12725503 http://dbpedia.org/ontology/producer \n",
"12725504 http://dbpedia.org/ontology/producer \n",
"\n",
" ObjectURI MovieID \\\n",
"0 http://dbpedia.org/resource/Cabiria 26 \n",
"1 http://dbpedia.org/resource/California_Academy... 185 \n",
"2 http://dbpedia.org/resource/Captain_China 614 \n",
"3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n",
"4 http://dbpedia.org/resource/Captain_America:_C... 594 \n",
"... ... ... \n",
"12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n",
"12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n",
"12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n",
"12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n",
"12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n",
"\n",
" RelationshipFreq MovieFreq \n",
"0 2132 216 \n",
"1 2132 264 \n",
"2 2132 66 \n",
"3 2132 131 \n",
"4 1653 133 \n",
"... ... ... \n",
"12725500 80077 95 \n",
"12725501 80077 95 \n",
"12725502 80077 41 \n",
"12725503 80077 98 \n",
"12725504 80077 91 \n",
"\n",
"[12725505 rows x 6 columns]\n"
]
}
],
"source": [
"def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
" counts = RDF[\"MovieID\"].value_counts() \n",
" RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
" RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
" RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
" # counts is a series as key: relationship, value: count\n",
" # counts = counts[counts > count_treshold]\n",
" # relationships = counts.index\n",
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
" return RDF\n",
"\n",
"RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
"print(RDF)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,21 @@
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
import pandas as pd
class BPE_corpus():
def __init__(self, output_path :str):
self.output_handler = open(output_path, "w")
def close(self):
# add corpus end before closing
self.output_handler.write(SpecialToken.CORPUS_END.value)
self.output_handler.close()
def write_from_str(self, output: str):
if output == '':
return
self.output_handler.write(output)
def write_from_df(self, df: pd.DataFrame):
self.write_from_str(get_raw_from_dataframe(df))
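A usage sketch of this writer; the path is just an example, and the exact corpus-end string comes from SpecialToken.CORPUS_END:

```python
corpus = BPE_corpus("Assets/Dataset/Tmp/corpus.txt")  # example path
corpus.write_from_str("<SOT>Lorem ipsum<EOT>")        # plain strings are appended as-is
corpus.close()                                        # appends SpecialToken.CORPUS_END and closes the file
```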

View File

@ -0,0 +1,26 @@
import pandas as pd
class RDF_completation_task_dataset():
"""
Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
Each RDF is saved as str
CSV Composition: ["MovieID","RDF"]
"""
def __init__(self, output_path:str):
self.output = open(output_path, "w")
# then the first row as header
header = ["MovieID","RDF"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
"""
Args:
RDF (pd.DataFrame): ["MovieID","RDF"]
"""
RDF.to_csv(self.output, index=False, header=False)

View File

@ -0,0 +1,58 @@
import pandas as pd
# no circular-dependency risk: this class never calls back into the rest of the pipeline
from Scripts.DataCleaning.filter import PipelineApplier
class RDF_mask_task_dataset():
"""
Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
For each RDF there will be 3 rows, each with a different component missing.
CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
"""
def __init__(self, output_path:str):
# these methods are only used by this class, but they belong at a lower level
self._build_triple = PipelineApplier.build_triple
self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
self.output = open(output_path, "w")
# then the first row as header
header = ["MovieID","IncompleteRDF","Missing","RDF"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
rdf_complete = self._build_triple(RDF)
rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
####
df_subject = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_subject,
"Missing": RDF["SubjectURI"],
"RDF": rdf_complete,
})
df_relationship = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_relationship,
"Missing": RDF["RelationshipURI"],
"RDF": rdf_complete,
})
df_object = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_object,
"Missing": RDF["ObjectURI"],
"RDF": rdf_complete,
})
output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
output_df.to_csv(self.output, index=False, header=False)
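Roughly, for one cleaned row write() emits three CSV rows, one per masked component (values are invented; the token spellings follow the SQL export query and the <MASK> token mentioned in build_incomplete_triple, the real ones live in SpecialToken):

```python
# Input row:
#   MovieID=117248, SubjectURI="<SUB>dbp-dbr:The_Dark_Knight",
#   RelationshipURI="<REL>dbp-dbo:director", ObjectURI="<OBJ>dbp-dbr:Christopher_Nolan"
#
# Output rows (MovieID, IncompleteRDF, Missing, RDF):
#   117248, <SOT><MASK><REL>dbp-dbo:director<OBJ>dbp-dbr:Christopher_Nolan<EOT>, <SUB>dbp-dbr:The_Dark_Knight, <full triple>
#   117248, <SOT><SUB>dbp-dbr:The_Dark_Knight<MASK><OBJ>dbp-dbr:Christopher_Nolan<EOT>, <REL>dbp-dbo:director, <full triple>
#   117248, <SOT><SUB>dbp-dbr:The_Dark_Knight<REL>dbp-dbo:director<MASK><EOT>, <OBJ>dbp-dbr:Christopher_Nolan, <full triple>
```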

View File

@ -0,0 +1,26 @@
import pandas as pd
class RDF_text_task_dataset():
"""
Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and its reverse.
In the CSV the RDFs are saved together as a single string.
CSV Composition: ["MovieID","RDFs","Abstract"]
"""
def __init__(self, output_path:str):
self.output = open(output_path, "w")
# then the first row as header
header = ["MovieID","RDFs","Abstract"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
"""
Args:
RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
"""
RDF.to_csv(self.output, index=False, header=False)

View File

@ -0,0 +1,77 @@
import argparse
import sys
class ProgramArgs:
def __init__(self, file: str, output: str, treshold: int):
self.file = file
self.output = output
self.treshold = treshold
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "-o", required=True, type=str)
PARSER.add_argument("--treshold", "-t", type=int, default=1)
parsed_args, _ = PARSER.parse_known_args(args)
# print(parsed_args.input_file)
return ProgramArgs(parsed_args.input_file, parsed_args.output_file, parsed_args.treshold)  # type: ignore
def print_dbpedia(file: str, out: str):
FILE = open(file, "r", encoding="utf-8")
OUT = open(out, mode="w", encoding="utf-8")
DOMAIN_PART = "dbpedia"
already_parsed : set[str] = set()
for row in FILE:
sections = row.split("/")
sections = list(filter(lambda item: item != "", sections))
# print(sections)
if len(sections) < 3:
continue
URI = "/".join(sections[1:3])
URI = "//".join([sections[0], URI])
if URI in already_parsed:
continue
DOMAIN = sections[1]
SUBDOMAINS = DOMAIN.split(".")
TYPE = sections[2][0]
if DOMAIN_PART not in SUBDOMAINS:
continue
already_parsed.add(URI)
SUB_ID = SUBDOMAINS[0]
if len(SUB_ID) > 3:
SUB_ID = SUB_ID[:3]
OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")
FILE.close()
OUT.close()
if __name__ == "__main__":
ARGS = get_args(sys.argv)
# ARGS = get_debug_args()
print_dbpedia(ARGS.file, ARGS.output)
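A worked example of what this script emits, traced through print_dbpedia (the input URI is just an illustration):

```python
# row = "http://dbpedia.org/resource/Inception"
# sections -> ['http:', 'dbpedia.org', 'resource', 'Inception']
# URI      -> "http://dbpedia.org/resource"
# SUB_ID   -> "dbp" (first three letters of "dbpedia"), TYPE -> "r"
# written line: "http://dbpedia.org/resource/", "dbp-dbr"
```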

View File

@ -0,0 +1,188 @@
# This module removes unwanted relationships from the pipeline according to different rules
import pandas as pd
import sqlite3
import numpy as np
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class PipelineApplier():
def __init__(self):
self.MOVIE_FILTER = pd.DataFrame()
self.REL_FILTER = pd.DataFrame()
def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
return RDF[RDF["RelationshipURI"]!= uri]
def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
"""Store RelationshipURI filters as a set """
self.relationship_filter_list: set[str] = set(filter_list)
def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
"""
You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
since this method creates that filter
Args:
MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
min_treshold (int):
max_treshold (int):
"""
MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold]
MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold]
self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]
def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold]
REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold]
self.REL_FILTER = REL_COUNT #["RelationshipURI"]
def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
return RDF
def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
return RDF
def rdf_add_special_token(self, RDF: pd.DataFrame):
"""
Adds the RDF special tokens to each element of the triple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
It only adds the special tokens of the three RDF elements, no other special tokens.
Args:
RDF (pd.DataFrame):
Returns:
pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
"""
# if a filter run earlier sliced the RDF and created a view, copying here resolves the problem
# for more context: SettingWithCopyWarning
RDF = RDF.copy()
# at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
return RDF
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
end = min(len(self.MOVIE_FILTER), ending_offset)
self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
# dataset has SubjectURI RelationshipURI ObjectURI
# want to drop the '' in them
# Replace empty strings with NaN
RDF = RDF.replace('', np.nan)
# Drop rows where any of the key columns are NaN
RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
return RDF
def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""_summary_
Args:
RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
Returns:
pd.DataFrame: ["MovieID","Triple","Abstract"]
"""
# to execute this method you must have iterated by movie_id,
# because by design we want one row per movie at the end
# MovieID and abstract can be given as input for a more generic method
# movie_id = RDF["MovieID"].iloc(0)
# abstract = RDF["Abstract"].iloc(0)
# first let's combine each row creating column triple as join of rdf
RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
# combine rows into one
# MovieID and Abstract are unique for each other 1 <-> 1
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
# add special token for: start of triple, end of triple and start of abstract
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
return RDF[["MovieID","Triple","Abstract"]]
def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""
Args:
RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
Returns:
pd.DataFrame: ["MovieID","Triple","Abstract"]
"""
# combine rows into one
# MovieID and Abstract are unique for each other 1 <-> 1
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
# add special token for: start of triple, end of triple and start of abstract
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
return RDF[["MovieID","Triple","Abstract"]]
@staticmethod
def build_triple(RDF: pd.DataFrame):
"""
Joins the RDF triple into one element, together with the START and END special tokens
Args:
RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
pd.DataFrame: RDF["Triple"] (just this column)
"""
# let's combine each row creating column triple as join of rdf
RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
return RDF["Triple"]
@staticmethod
def build_incomplete_triple(RDF: pd.DataFrame):
"""
Helper method used for the third task: "Predicting a masked component within an RDF triple".
Joins the RDF triple into one element, together with the START and END special tokens.
The MISSING element will be replaced by the special token <MASK>
Args:
RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME)
"""
# let's create a new column "Triple" with the joined RDF
# the following creates a column of MASK tokens with the length of the dataframe;
# it is not strictly needed since we expect only one column to be missing, but it is more robust (AND SLOW)
MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)
RDF["Triple"] = (
RDF.get("SubjectURI", MISSING) +
RDF.get("RelationshipURI", MISSING) +
RDF.get("ObjectURI", MISSING))
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
return RDF["Triple"]
@staticmethod
def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
# currently not used
"""
Helper method for the third task: "Predicting a masked component within an RDF triple".
Given two DataFrames, one containing the incomplete RDFs and the other only the missing components,
this method applies the special tokens.
Args:
RDF_incomplete (pd.DataFrame): the incomplete triples
MISSING (pd.DataFrame): the masked components
Returns:
pd.DataFrame: not implemented yet
"""
# stub: currently unused and not implemented
return None
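A rough sketch of what rebuild_by_movie / group_by_movie_from_triple produce for a single movie (values invented; the <SOTL> and <ABS> spellings are taken from the test fixtures, the exact values live in SpecialToken):

```python
# Input rows (Triple already wrapped in <SOT>…<EOT> by build_triple):
#   MovieID=26, Triple="<SOT>…triple 1…<EOT>", Abstract="…abstract text…"
#   MovieID=26, Triple="<SOT>…triple 2…<EOT>", Abstract="…abstract text…"
#
# Output (one row per movie):
#   MovieID=26
#   Triple  ="<SOTL><SOT>…triple 1…<EOT><SOT>…triple 2…<EOT>"
#   Abstract="<ABS>…abstract text…"
```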

View File

@ -0,0 +1,161 @@
import argparse
import csv
import sys
from typing import Self
class ProgramArgs:
def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
"""
Args:
file (str):
csv_uri_header (str): the name of the CSV column from which the program will read the URIs
output (str):
treshold (int):
"""
self.file = file
self.csv_uri_header = csv_uri_header
self.output = output
self.treshold = treshold
class Node:
def __init__(
self,
name: str,
quantity: int = 0,
):
self.name = name
self.quantity = quantity
self.children: dict[str, Node] = {}
@property
def is_leaf(self):
return len(self.children) == 0
def append_child(self, child: list[str]):
# print(child)
KEY = child[0]
if not self.children.get(KEY):
# if the key has no value, it means we are traversing this branch for the first time
# create another node for the key
self.children[KEY] = Node(KEY, 0)
# take the node for the key
CHILD = self.children[KEY]
self.quantity += 1
# if the child list to enter has only one element, which is KEY, no more node will be created
if len(child) == 1:
return
new_children = child[1:]
CHILD.append_child(new_children)
def __str__(self):
return f"{self.name}/ - {self.quantity}"
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "-i", required=True, type=str)
PARSER.add_argument("--header-name", "-c", required=True, type=str) # c stands for column
PARSER.add_argument("--output-file", "-o", required=True, type=str)
PARSER.add_argument("--treshold", "-t", type=int, default=1)
parsed_args, _ = PARSER.parse_known_args(args)
# print(parsed_args.input_file)
return ProgramArgs(parsed_args.input_file, parsed_args.header_name, parsed_args.output_file, parsed_args.treshold)  # type: ignore
def get_debug_args() -> ProgramArgs:
# -i ./Assets/Dataset/1-hop/movies.csv -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
FILE = "./Assets/Dataset/1-hop/movies.csv"
CSV_HEADER = "subject"
OUTPUT = "./Assets/Dataset/Tmp/prova.csv"
TRESHOLD = 1
return ProgramArgs(
FILE,
CSV_HEADER,
OUTPUT,
TRESHOLD
)
def tree_like(file: str, csv_uri_header: str, out: str, treshold: int):
INDENTATION = " "
properties: dict[str, Node] = {}
properties["pure"] = Node("pure", 0)
properties["URI"] = Node("uri", 0)
FILE = open(file, "r", encoding="utf-8")
# The header name is needed to select the URI column
for row in csv.DictReader(FILE):
uri_element = row[csv_uri_header]
sections = uri_element.split("/")
sections = list(filter(lambda item: item != "", sections))
# print(sections)
if sections[0] != "http:" and sections[0] != "https:":
properties["pure"].append_child(sections)
continue
properties["URI"].append_child(sections)
FILE.close()
stack: list[tuple[Node, int]] = []
for _, item in properties.items():
stack.append((item, 0))
OUT = open(out, mode="w", encoding="utf-8")
while len(stack) > 0:
LAST_ITEM = stack.pop()
NODE: Node = LAST_ITEM[0]
DEPTH: int = LAST_ITEM[1]
INDENT: str = INDENTATION * DEPTH
# Leaf nodes have quantity 0, so for them to appear the threshold has to be 0
# if NODE.quantity < treshold:
if treshold > NODE.quantity:
continue
OUT.write(f"{INDENT}- {NODE}\n")
if NODE.is_leaf:
continue
CHILDREN = []
for _, child in NODE.children.items():
CHILDREN.append((child, DEPTH + 1))
stack.extend(CHILDREN)
OUT.close()
if __name__ == "__main__":
ARGS = get_args(sys.argv)
# ARGS = get_debug_args()
tree_like(ARGS.file, ARGS.csv_uri_header, ARGS.output, ARGS.treshold)
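A small sketch of how Node.append_child builds the counting tree used above (values invented):

```python
root = Node("uri", 0)
root.append_child(["http:", "dbpedia.org", "resource", "Inception"])
root.append_child(["http:", "dbpedia.org", "resource", "Interstellar"])

# root.quantity == 2; each call also created/updated the chain of children, e.g.
# root.children["http:"].children["dbpedia.org"].children["resource"] has two leaf children
print(root)  # -> "uri/ - 2"
```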

View File

@ -0,0 +1,140 @@
import re
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
from Scripts.DataCleaning.filter import PipelineApplier
# task dataset builders
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
import pandas as pd
class Pipeline():
def __init__(self,
mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv",
bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt",
text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv",
completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv",
):
self.sql_endpoint = SqlEndpoint()
# classes to manage the tasks' datasets
self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path)
# prepare the filter
# the filter applier needs to know the frequency of Movies and Relationships across the whole Dataset
self.filter_applier = PipelineApplier()
MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
REL_COUNT = self.sql_endpoint.get_relationship_count()
self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
# prepare the filter on the relationshipURIs you want to delete:
relationship_uri_banned_list = [
"dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
"dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
"w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
"dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)
def execute_task_bpe_corpus(self):
for RDF in self._get_cleaned_movie_rows():
RDF = self.filter_applier.rebuild_by_movie(RDF)
RDF = RDF[["Triple","Abstract"]]
self.task_bpe_corpus.write_from_df(RDF)
self._end_file_handler()
def execute_task_rdf_mask(self):
for RDF in self._get_cleaned_movie_rows():
self.task_rdf_mask.write(RDF)
self._end_file_handler()
def execute_tasks_rdf_text(self):
for RDF in self._get_cleaned_movie_rows():
RDF = self.filter_applier.rebuild_by_movie(RDF)
self.task_rdf_text.write(RDF)
self._end_file_handler()
def execute_task_rdf_completation(self):
for RDF in self._get_cleaned_movie_rows():
RDF["Triple"] = self.filter_applier.build_triple(RDF)
self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
self._end_file_handler()
def execute_all_task(self):
for RDF in self._get_cleaned_movie_rows():
self.task_rdf_mask.write(RDF)
RDF["Triple"] = self.filter_applier.build_triple(RDF)
self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])
self.task_rdf_text.write(RDF)
self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
self._end_file_handler()
def _end_file_handler(self):
self.task_bpe_corpus.close()
self.task_rdf_mask.close()
self.task_rdf_text.close()
self.task_rdf_completation.close()
def _get_cleaned_movie_rows(self):
for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
RDF = self.filter_applier.drop_na_from_dataset(RDF)
RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
# other filter
#
RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
if RDF.empty:
continue
RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTERING BY FREQUENCY
yield RDF
def use_toy_dataset(self):
# CHOSEN MOVIES:
# The Dark Knight : 117248
# Inception : 147074
# The Avengers : 113621
# Cast Away : 1123
# The Departed : 117586
# American Psycho : 90177
# Avatar : 71587
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
self.sql_endpoint.movie_ids = movie_list
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
self.filter_applier.reduce_movie_list(starting_offset,ending_offset)
# there are a lot of settings to manage
# you only need to change settings:
# in the init for file paths, frequency filter limits and banned relationship URIs
# in use_toy_dataset, to change the toy dataset
# in _get_cleaned_movie_rows, to change how the pipeline behaves
#pipeline = Pipeline()
# pipeline.use_toy_dataset()
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
# pipeline.execute_all_task()

View File

@ -0,0 +1,53 @@
import argparse
import sys
import pandas as pd
class ProgramArgs:
def __init__(
self, input_file: str, column: str, output_file: str, count: bool
) -> None:
self.input_file = input_file
self.column = column
self.output_file = output_file
self.count = count
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--column", "--col", required=True, type=str)
PARSER.add_argument(
"--count", "-c", action="store_const", const=True, default=False
)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.column,
parsed_args.output_file,
parsed_args.count,
) # type: ignore
if __name__ == "__main__":
ARGS = get_args(sys.argv)
OUTPUT_FILE = open(ARGS.output_file, "w+", encoding="utf-8")
# Load the CSV
df = pd.read_csv(ARGS.input_file)
# Count occurrences of each unique value in the column
item_counts = df[ARGS.column].value_counts()
# Print the counts
for item, count in item_counts.items():
if ARGS.count:
OUTPUT_FILE.write(f"{item}: {count}\n")
else:
OUTPUT_FILE.write(f"{item}\n")

View File

@ -0,0 +1,146 @@
import argparse
from math import floor
import sys
from time import sleep
import SPARQLWrapper
class ProgramData:
def __init__(
self,
local_url,
query_url,
sparql_url,
output_type,
initial_offset,
timeout,
limit,
max_pages,
verbosity_level,
) -> None:
self.local_url = local_url
self.query_url = query_url
self.sparql_url = sparql_url
self.output_type = output_type
self.initial_offset = initial_offset
self.timeout = timeout
self.limit = limit
self.max_pages = max_pages
self.verbosity_level = verbosity_level
@property
def offset(self):
return self.limit
@property
def query(self):
with open(self.query_url, "r") as file:
return file.read()
DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)
def gather_cli_args(args: list[str]) -> ProgramData:
# TODO: Add argument for type
PARSER = argparse.ArgumentParser("sparql data fetcher")
PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
PARSER.add_argument("--limit", type=int, default=LIMIT)
PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
PARSER.add_argument("--verbose", "-v", action="count", default=0)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramData(
parsed_args.file_path,
parsed_args.query_file,
parsed_args.url,
SPARQLWrapper.CSV,
parsed_args.offset,
parsed_args.timeout,
parsed_args.limit,
parsed_args.max_pages,
parsed_args.verbose
)
# type: ignore
def fetch_data(DATA: ProgramData):
# Take correction of page into account
page = int(floor(DATA.initial_offset / DATA.limit)) - 1
exit = False
while not exit:
print(f"Starting to get page {page}")
CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
sparql.setReturnFormat(TYPE)
CURRENT_PAGE_QUERY = "\n".join([
DATA.query,
f"LIMIT {DATA.limit}",
f"OFFSET {CURRENT_OFFSET}"
])
print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")
sparql.setQuery(CURRENT_PAGE_QUERY)
try:
res = sparql.queryAndConvert()
text = ""
if type(res) == bytes:
initial_offset = 0
if page != 0:
initial_offset = 1
lines = res.decode("utf-8", "ignore").split("\n")
text = "\n".join(lines[initial_offset:])
if text == "":
exit = True
continue
with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
print(f"Writing page {page} on {DATA.local_url}")
dataset.write(
text
)
except Exception as ex:
print(f"Something went wrong during page {page}:\n\t{ex}")
print(f"Sleeping for {DATA.timeout}")
page += 1
if page == DATA.max_pages - 1:
exit = True
sleep(DATA.timeout)
if __name__ == "__main__":
DATA = gather_cli_args(sys.argv)
fetch_data(DATA)

View File

@ -0,0 +1,154 @@
from pathlib import Path
import pandas as pd
import csv
import time
import requests
input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"
sess = requests.Session()
CHUNK = 20
# Function to get clean intro text for a batch of Wikipedia PageIDs
def get_clean_text(pageIDS: list[str]):
parsing_time = 0
start_full = time.time()
API_URL = "https://en.wikipedia.org/w/api.php"
headers = {
"User-Agent": "CoolBot/0.0"
""
" (https://example.org/coolbot/; coolbot@example.org)"
}
ids = "|".join(pageIDS)
start_fetch = time.time()
res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
end_fetch = time.time()
fetch_time = end_fetch - start_fetch
print(f"Time elapsed FETCH: {fetch_time} seconds")
data = res.json()
abstracts = {}
# Make sure 'query' and the page exist
SKIPPED = 0
if "query" in data and "pages" in data["query"]:
for pageID in pageIDS:
if pageID in data["query"]["pages"]:
page = data["query"]["pages"][pageID]
extract: str = page.get("extract")
if extract:
print(f"Entry FOUND for pageID {pageID}")
start_parse = time.time()
extract = extract.strip()
extract = extract.replace("\n", "")
end_parse = time.time()
parsing_time = end_parse - start_parse
print(f"Time elapsed PARSE: {parsing_time} seconds")
abstracts[pageID] = extract
else:
SKIPPED += 1
print(f"Entry MISSING for pageID {pageID}")
else:
SKIPPED += 1
print(f"Page MISSING for pageID {pageID}")
print(f"Chunk done - Skipped {SKIPPED}")
end_full = time.time()
print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
return abstracts
def flush(movie_ids):
abstracts = get_clean_text(movie_ids)
start = time.time()
with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
for id, text in abstracts.items():
writer.writerow({"subject": id, "text": text})
end = time.time()
print(f"Time elapsed WRITE: {end - start} seconds")
def reconcile() -> int:
start = time.time()
input_file = open(input_csv, "r", newline="", encoding="utf-8")
output_file = open(output_csv, "r", newline="", encoding="utf-8")
next(input_file)
LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
current_check = input_file.readline().split(",")[1]
index = 1
while current_check != LAST_CHECKED:
current_check = input_file.readline().split(",")[1].replace("\n", "")
index += 1
input_file.close()
output_file.close()
end = time.time()
print(f"Time elapsed RECONCILE: {end - start} seconds")
print(f"FOUND, we need to skip {index} lines")
return index
SKIP = 0
if not Path(output_csv).is_file():
# Initialize output CSV
with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
writer.writeheader()
else:
# output already exists: resume by skipping the input lines already processed
SKIP = reconcile()
# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:
# Skip already done
for i in range(0, SKIP):
next(input)
reader = csv.reader(input)
index = -1
movie_ids = []
for line in reader:
index += 1
if index == 0:
continue
# Save movies in map
movie_ids.append(line[1])
if index % CHUNK == 0:
# Flush movies
flush(movie_ids)
movie_ids = []
# after the loop: flush any leftover movies from the last partial chunk
if movie_ids:
flush(movie_ids)

View File

@ -0,0 +1,26 @@
# HOW THE DATASET IS BUILT AND POPULATED
Note: the data are taken from CSV files in 1-hop
## CSV files composition
| CSV files | Original structure | Saved AS |
|--------------------|---------------------------------------|-------------------------------------|
| Wikipedia-summary   | PageId / abstract                     | subject, text                       |
| Movies | Movie URI | "subject" |
| Dataset | Movie URI / Relationship / Object [RDF] | subject, relationship, object |
| Movies-PageId | Movie URI / PageId (wiki) | "subject", "object" |
| Reverse | Subject / Relationship / Movie URI | "subject", "relationship", "object" |
## Wanted tables schema
| Table | Columns |
|---------------|-------------------------------------------------------------------------|
| Movies | MovieID [PK], Movie URI |
| WikiPageIDs | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)* |
| Abstracts | MovieID [PK, FK], abstract |
| Subjects | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
| Objects | ObjectID [PK], RDF Object, OriginID [FK] |
| Origins | OriginID [PK], Origin Name |
| RDFs | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |
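
The authoritative DDL lives in `./Scripts/DataCleaning/SQL_Queries/db_creation.sql`; the snippet below is only an illustrative sketch of a few of the tables above (SQLite types and constraints assumed), shown here to make the schema concrete.

```python
# Illustrative sketch only, not the real db_creation.sql:
# a minimal subset of the schema above, assuming SQLite.
import sqlite3

conn = sqlite3.connect("./Assets/Dataset/DatawareHouse/dataset.db")
conn.executescript("""
CREATE TABLE IF NOT EXISTS Movies (
    MovieID  INTEGER PRIMARY KEY,
    MovieURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID  INTEGER PRIMARY KEY,
    RelationshipURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID         INTEGER PRIMARY KEY,
    MovieID        INTEGER REFERENCES Movies(MovieID),
    SubjectID      INTEGER,
    RelationshipID INTEGER REFERENCES Relationships(RelationshipID),
    ObjectID       INTEGER
);
""")
conn.commit()
conn.close()
```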

View File

@ -0,0 +1,633 @@
import sqlite3
import csv
#####################################################################
# This file builds DatawareHouse/dataset.db from 1-hop csv files #
# Its schema is in ./SQL_Queries/db_creation.sql #
# The SQL queries used to populate it are in ./SQL_Queries/query.sql #
#####################################################################
# sometimes you may need to build a new db file; here is a little snippet for you
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
# MARK: SQL Definitions
# Insert Origins
def insertOrigin(curs: sqlite3.Cursor) -> bool:
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
try:
curs.execute(QUERY)
return True
except sqlite3.IntegrityError:
return False
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
curs.execute(QUERY, [originName])
originId = curs.fetchone()
if not originId:
return None
# in this case the real id is the first element of the tuple
return originId[0]
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
try:
curs.execute(QUERY, [movieUri])
return True
except sqlite3.IntegrityError:
return False
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
curs.execute(QUERY, [movieUri])
movieId = curs.fetchone()
if not movieId:
return None
# in this case the real id is the first element of the tuple
return movieId[0]
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
try:
curs.execute(QUERY, [movieId, pageId])
return True
except sqlite3.IntegrityError:
return False
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
curs.execute(QUERY, [pageId])
movieId = curs.fetchone()
if not movieId:
return None
# in this case the real id is the first element of the tuple
return movieId[0]
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
try:
curs.execute(QUERY, [movieId, abstract])
return True
except sqlite3.IntegrityError:
return False
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
try:
curs.execute(QUERY, [subjectURI, originID])
return True
except sqlite3.IntegrityError:
return False
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
try:
curs.execute(QUERY, [relationshipURI])
return True
except sqlite3.IntegrityError:
return False
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
try:
curs.execute(QUERY, [objectURI, originID])
return True
except sqlite3.IntegrityError:
return False
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
curs.execute(QUERY, [subjectURI])
subjectId = curs.fetchone()
if not subjectId:
return None
# in this case the real id is the first element of the tuple
return subjectId[0]
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
curs.execute(QUERY, [relationshipURI])
relationshipId = curs.fetchone()
if not relationshipId:
return None
# in this case the real id is the first element of the tuple
return relationshipId[0]
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
curs.execute(QUERY, [objectURI])
objectId = curs.fetchone()
if not objectId:
return None
# in this case the real id is the first element of the tuple
return objectId[0]
def insertRDF(
curs: sqlite3.Cursor,
movieId: int,
subjectId: int,
relationshipId: int,
objectId: int,
) -> bool:
QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
try:
curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
return True
except sqlite3.IntegrityError:
return False
# UGLY: should take the cursor as a parameter, like the functions above
def insert_abbreviation(uri, abbreviation) -> bool:
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
try:
CURS.execute(QUERY, [uri, abbreviation])
return True
except sqlite3.IntegrityError:
return False
# UGLY: should take the cursor as a parameter, like the functions above
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
try:
CURS.execute(QUERY, [object_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: should take the cursor as a parameter, like the functions above
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
try:
CURS.execute(QUERY, [relationship_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: should take the cursor as a parameter, like the functions above
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
QUERY = (
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
)
try:
CURS.execute(QUERY, [subject_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: should take the cursor as a parameter, like the functions above
def select_abbreviation_id(uri) -> int | None:
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
CURS.execute(QUERY, [uri])
abbreviation_id = CURS.fetchone()
if not abbreviation_id:
return None
# in this case the real id is the first element of the tuple
return abbreviation_id[0]
# MARK: Parsing
def parseMovies():
CSV_READER = csv.reader(MOVIES_CSV_HANDLER)
next(CSV_READER)
for row in CSV_READER:
MOVIE = row[0]
insertMovie(CURS, MOVIE)
def parseWikiPageId():
CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER)
for row in CSV_READER:
MOVIE_URI = row["subject"]
WIKI_PAGE_ID = int(row["object"])
MOVIE_ID = selectMovieId(CURS, MOVIE_URI)
if MOVIE_ID is None:
print(f"The MovieUri: {MOVIE_URI} has not a MovieId ")
continue
insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID)
def parseAbstract():
CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
for row in CSV_READER:
WIKI_PAGE_ID = int(row["subject"])
ABSTRACT = row["text"]
MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
if MOVIE_ID is None:
print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
continue
insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
def parseAbbreviations():
URI_CSV = csv.DictReader(URI_ABBR_CSV_HANDLER)
for row in URI_CSV:
URI = row["uri"]
ABBREVIATION = row["abbreviation"]
insert_abbreviation(URI, ABBREVIATION)
def parseRDF_Reverse():
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
if REVERSE_ORIGIN_ID is None:
return
total = 0
for row in REVERSE_CSV_READER:
SUBJECT = row["subject"]
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
insertRelationship(CURS, RELATIONSHIP)
insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
OBJECT_ID = selectObjectId(CURS, OBJECT)
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
MOVIE_ID = selectMovieId(CURS, OBJECT)
skip = False
# guard
if SUBJECT_ID is None:
print(f"No SubjectId for {SUBJECT}")
skip = True
if OBJECT_ID is None:
print(f"No ObjectId for {OBJECT}")
skip = True
if RELATIONSHIP_ID is None:
print(f"No RelationshipId for {RELATIONSHIP}")
skip = True
if MOVIE_ID is None:
print(f"No MovieId for {OBJECT}")
skip = True
if skip:
continue
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
total += 1
print(total)
def parseRDF_Dataset():
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
if DATASET_ORIGIN_ID is None:
return
total = 0
rdf_idx = 0
for row in DATASET_CSV_READER:
SUBJECT = row["subject"]
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
rdf_idx += 1
if rdf_idx % 100000 == 0:
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
insertRelationship(CURS, RELATIONSHIP)
insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
OBJECT_ID = selectObjectId(CURS, OBJECT)
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
MOVIE_ID = selectMovieId(CURS, SUBJECT)
skip = False
# guard
if SUBJECT_ID is None:
print(f"No SubjectId for {SUBJECT}")
skip = True
if OBJECT_ID is None:
print(f"No ObjectId for {OBJECT}")
skip = True
if RELATIONSHIP_ID is None:
print(f"No RelationshipId for {RELATIONSHIP}")
skip = True
if MOVIE_ID is None:
print(f"No MovieId for {SUBJECT}")
skip = True
if skip:
continue
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
total += 1
print(total)
def parseAbbr_Reverse():
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
if REVERSE_ORIGIN_ID is None:
return
total = 0
for row in REVERSE_CSV_READER:
SUBJECT = row["subject"]
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
OBJECT_ID = selectObjectId(CURS, OBJECT)
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
SUB_SECTIONS = SUBJECT.split("/")
REL_SECTIONS = RELATIONSHIP.split("/")
OBJ_SECTIONS = OBJECT.split("/")
SUB_ABBR_ID = None
REL_ABBR_ID = None
OBJ_ABBR_ID = None
skip = False
# guard
if SUBJECT_ID is None:
print(f"No SubjectId for {SUBJECT}")
skip = True
if OBJECT_ID is None:
print(f"No ObjectId for {OBJECT}")
skip = True
if RELATIONSHIP_ID is None:
print(f"No RelationshipId for {RELATIONSHIP}")
skip = True
if skip:
continue
if len(SUB_SECTIONS) > 4:
index = min(len(SUB_SECTIONS), 7)
while index > 3:
PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
SUB_ABBR_ID = select_abbreviation_id(PATH)
if SUB_ABBR_ID is not None:
if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
total += 1
index = 0
index -= 1
if len(REL_SECTIONS) > 4:
index = min(len(REL_SECTIONS), 7)
while index > 2:
PATH = "/".join(REL_SECTIONS[0:index]) + "%"
REL_ABBR_ID = select_abbreviation_id(PATH)
if REL_ABBR_ID is not None:
if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
total += 1
index = 0
index -= 1
if len(OBJ_SECTIONS) > 4:
index = min(len(OBJ_SECTIONS), 7)
while index > 3:
PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
OBJ_ABBR_ID = select_abbreviation_id(PATH)
if OBJ_ABBR_ID is not None:
if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
total += 1
index = 0
index -= 1
print(total)
def parseAbbr_Dataset():
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
if DATASET_ORIGIN_ID is None:
return
total = 0
rdf_idx = 0
for row in DATASET_CSV_READER:
SUBJECT = row["subject"]
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
rdf_idx += 1
if rdf_idx % 100000 == 0:
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
OBJECT_ID = selectObjectId(CURS, OBJECT)
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
SUB_SECTIONS = SUBJECT.split("/")
REL_SECTIONS = RELATIONSHIP.split("/")
OBJ_SECTIONS = OBJECT.split("/")
SUB_ABBR_ID = None
REL_ABBR_ID = None
OBJ_ABBR_ID = None
skip = False
# guard
if SUBJECT_ID is None:
print(f"No SubjectId for {SUBJECT}")
skip = True
if OBJECT_ID is None:
print(f"No ObjectId for {OBJECT}")
skip = True
if RELATIONSHIP_ID is None:
print(f"No RelationshipId for {RELATIONSHIP}")
skip = True
if skip:
continue
if len(SUB_SECTIONS) > 4:
index = min(len(SUB_SECTIONS), 7)
while index > 3:
PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
SUB_ABBR_ID = select_abbreviation_id(PATH)
if SUB_ABBR_ID is not None:
if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
total += 1
index = 0
index -= 1
if len(REL_SECTIONS) > 4:
index = min(len(REL_SECTIONS), 7)
while index > 2:
PATH = "/".join(REL_SECTIONS[0:index]) + "%"
REL_ABBR_ID = select_abbreviation_id(PATH)
if REL_ABBR_ID is not None:
if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
total += 1
index = 0
index -= 1
if len(OBJ_SECTIONS) > 4:
index = min(len(OBJ_SECTIONS), 7)
while index > 3:
PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
OBJ_ABBR_ID = select_abbreviation_id(PATH)
if OBJ_ABBR_ID is not None:
if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
total += 1
index = 0
index -= 1
print(total)
# MARK: Actual Code
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseAbbreviations()
# parseRDF_Reverse()
# parseRDF_Dataset()
# parseAbbr_Reverse()
parseAbbr_Dataset()
CONN.commit()
CONN.close()
MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
URI_ABBR_CSV_HANDLER.close()
"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""
"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""

View File

@ -0,0 +1,21 @@
import Project_Model.Libs.BPE as BPE
from pathlib import Path
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
VOCABULARY_path = "Assets/Model/toy_10/toy_dictionary.json"
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
SPECIAL_TOKEN_LIST = [token.value for token in SpecialToken]
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>dbp-dbp:title<OBJ>dbp-dbr:The_Dark_Knight<EOT>"
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
# INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan, from a screenplay co-written with his brother Jonathan. Based on the DC Comics superhero Batman, it is the sequel to Batman Begins (2005), and the second installment in The Dark Knight trilogy. The plot follows the vigilante Batman, police lieutenant James Gordon, and district attorney Harvey Dent, who form an alliance to dismantle organized crime in Gotham City. Their efforts are derailed by the Joker, an anarchistic mastermind who seeks to test how far Batman will go to save the city from chaos. The ensemble cast includes Christian Bale, Michael Caine, Heath Ledger, Gary Oldman, Aaron Eckhart, Maggie Gyllenhaal, and Morgan Freeman.Warner Bros. Pictures prioritized a sequel following the successful reinvention of the Batman film series with Batman Begins. Christopher and Batman Begins co-writer David S. Goyer developed the story elements, making Dent the central protagonist caught up in the battle between Batman and the Joker. In writing the screenplay, the Nolans were influenced by 1980s Batman comics and crime drama films, and sought to continue Batman Begins' heightened sense of realism. From April to November 2007, filming took place with a $185 million budget in Chicago and Hong Kong, and on sets in England. The Dark Knight was the first major motion picture to be filmed with high-resolution IMAX cameras. Christopher avoided using computer-generated imagery unless necessary, insisting on practical stunts such as flipping an 18-wheel truck and blowing up a factory.The Dark Knight was marketed with an innovative interactive viral campaign that initially focused on countering criticism of Ledger's casting by those who believed he was a poor choice to portray the Joker. Ledger died from an accidental prescription drug overdose in January 2008, leading to widespread interest from the press and public regarding his performance. When it was released in July, The Dark Knight received acclaim for its mature tone and themes, visual style, and performances—particularly that of Ledger, who received many posthumous awards including Academy, BAFTA, and Golden Globe awards for Best Supporting Actor, making The Dark Knight the first comic-book film to receive major industry awards. It broke several box-office records and became the highest-grossing 2008 film, the fourth-highest-grossing film to that time, and the highest-grossing superhero film of the time.Since its release, The Dark Knight has been assessed as one of the greatest superhero films ever, one of the best movies of the 2000s, and one of the best films ever made. It is considered the \"blueprint\" for many modern superhero films, particularly for its rejection of a typical comic-book movie style in favor of a crime film that features comic-book characters. Many filmmakers sought to repeat its success by emulating its gritty, realistic tone to varying degrees of success. The Dark Knight has been analyzed for its themes of terrorism and the limitations of morality and ethics. The United States Library of Congress selected it for preservation in the National Film Registry in 2020. A sequel, The Dark Knight Rises, concluded The Dark Knight trilogy in 2012.<SOTL>"
INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>"
# INPUT = "<ABS> Nolan,<SOTL>"
# 32: " "
TOKENANO = BPE.Classes.TokeNanoCore(VOCABULARY, SPECIAL_TOKEN_LIST)
print(f"input: {INPUT} \ninput length: {len(INPUT)}")
encoded = TOKENANO.encode(INPUT)
print(f"encode: {encoded} \nencode length: {len(encoded)}")
decoded = TOKENANO.decode(encoded)
print(f"decode: {decoded} \ndecode length: {len(decoded)}")

View File

@ -0,0 +1,21 @@
from enum import Enum
class SpecialToken(Enum):
# NOTE: subclassing (Enum, str) throws an error, so a plain Enum is used
START_TRIPLE_LIST = "<SOTL>"
START_TRIPLE = "<SOT>"
END_TRIPLE = "<EOT>"
SUBJECT = "<SUBJ>"
RELATIONSHIP = "<PRED>"
OBJECT = "<OBJ>"
ABSTRACT = "<ABS>"
CORPUS_END = "<END>"
## Tasks' Token
RDF_TO_TEXT = "<RDF2TXT>"
TEXT_TO_RDF = "<TEXT2RDF>"
CONTINUE_RDF = "<CONTINUERDF>"
MASK = "<MASK>"
# BPE Training:

View File

@ -0,0 +1,144 @@
#######################################################
# This file stands as an endpoint to interact with the DB #
#######################################################
# import sqlite3
import pandas as pd
from sqlalchemy import create_engine
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
class SqlEndpoint():
def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
# self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
# /// 3 slash -> relative path
# //// 4 slash -> absolute
# self.conn = self.sql_engine.connect().execution_options(stream_results=True)
# it seems that SQLite doesn't support streaming cursors
# PRAGMA tuning helps more for writing than for reading
self.chunk_size_row = chunk_size_row # not used now, since each chunk is a movie
self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
def get_RDF(self) -> pd.DataFrame :
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
FROM RDFs
INNER JOIN Subjects USING (SubjectID)
INNER JOIN Relationships USING (RelationshipID)
INNER JOIN Objects USING (ObjectID);
"""
return pd.read_sql_query(QUERY, self.sql_engine)
def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
"""
Returns:
pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
"""
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID);
"""
# return pd.read_sql_query(QUERY, self.CONN, chunksize=500)
# sqlite3
return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
# DEPRECATED !
start_token = SpecialToken()
QUERY = """
SELECT
MovieID,
? || SubjectURI AS SubjectURI,
? || RelationshipURI AS RelationshipURI,
? || ObjectURI AS ObjectURI,
Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID);
"""
return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
"""
Yields one DataFrame per movie (with all of its rows in the dataset).
The retrieved RDFs are already abbreviated by the SQL parser.
Yields:
Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
"""
# chunk by MovieID: the abstract is the same for every row and some interesting logic becomes applicable
# movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
# CHOSEN MOVIES:
# The Dark Knight : 117248
# Inception : 147074
# The Avengers : 113621
# Cast Away : 1123
# The Departed : 117586
# American Psycho : 90177
# Avatar : 71587
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
# movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
# movie_ids = movie_list
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
WHERE MovieID = (?);
"""
for movie_id in self.movie_ids:
yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
def get_movies_id_count(self) -> pd.DataFrame:
"""
Gets the count of each Movie in the Dataset
Returns:
Pandas.DataFrame: [MovieID, Count]
"""
QUERY = """
SELECT MovieID, COUNT(*) AS Count
FROM RDFs
GROUP BY MovieID;
"""
return pd.read_sql_query(QUERY, self.sql_engine)
def get_relationship_count(self) -> pd.DataFrame:
"""
Gets the count of each Relationship in the Dataset
Returns:
Pandas.DataFrame: [RelationshipURI, Count]
"""
QUERY = """
SELECT RelationshipURI, COUNT(*) AS Count
FROM RDFs
INNER JOIN ParsedRelationships USING (RelationshipID)
GROUP BY RelationshipURI;
"""
return pd.read_sql_query(QUERY, self.sql_engine)
if __name__ == "__main__" :
sql_endpoint = SqlEndpoint()
for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
print(pandas_row)
# sql_endpoint.get_RDF()
print("done")

View File

@ -0,0 +1,9 @@
import pandas as pd
def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
output = ''
for row in DF.itertuples(index=False, name=None):
output += "".join(map(str, row))
return output

View File

@ -0,0 +1,101 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
DEFAULT_CHUNK_SIZE = int(18e4)
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]
class ProgramArgs:
def __init__(
self,
input_file: str,
cache_dir: str,
output_file: str,
resume_at: int,
max_vocabulary: int,
max_iterations: int,
merge_treshold: int,
chunk_size: int,
debug_after: int,
) -> None:
self.input_file = input_file
self.cache_dir = cache_dir
self.output_file = output_file
self.resume_at = resume_at
self.max_vocabulary = max_vocabulary
self.max_iterations = max_iterations
self.merge_treshold = merge_treshold
self.chunk_size = chunk_size
self.debug_after = debug_after
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int)
PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int)
PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.cache_dir,
parsed_args.output_file,
parsed_args.resume_at,
parsed_args.max_vocabulary,
parsed_args.max_iterations,
parsed_args.merge_treshold,
parsed_args.chunk_size,
parsed_args.debug_after,
) # type: ignore
def train(args: ProgramArgs):
TRAINER = BPE.NanoSocraTrainer(
args.max_vocabulary,
TOKEN_LIST,
args.chunk_size,
args.merge_treshold,
args.max_iterations,
args.debug_after
)
DATASET_PATH = Path(args.input_file)
CACHE_DIR = Path(args.cache_dir)
VOCABULARY_PATH = Path(args.output_file)
print(f"Training BPE")
BPE_ENCODER = TRAINER.trainBPE(
DATASET_PATH,
CACHE_DIR,
resume_from_iter=args.resume_at
)
VOCABULARY = BPE_ENCODER.vocabulary
print(f"Saving Vocabulary in {VOCABULARY_PATH}")
BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)
if __name__ == "__main__":
ARGS = get_args(sys.argv)
train(ARGS)

View File

@ -0,0 +1,96 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]
class ProgramArgs:
def __init__(
self,
input_file: str,
output_file: str,
cache_file: str,
max_vocabulary: int,
max_iterations: int,
merge_treshold: int,
debug_after: int,
) -> None:
self.input_file = input_file
self.output_file = output_file
self.cache_file = cache_file
self.max_vocabulary = max_vocabulary
self.max_iterations = max_iterations
self.merge_treshold = merge_treshold
self.debug_after = debug_after
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str)
PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.output_file,
parsed_args.cache_file,
parsed_args.max_vocabulary,
parsed_args.max_iterations,
parsed_args.merge_treshold,
parsed_args.debug_after,
) # type: ignore
def train(args: ProgramArgs):
TRAINER = BPE.NanoSocraTrainerPool(
args.max_vocabulary,
TOKEN_LIST,
args.merge_treshold,
args.max_iterations,
args.debug_after
)
DATASET_PATH = Path(args.input_file)
VOCABULARY_PATH = Path(args.output_file)
CACHE_PATH = Path(args.cache_file)
start_bpe = BPE.NanoSocratesBPE()
if CACHE_PATH.is_file():
voc = BPE.load_nanos_vocabulary(CACHE_PATH)
start_bpe = BPE.NanoSocratesBPE(voc)
print(f"Training BPE")
BPE_ENCODER = TRAINER.trainBPE(
DATASET_PATH,
CACHE_PATH,
start_bpe
)
VOCABULARY = BPE_ENCODER.vocabulary
print(f"Saving Vocabulary in {VOCABULARY_PATH}")
BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)
if __name__ == "__main__":
ARGS = get_args(sys.argv)
train(ARGS)

View File

@ -0,0 +1,84 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]
class ProgramArgs:
def __init__(
self,
input_file: str,
output_file: str,
max_vocabulary: int,
max_iterations: int,
merge_treshold: int,
debug_after: int,
) -> None:
self.input_file = input_file
self.output_file = output_file
self.max_vocabulary = max_vocabulary
self.max_iterations = max_iterations
self.merge_treshold = merge_treshold
self.debug_after = debug_after
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.output_file,
parsed_args.max_vocabulary,
parsed_args.max_iterations,
parsed_args.merge_treshold,
parsed_args.debug_after,
) # type: ignore
def train(args: ProgramArgs):
TRAINER = BPE.NanoSocraTraineRam(
args.max_vocabulary,
TOKEN_LIST,
args.merge_treshold,
args.max_iterations,
args.debug_after
)
DATASET_PATH = Path(args.input_file)
VOCABULARY_PATH = Path(args.output_file)
print(f"Training BPE")
BPE_ENCODER = TRAINER.trainBPE(
DATASET_PATH
)
VOCABULARY = BPE_ENCODER.vocabulary
print(f"Saving Vocabulary in {VOCABULARY_PATH}")
BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)
if __name__ == "__main__":
ARGS = get_args(sys.argv)
train(ARGS)

View File

@ -0,0 +1,12 @@
# trim the big ("mad"-trained) dictionary down to a shorter one
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
from pathlib import Path
DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"
big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
big_dict = dict(list(big_dict.items())[:31744])
save_nanos_vocabulary(big_dict,Path(OUTPUT_PATH))

View File

@ -0,0 +1,48 @@
# each time, generate a corpus bigger than the last, without the old data,
# then let the BPE train on it, reusing the same vocabulary
from Scripts.DataCleaning.pipeline import Pipeline
from Scripts.Training.bpe_trainer_pool import train,get_args
from pathlib import Path
import os, shutil
CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"
def mad_corpus_generator(corpus_size :int, corpus_offset: int):
print("New Corpus")
pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
print("Pipeline Created")
corpus_ending_offset = corpus_size + corpus_offset
pipe.reduce_movie_list(corpus_offset,corpus_ending_offset)
print("Starting building corpus")
pipe.execute_task_bpe_corpus()
print("Corpus created")
def mad_bpe_trainer():
argv = [
"--input-file", CORPUS_PATH,
"--output-file", VOCABULARY_PATH,
"--cache-file", CACHE_PATH,
]
args = get_args(argv)
train(args)
def mad_hatter():
# 10,100,500,1000,1500,2000,3000,4000,5000,10000
film_list = [10,100,500,1000,1500,2000,3000,4000,5000,10000]
starting_offset = 0
for corpus_size in film_list:
# mad_corpus_generator(corpus_size, starting_offset)
# starting_offset = starting_offset + corpus_size
mad_bpe_trainer()
# put dict into cache
shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
mad_hatter()

View File

@ -0,0 +1,897 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "3zbCui3XtIGozHXTVAGRp",
"type": "rectangle",
"x": 316.5,
"y": 123,
"width": 436.5,
"height": 145.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a0",
"roundness": {
"type": 3
},
"seed": 1698427950,
"version": 35,
"versionNonce": 601575602,
"isDeleted": false,
"boundElements": [
{
"id": "wD66RDbG05HfvRhAtMb0J",
"type": "text"
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow"
}
],
"updated": 1758818588814,
"link": null,
"locked": false
},
{
"id": "wD66RDbG05HfvRhAtMb0J",
"type": "text",
"x": 480.98004150390625,
"y": 183.25,
"width": 107.5399169921875,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a1",
"roundness": null,
"seed": 910769774,
"version": 31,
"versionNonce": 1120989938,
"isDeleted": false,
"boundElements": null,
"updated": 1758818416720,
"link": null,
"locked": false,
"text": "dataset.db",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "3zbCui3XtIGozHXTVAGRp",
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "87-MeaiZGT1wln0nggYPZ",
"type": "rectangle",
"x": 339.5,
"y": 309.5,
"width": 392,
"height": 156,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a2",
"roundness": {
"type": 3
},
"seed": 655550318,
"version": 77,
"versionNonce": 1103939826,
"isDeleted": false,
"boundElements": null,
"updated": 1758818339000,
"link": null,
"locked": false
},
{
"id": "EjUxEhZqEBzwvlw0VE9eJ",
"type": "rectangle",
"x": 355.5,
"y": 327,
"width": 162,
"height": 125.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3",
"roundness": {
"type": 3
},
"seed": 1739846638,
"version": 64,
"versionNonce": 1594290034,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "ogRkV0neHrhEKTE6zlggl"
}
],
"updated": 1758818391415,
"link": null,
"locked": false
},
{
"id": "ogRkV0neHrhEKTE6zlggl",
"type": "text",
"x": 378.7100524902344,
"y": 377.25,
"width": 115.57989501953125,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3V",
"roundness": null,
"seed": 2037675630,
"version": 12,
"versionNonce": 1286472046,
"isDeleted": false,
"boundElements": null,
"updated": 1758818399222,
"link": null,
"locked": false,
"text": "RDF_String",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "EjUxEhZqEBzwvlw0VE9eJ",
"originalText": "RDF_String",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "hoIRMNiMJZl4YDo-hovWy",
"type": "rectangle",
"x": 542.5,
"y": 327,
"width": 173,
"height": 125.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a4",
"roundness": {
"type": 3
},
"seed": 1189796530,
"version": 99,
"versionNonce": 1071057006,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "rsapATFAT5YSBCXzLupgZ"
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow"
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow"
}
],
"updated": 1758818593647,
"link": null,
"locked": false
},
{
"id": "rsapATFAT5YSBCXzLupgZ",
"type": "text",
"x": 585.6800384521484,
"y": 377.25,
"width": 86.63992309570312,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a5",
"roundness": null,
"seed": 829619694,
"version": 12,
"versionNonce": 713902318,
"isDeleted": false,
"boundElements": null,
"updated": 1758818405150,
"link": null,
"locked": false,
"text": "Abstract",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "hoIRMNiMJZl4YDo-hovWy",
"originalText": "Abstract",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "jSx8ApfhtRs_nk37VvDMb",
"type": "rectangle",
"x": 316.5,
"y": 511,
"width": 436.5,
"height": 145.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a6",
"roundness": {
"type": 3
},
"seed": 492582894,
"version": 132,
"versionNonce": 893797614,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "6E23g-rgowNqHsBxX-LuM"
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow"
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow"
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow"
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow"
}
],
"updated": 1758818593647,
"link": null,
"locked": false
},
{
"id": "6E23g-rgowNqHsBxX-LuM",
"type": "text",
"x": 499.9100341796875,
"y": 571.25,
"width": 69.679931640625,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a7",
"roundness": null,
"seed": 267696178,
"version": 132,
"versionNonce": 1668243186,
"isDeleted": false,
"boundElements": null,
"updated": 1758818543211,
"link": null,
"locked": false,
"text": "Pandas",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "jSx8ApfhtRs_nk37VvDMb",
"originalText": "Pandas",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "ohj18N4AOTDz5lJNcV9gi",
"type": "rectangle",
"x": 261,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a8",
"roundness": {
"type": 3
},
"seed": 1446207150,
"version": 279,
"versionNonce": 317375026,
"isDeleted": false,
"boundElements": [
{
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
"type": "text"
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
"type": "text",
"x": 297.0800323486328,
"y": 796.5,
"width": 84.83993530273438,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a9",
"roundness": null,
"seed": 435116270,
"version": 199,
"versionNonce": 1282911218,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "train.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "ohj18N4AOTDz5lJNcV9gi",
"originalText": "train.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "A4Y54Y26fe257U_QU9lxX",
"type": "rectangle",
"x": 464,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aA",
"roundness": {
"type": 3
},
"seed": 186148850,
"version": 232,
"versionNonce": 997119858,
"isDeleted": false,
"boundElements": [
{
"id": "v4TvUlDEjH7EvPDmtbOn2",
"type": "text"
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "v4TvUlDEjH7EvPDmtbOn2",
"type": "text",
"x": 476.3500442504883,
"y": 796.5,
"width": 132.29991149902344,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": null,
"seed": 1131059634,
"version": 171,
"versionNonce": 239540530,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "validation.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "A4Y54Y26fe257U_QU9lxX",
"originalText": "validation.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "mPaYpJ9Xn7tlJPmKPqJKJ",
"type": "rectangle",
"x": 674.5,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aC",
"roundness": {
"type": 3
},
"seed": 1049323314,
"version": 235,
"versionNonce": 330560690,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "kg9nm2rpud6cax5aNPSnu"
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "kg9nm2rpud6cax5aNPSnu",
"type": "text",
"x": 711.4300231933594,
"y": 796.5,
"width": 83.13995361328125,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aD",
"roundness": null,
"seed": 522572142,
"version": 193,
"versionNonce": 1920372338,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "test.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
"originalText": "test.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 195.25,
"height": 99,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aG",
"roundness": null,
"seed": 873266098,
"version": 71,
"versionNonce": 541154738,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
49.5
],
[
-195.25,
49.5
],
[
-195.25,
99
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "ohj18N4AOTDz5lJNcV9gi",
"fixedPoint": [
0.4993630573248406,
-0.05747126436781609
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 218.25,
"height": 99,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 1210817582,
"version": 77,
"versionNonce": 1483392370,
"isDeleted": false,
"boundElements": null,
"updated": 1758818580594,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
49.5
],
[
218.25,
49.5
],
[
218.25,
99
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
"fixedPoint": [
0.4993630573248406,
-0.05747126436781609
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 0.5719232650604908,
"height": 99.07394122590165,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aK",
"roundness": null,
"seed": 1205316658,
"version": 96,
"versionNonce": 1748050674,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
-0.5719232650604908,
99.07394122590165
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "A4Y54Y26fe257U_QU9lxX",
"fixedPoint": [
0.44635717665566554,
-0.056621365219521276
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow",
"x": 539,
"y": 271.5,
"width": 0,
"height": 33.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 763990258,
"version": 17,
"versionNonce": 1028811378,
"isDeleted": false,
"boundElements": null,
"updated": 1758818588814,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
33.5
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "3zbCui3XtIGozHXTVAGRp",
"focus": -0.019473081328751418,
"gap": 3
},
"endBinding": {
"elementId": "hoIRMNiMJZl4YDo-hovWy",
"focus": -1.0404624277456647,
"gap": 30.7545797799829
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow",
"x": 536.5,
"y": 468.5,
"width": 0,
"height": 39,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1489771054,
"version": 33,
"versionNonce": 1828178606,
"isDeleted": false,
"boundElements": null,
"updated": 1758818593647,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
39
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "hoIRMNiMJZl4YDo-hovWy",
"focus": 1.0693641618497107,
"gap": 27.157190169432425
},
"endBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"focus": 0.008018327605956525,
"gap": 3.5
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}


@@ -0,0 +1,826 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"type": "line",
"version": 4622,
"versionNonce": 1623045672,
"isDeleted": false,
"id": "twu_PiAvEuQ4l1YYtZLET",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 289.8504963515835,
"y": 91.87474806402287,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.09201683999922,
"height": 99.49948667804088,
"seed": 1975340120,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
0.2542098813493443,
75.20117273657175
],
[
0.011896425679918422,
83.76249969444815
],
[
3.970409367559332,
87.46174320643391
],
[
17.75573317066317,
90.59250103325854
],
[
41.05683533152865,
91.56737225214069
],
[
63.319497586673116,
90.01084754868091
],
[
75.14781395923075,
86.28844687220405
],
[
76.81603792670788,
83.15042405259751
],
[
77.05033394391478,
76.25776215104557
],
[
76.86643881413028,
6.3089586511537865
],
[
76.45188016352971,
-0.2999144698665015
],
[
71.50179495549581,
-3.9936571317850627
],
[
61.077971898861186,
-6.132877429442784
],
[
37.32348754161154,
-7.932114425900202
],
[
18.278415656797975,
-6.859225353587373
],
[
3.2995959613238286,
-3.2201165291205287
],
[
-0.04168289608444441,
-0.045185660461322996
],
[
0,
0
]
],
"index": "a1",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2327,
"versionNonce": 1593094440,
"isDeleted": false,
"id": "hmJk4dH9VpOsfkrCTkhvh",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 290.3744257898585,
"y": 149.00103172175278,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 637665624,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a2",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2413,
"versionNonce": 311708712,
"isDeleted": false,
"id": "X1ldVIXm4DfBal5N2Pwn9",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 289.3425684673547,
"y": 120.03697638652972,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 904402520,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a3",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 5410,
"versionNonce": 92833576,
"isDeleted": false,
"id": "CFhp5ZxSVwHYzGUj4hEn1",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 288.28461948527263,
"y": 84.74247943834126,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 76.59753601865496,
"height": 15.49127539284798,
"seed": 1782811480,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [
"bxuMGTzXLn7H-uBCptINx"
],
"index": "a4",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 820,
"versionNonce": 608002600,
"isDeleted": false,
"id": "B43R7rWwK2_vdiRHBSSPk",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 324.77660659049513,
"y": 109.21914711824485,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1298686040,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "a5",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1108,
"versionNonce": 1839127848,
"isDeleted": false,
"id": "CkKMb9wkJfVk04T217zSs",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 325.12774837442873,
"y": 135.43576140530996,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 2133497176,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "a6",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 991,
"versionNonce": 588838952,
"isDeleted": false,
"id": "SHJdKeQPkfpvzSoNH--3o",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 6.239590202363168,
"x": 325.77660659049513,
"y": 164.20448797661635,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 81668696,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "a7",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "text",
"version": 489,
"versionNonce": 2023207720,
"isDeleted": false,
"id": "vUSyMBPup0jZ71CYXKyGb",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 280.1846389770508,
"y": 185.79462957545917,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 95.63072204589844,
"height": 23.595161071904883,
"seed": 425140056,
"groupIds": [
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"fontSize": 17.4778970902999,
"fontFamily": 1,
"text": "dataset.db",
"baseline": 16.595161071904883,
"textAlign": "center",
"verticalAlign": "top",
"index": "a8",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false,
"containerId": null,
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.350000000000001
},
{
"id": "R7pU0VP6CFKCAwuvt0xsr",
"type": "text",
"x": 295.5,
"y": 342,
"width": 374,
"height": 225,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a9",
"roundness": null,
"seed": 705463336,
"version": 1130,
"versionNonce": 72522328,
"isDeleted": false,
"boundElements": null,
"updated": 1758648226024,
"link": null,
"locked": false,
"text": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "G1xIRcJgm34_NMEWQFFlW",
"type": "text",
"x": 1419.5,
"y": 110,
"width": 253,
"height": 75,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aA",
"roundness": null,
"seed": 651981400,
"version": 256,
"versionNonce": 138082856,
"isDeleted": false,
"boundElements": null,
"updated": 1758646570344,
"link": null,
"locked": false,
"text": "class Pipeline\n - actions: [Action]\n ",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Pipeline\n - actions: [Action]\n ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "TBVy3JbJCkbA9kjVEJ8lv",
"type": "text",
"x": 694,
"y": 100,
"width": 495,
"height": 150,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": null,
"seed": 680960040,
"version": 560,
"versionNonce": 85012520,
"isDeleted": false,
"boundElements": null,
"updated": 1758649442239,
"link": null,
"locked": false,
"text": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "an7KRTzWpCytKNKgHftKC",
"type": "text",
"x": 1528.5,
"y": 365.5,
"width": 187,
"height": 150,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aC",
"roundness": null,
"seed": 1974317656,
"version": 306,
"versionNonce": 1574962264,
"isDeleted": false,
"boundElements": null,
"updated": 1758648154009,
"link": null,
"locked": false,
"text": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "2pQ5EULirrWs_QZPbClhh",
"type": "text",
"x": 785,
"y": 332.5,
"width": 418,
"height": 375,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 1402251560,
"version": 742,
"versionNonce": 680432168,
"isDeleted": false,
"boundElements": null,
"updated": 1758649532881,
"link": null,
"locked": false,
"text": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "O0fso8DJqFfwJEzmpUikM",
"type": "text",
"x": 1289,
"y": 195,
"width": 594,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aI",
"roundness": null,
"seed": 1582329944,
"version": 459,
"versionNonce": 1080077144,
"isDeleted": false,
"boundElements": null,
"updated": 1758647067031,
"link": null,
"locked": false,
"text": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "v0kzO6vlBWOdJCV3yoG69",
"type": "text",
"x": 1379.5,
"y": 718.5,
"width": 286,
"height": 175,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 1462407976,
"version": 635,
"versionNonce": 1012998696,
"isDeleted": false,
"boundElements": null,
"updated": 1758649495598,
"link": null,
"locked": false,
"text": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "WK34n9xeVxntypCtrlK6p",
"type": "text",
"x": 256.5,
"y": 787.5,
"width": 517,
"height": 175,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1166526296,
"version": 318,
"versionNonce": 1042162520,
"isDeleted": false,
"boundElements": null,
"updated": 1758649002604,
"link": null,
"locked": false,
"text": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "NY9jyUFLFFCNPE2sh00SX",
"type": "text",
"x": 1639,
"y": 606.5,
"width": 407,
"height": 200,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aP",
"roundness": null,
"seed": 20345896,
"version": 168,
"versionNonce": 627282472,
"isDeleted": false,
"boundElements": null,
"updated": 1758649426380,
"link": null,
"locked": false,
"text": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "SkhaoW-3TTKDZzEii3Lf6",
"type": "text",
"x": 1457.5,
"y": 955.5,
"width": 121,
"height": 50,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aQ",
"roundness": null,
"seed": 2071523672,
"version": 37,
"versionNonce": 105260376,
"isDeleted": false,
"boundElements": null,
"updated": 1758648834435,
"link": null,
"locked": false,
"text": "class Dump:\n -",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Dump:\n -",
"autoResize": true,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}


@@ -0,0 +1,634 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "JNB9z-PeqZ4s8KDfWaoXe",
"type": "rectangle",
"x": 106,
"y": 27,
"width": 653,
"height": 263,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a2",
"roundness": {
"type": 3
},
"seed": 710740889,
"version": 326,
"versionNonce": 1107631703,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false
},
{
"id": "e13wNTgUpn2flMpmMttqx",
"type": "text",
"x": 200.5943407656526,
"y": 44.07937975075269,
"width": 307.2781467269385,
"height": 23.3097531902191,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3",
"roundness": null,
"seed": 1012740663,
"version": 444,
"versionNonce": 589551257,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false,
"text": "Libs/CleaningPipeline/sql_endpoint",
"fontSize": 18.64780255217528,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Libs/CleaningPipeline/sql_endpoint",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "CgxCElJkKBtIHv-5WQrbo",
"type": "text",
"x": 195,
"y": 80.44259472749451,
"width": 403.64997665852184,
"height": 186.4780255217528,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a4",
"roundness": null,
"seed": 1261951799,
"version": 507,
"versionNonce": 1922906999,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false,
"text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
"fontSize": 18.64780255217528,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"type": "line",
"version": 4979,
"versionNonce": 1473849177,
"isDeleted": false,
"id": "sYReMTdYblr-oJtYYJALU",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -67.14432426259049,
"y": 87.19293561900287,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.09201683999922,
"height": 99.49948667804088,
"seed": 1263944119,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
0.2542098813493443,
75.20117273657175
],
[
0.011896425679918422,
83.76249969444815
],
[
3.970409367559332,
87.46174320643391
],
[
17.75573317066317,
90.59250103325854
],
[
41.05683533152865,
91.56737225214069
],
[
63.319497586673116,
90.01084754868091
],
[
75.14781395923075,
86.28844687220405
],
[
76.81603792670788,
83.15042405259751
],
[
77.05033394391478,
76.25776215104557
],
[
76.86643881413028,
6.3089586511537865
],
[
76.45188016352971,
-0.2999144698665015
],
[
71.50179495549581,
-3.9936571317850627
],
[
61.077971898861186,
-6.132877429442784
],
[
37.32348754161154,
-7.932114425900202
],
[
18.278415656797975,
-6.859225353587373
],
[
3.2995959613238286,
-3.2201165291205287
],
[
-0.04168289608444441,
-0.045185660461322996
],
[
0,
0
]
],
"index": "a6",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2684,
"versionNonce": 952947769,
"isDeleted": false,
"id": "0S6dEWQVqKUVkP6Z5IX1l",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -66.6203948243155,
"y": 144.31921927673278,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 817033943,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a7",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2770,
"versionNonce": 477619481,
"isDeleted": false,
"id": "szGLND7J0nVOvRkNXX9AS",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -67.65225214681931,
"y": 115.35516394150972,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 1704755191,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a8",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 5767,
"versionNonce": 2119031289,
"isDeleted": false,
"id": "O3t2uGktJlDd1_OX_bpV4",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -68.71020112890136,
"y": 80.06066699332126,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 76.59753601865496,
"height": 15.49127539284798,
"seed": 471296279,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [
"bxuMGTzXLn7H-uBCptINx"
],
"index": "a9",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1177,
"versionNonce": 525480665,
"isDeleted": false,
"id": "_SzKlOBOvJgBg7FX0JTTM",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -32.218214023678854,
"y": 104.53733467322485,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1368927799,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aA",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1465,
"versionNonce": 1410887609,
"isDeleted": false,
"id": "oJMl2Kxa3SPaiAY0kxo7A",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -31.867072239745255,
"y": 130.75394896028996,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1627606871,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aB",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1348,
"versionNonce": 314839193,
"isDeleted": false,
"id": "fB6pJBSMA-pRHrpgYKaLL",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 6.239590202363168,
"x": -31.218214023678854,
"y": 159.52267553159635,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1420643447,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aC",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "text",
"version": 846,
"versionNonce": 1091081593,
"isDeleted": false,
"id": "9gZ3Yy1MeP9kEOTLODqLG",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -76.81018163712321,
"y": 181.11281713043917,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 95.63072204589844,
"height": 23.595161071904883,
"seed": 2019206551,
"groupIds": [
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"fontSize": 17.4778970902999,
"fontFamily": 1,
"text": "dataset.db",
"baseline": 16.595161071904883,
"textAlign": "center",
"verticalAlign": "top",
"index": "aD",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false,
"containerId": null,
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.350000000000001
},
{
"id": "3eOw20xMhpB5jf_RMG24P",
"type": "text",
"x": 1131.3333333333335,
"y": 31.333333333333428,
"width": 508.3333333333333,
"height": 550,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aE",
"roundness": null,
"seed": 1535658041,
"version": 821,
"versionNonce": 1630266809,
"isDeleted": false,
"boundElements": null,
"updated": 1759157181677,
"link": null,
"locked": false,
"text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
"autoResize": false,
"lineHeight": 1.25
},
{
"id": "Fbl1gpb5r7QrdRauGUWm2",
"type": "text",
"x": 158.23809523809535,
"y": 502.52380952380935,
"width": 484.2857142857143,
"height": 500,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aF",
"roundness": null,
"seed": 2066618807,
"version": 552,
"versionNonce": 1269344823,
"isDeleted": false,
"boundElements": null,
"updated": 1759158199532,
"link": null,
"locked": false,
"text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
"autoResize": false,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}

22
docs/BPE.md Normal file

@@ -0,0 +1,22 @@
# BPE
## Research Material
- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
- [Implementing a byte pair encoding(BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)

215
docs/DBPEDIA.md Normal file

@@ -0,0 +1,215 @@
# DBpedia
## Graph IRI
This is the graph identifier (IRI):
`http://dbpedia.org`
## History of queries
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
{
SELECT ?object
WHERE {
?m rdf:type dbo:Film .
?object ?r ?m
}
}
}
```
### 2 Hops
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
{
SELECT ?object
WHERE {
?m rdf:type dbo:Film .
?object ?r ?m
FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
}
}
LIMIT 1000000
```
### 1 Hop
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
LIMIT 1000000
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject
WHERE {
?subject rdf:type dbo:Film .
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject
WHERE {
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
#### Wikipedia-movie
a.k.a. the query that produces the file with the Wikipedia abstracts
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject , ?object
WHERE {
?subject foaf:primaryTopic ?object .
?object rdf:type dbo:Film
}
```
#### Reverse
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
?a foaf:primaryTopic ?object
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
?a foaf:primaryTopic ?object
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
#### Film \ wiki page ID
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?subject ?pageID
WHERE {
?subject rdf:type dbo:Film .
?subject dbo:wikiPageID ?pageID .
?subject rdfs:label ?label .
FILTER (lang(?label) = "en")
}
```
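### Running the queries programmatically
The queries above can also be run from Python. Below is a minimal sketch, assuming the public endpoint `https://dbpedia.org/sparql` and the `SPARQLWrapper` package already listed in `requirements.txt`; the helper function and the exact query shown are illustrative, not part of the project code.
```python
# Illustrative sketch: execute one of the queries above against the public
# DBpedia endpoint with SPARQLWrapper (assumed endpoint, hypothetical helper).
from SPARQLWrapper import SPARQLWrapper, JSON

ENDPOINT = "https://dbpedia.org/sparql"

QUERY = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>
SELECT ?subject ?pageID
WHERE {
    ?subject rdf:type dbo:Film .
    ?subject dbo:wikiPageID ?pageID .
}
LIMIT 10
"""


def run_query(query: str) -> list[dict]:
    sparql = SPARQLWrapper(ENDPOINT)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()
    # Flatten the JSON bindings into plain {variable: value} dictionaries.
    return [
        {var: binding[var]["value"] for var in binding}
        for binding in results["results"]["bindings"]
    ]


if __name__ == "__main__":
    for row in run_query(QUERY):
        print(row["subject"], row["pageID"])
```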

3
docs/DEVELOPMENT.md Normal file

@@ -0,0 +1,3 @@
# Development
## Data Gathering

108
docs/RESOURCES.md Normal file

@@ -0,0 +1,108 @@
# Resources
## Byte-Pair Encoding (BPE)
### Overview
Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.
---
### Key Idea
BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.
---
### Algorithm Steps
1. **Initialization**
- Treat each character of the input text as a token.
2. **Find Frequent Pairs**
- Count all adjacent token pairs in the sequence.
3. **Merge Most Frequent Pair**
- Replace the most frequent pair with a new symbol not used in the text.
4. **Repeat**
- Continue until no frequent pairs remain or a desired vocabulary size is reached.
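In code, the four steps above reduce to a short loop. The following is a minimal sketch applied to the example string from the next section; it is illustrative only, separate from the project's own tokenizer, and the function names are invented for this example.
```python
# Minimal BPE merge loop: start from characters, repeatedly merge the most
# frequent adjacent pair until no pair repeats (or a merge budget is reached).
from collections import Counter


def most_frequent_pair(tokens: list[str]) -> tuple[str, str] | None:
    pairs = Counter(zip(tokens, tokens[1:]))
    if not pairs:
        return None
    pair, count = pairs.most_common(1)[0]
    return pair if count > 1 else None


def bpe_merges(text: str, max_merges: int = 10) -> tuple[list[str], list[tuple[str, str]]]:
    tokens = list(text)                        # 1. one token per character
    merges: list[tuple[str, str]] = []
    for _ in range(max_merges):
        pair = most_frequent_pair(tokens)      # 2. count adjacent pairs
        if pair is None:                       # 4. stop when nothing repeats
            break
        merges.append(pair)
        merged = pair[0] + pair[1]             # 3. the new symbol is the merged pair
        out, i = [], 0
        while i < len(tokens):
            if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
                out.append(merged)
                i += 2
            else:
                out.append(tokens[i])
                i += 1
        tokens = out
    return tokens, merges


tokens, merges = bpe_merges("aaabdaaabac")
print(tokens)   # final segmentation (exact result depends on tie-breaking)
print(merges)   # ordered list of learned merge rules
```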
---
### Example
Suppose the data to be encoded is:
```text
aaabdaaabac
```
#### Step 1: Merge `"aa"`
Most frequent pair: `"aa"` → replace with `"Z"`
```text
ZabdZabac
Z = aa
```
---
#### Step 2: Merge `"ab"`
Most frequent pair: `"ab"` → replace with `"Y"`
```text
ZYdZYac
Y = ab
Z = aa
```
---
#### Step 3: Merge `"ZY"`
Most frequent pair: `"ZY"` → replace with `"X"`
```text
XdXac
X = ZY
Y = ab
Z = aa
```
---
At this point, no pairs occur more than once, so the process stops.
---
### Decompression
To recover the original data, replacements are applied in **reverse order**:
```text
XdXac
→ ZYdZYac
→ ZabdZabac
→ aaabdaaabac
```
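The same reversal can be sketched in a few lines, assuming a `rules` mapping that mirrors the replacement table above (hypothetical helper, not project code):
```python
# Undo the replacements from the example, newest symbol first.
rules = {"Z": "aa", "Y": "ab", "X": "ZY"}  # insertion order = order of introduction


def decompress(text: str, rules: dict[str, str]) -> str:
    for symbol, expansion in reversed(list(rules.items())):
        text = text.replace(symbol, expansion)
    return text


print(decompress("XdXac", rules))  # -> "aaabdaaabac"
```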
---
### Advantages
- **Efficient vocabulary building**: reduces the need for massive word lists.
- **Handles rare words**: breaks them into meaningful subword units.
- **Balances character- and word-level tokenization**.
---
### Limitations
- Does not consider linguistic meaning—merges are frequency-based.
- May create tokens that are not linguistically natural.
- Vocabulary is fixed after training.

67
docs/SPARQL.md Normal file

@@ -0,0 +1,67 @@
# SPARQL
> [!NOTE]
> Resources taken from [this website](https://sparql.dev/)
## SPARQL Queries
### SELECT
```SQL
SELECT ?var1, ?var2, ...
```
### WHERE
```SQL
WHERE {
pattern1 .
pattern2 .
...
}
```
### FILTER
It's used to restrict the solutions matched by a [`WHERE`](#where) clause
```SQL
WHERE {
?person <http://example.com/hasCar> ?car .
FILTER (?car = <http://example.com/Car1>)
}
```
### OPTIONAL
It's used to fetch optional content when it exists; the match is kept even if the optional pattern has no solution
```SQL
SELECT ?person ?car
WHERE {
?person <http://example.com/hasCar> ?car .
OPTIONAL {
?car <http://example.com/hasColor> ?color .
}
}
```
### LIMIT
Limits the number of returned results
```SQL
LIMIT 10 # Take only 10 results
```
## SPARQL functions
### COUNT
```SQL
SELECT (COUNT(?person) AS ?count)
WHERE {
?person <http://example.com/hasCar> ?car .
}
```
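As a rough sketch (reusing `SPARQLWrapper` from `requirements.txt` against the public DBpedia endpoint, purely for illustration), the aggregate comes back as a single JSON binding whose value is a string and has to be unwrapped:
```python
# Illustrative only: reading a COUNT result with SPARQLWrapper.
# The endpoint and the class URI are placeholders, not project resources.
from SPARQLWrapper import SPARQLWrapper, JSON

sparql = SPARQLWrapper("https://dbpedia.org/sparql")
sparql.setQuery("""
    SELECT (COUNT(?film) AS ?count)
    WHERE { ?film a <http://dbpedia.org/ontology/Film> . }
""")
sparql.setReturnFormat(JSON)

result = sparql.query().convert()
count = int(result["results"]["bindings"][0]["count"]["value"])
print(count)
```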

BIN
environment.yaml Normal file

Binary file not shown.

18
requirements.txt Normal file

@@ -0,0 +1,18 @@
certifi==2025.8.3
charset-normalizer==3.4.3
idna==3.10
numpy==2.3.3
pandas==2.3.2
pyparsing==3.2.4
python-dateutil==2.9.0.post0
pytz==2025.2
rdflib==7.1.4
requests==2.32.5
setuptools==78.1.1
six==1.17.0
SPARQLWrapper==2.0.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
Wikipedia-API==0.8.1
SQLAlchemy