Merge branch 'dev' into dev.embedder

This commit is contained in:
Christian Risi 2025-10-03 18:08:34 +02:00
commit 999141f886
61 changed files with 5135 additions and 25 deletions

1
.gitignore vendored
View File

@ -254,4 +254,5 @@ $RECYCLE.BIN/
# ---> Custom
**/Tmp/**
**/cache/**
!**/.gitkeep

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": "${command:pickArgs}"
}
]
}

75
.vscode/settings.json vendored
View File

@ -1,24 +1,55 @@
{
    // Always treat the project root as the working dir for Jupyter
    "jupyter.notebookFileRoot": "${workspaceFolder}",
    // When you click "Run Python File in Terminal", DON'T cd into the file's folder
    "python.terminal.executeInFileDir": false,
    // Start new integrated terminals at the project root
    "terminal.integrated.cwd": "${workspaceFolder}",
    // Make pytest run from the root without needing a pytest.ini
    "python.testing.pytestEnabled": true,
    "python.testing.cwd": "${workspaceFolder}",
    "python.testing.pytestArgs": [
        "src/test"
    ],
    // Help Pylance resolve imports like `from src...` without red squiggles
    "python.analysis.extraPaths": [
        "${workspaceFolder}"
    ],
    // For linux
    "terminal.integrated.env.linux": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    // For OSX
    "terminal.integrated.env.osx": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    // For Windows
    "terminal.integrated.env.windows": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    "python.analysis.typeCheckingMode": "standard"
}
// {
// // Always treat the project root as the working dir for Jupyter
// "jupyter.notebookFileRoot": "${workspaceFolder}",
//
// // When you click "Run Python File in Terminal", DON'T cd into the file's folder
// "python.terminal.executeInFileDir": false,
//
// // Start new integrated terminals at the project root
// "terminal.integrated.cwd": "${workspaceFolder}",
//
// // Ensure Python can import from the project root no matter which file you run
// // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
// "terminal.integrated.env.windows": {
// "PYTHONPATH": "${workspaceFolder}"
// },
//
// // Make pytest run from the root without needing a pytest.ini
// "python.testing.pytestEnabled": true,
// "python.testing.cwd": "${workspaceFolder}",
// "python.testing.pytestArgs": ["src/test"],
//
// // Help Pylance resolve imports like `from src...` without red squiggles
// "python.analysis.extraPaths": ["${workspaceFolder}"]
// }

BIN
Assets/Model/toy_10/README.md (Stored with Git LFS) Normal file

Binary file not shown.

BIN
Assets/Model/toy_10/toy_dictionary.json (Stored with Git LFS) Normal file

Binary file not shown.

View File

@ -0,0 +1,4 @@
from abc import ABC
class Encoder(ABC):
pass

View File

@ -0,0 +1,164 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTraineRam:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
def trainBPE(
self,
path: Path,
bpe: NanoSocratesBPE | None = None,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
current_iteration = 0
data = self.__gather_data_from_file(path)
while not exit:
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
last_memory = None
_, data, last_memory = self.__round_train(BPE, data)
NEW_VOC_SIZE = BPE.vocabulary_size
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size}\n",
f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None)
f"\tvocabulary:\n{BPE.vocabulary}",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
DATA_LEN = len(data)
NEW_DATA = []
counter = 0
memory = NanoSocratesBatchMemoryBPE({}, 0)
while len(data) > 0:
counter += 1
last_batch = len(data) == 1
piece = data.pop()
bpe, memory, output = bpe.fit(piece, memory, last_batch)
if counter % int(1E6) == 0:
print(f"Fitted: {counter}/{DATA_LEN}")
if len(output) < 2:
continue
NEW_DATA.append(output)
return (bpe, NEW_DATA, memory)
def __gather_data_from_file(self, path: Path) -> list[list[int]]:
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
DATA: list[list[int]] = []
FILE = open(path, "r", encoding="utf-8")
file_string = FILE.read()
FILE.close()
for piece, type in SPLITTER.split_text(file_string):
if type != TokenType.BPE:
continue
int_list = self.__make_list_ids(piece)
DATA.append(int_list)
return DATA
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str):
return list(corpus.encode("utf-8"))
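For reference, a minimal usage sketch of the in-RAM trainer, assuming the same arguments as TestTrainBPE further down (which exercises the Pool variant) and reusing that test's training file:

# Minimal sketch (assumed usage): train a BPE vocabulary fully in memory.
from pathlib import Path
import Project_Model.Libs.BPE as BPE

trainer = BPE.NanoSocraTraineRam(int(32E3), ["<SOT>", "<EOT>"])
bpe = trainer.trainBPE(Path("Project_Model/Tests/trainer_files/train_simple.txt"))
print(bpe.vocabulary_size)  # 256 raw bytes + learned merge tokens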

View File

@ -0,0 +1,248 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTrainer:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
chunk_size: int,
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__chunk_size = chunk_size
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
def trainBPE(
self,
path: Path,
cache_dir: Path,
bpe: NanoSocratesBPE | None = None,
resume_from_iter: int = 0,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if not cache_dir.is_dir():
raise NotADirectoryError()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
cached = False
current_iteration = 0
input_path = path
NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0
PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)
if resume_from_iter != 0:
cached = True
current_iteration = resume_from_iter
input_path = next(PATH_GEN)
# UGLY: fixes a bug immediately, unfortunately
_, _ = next(MEMORY_PATH_GEN)
_, voc_cache_path = next(MEMORY_PATH_GEN)
vocabulary = load_nanos_vocabulary(voc_cache_path)
BPE = NanoSocratesBPE(vocabulary)
while not exit:
out_path = next(PATH_GEN)
internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
FILE = open(out_path, "w")
last_memory = None
for _, memory, output in self.__round_train(input_path, BPE, cached):
last_memory = memory
FILE.write(output)
FILE.close()
internal_cache = {
"finished_iter": current_iteration,
"read_from": f"{input_path}",
"wrote_to": f"{out_path}",
"at": datetime.datetime.now(datetime.timezone.utc).strftime(
"%Y-%m-%d %H:%M:%S.%f"
)[:-3],
}
VOCABULARY = BPE.vocabulary
save_json(internal_cache, internal_cache_path)
save_nanos_vocabulary(VOCABULARY, vocabulary_cache)
cached = True
input_path = out_path
NEW_VOC_SIZE = BPE.vocabulary_size
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size}\n",
f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None)
f"\tvocabulary:\n{BPE.vocabulary}",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):
CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
BPE = bpe
memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)
CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))
for chunk, last_chunk in CHUNKER_GENERATOR:
PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))
for piece, last_piece in PIECE_GENERATOR:
LAST_BATCH = last_chunk and last_piece
PIECE, TOKEN_TYPE = piece
if TOKEN_TYPE != TokenType.BPE:
_, _, out = BPE.fit([], memory, LAST_BATCH)
yield (BPE, memory, PIECE)
continue
PIECE_DATA = self.__make_list_ids(PIECE, cached)
_, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)
OUT_STRING = f"{out}"
yield (BPE, memory, OUT_STRING)
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str, cached: bool):
if not cached:
return list(corpus.encode("utf-8"))
REDUCED_CORPUS_LEN = len(corpus) - 1
# Skip these chars: "[" and "]"
INTS = corpus[1:REDUCED_CORPUS_LEN]
INT_LIST = list(map(int, INTS.split(",")))
return INT_LIST
def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):
CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"
switch = True
if initial_iteration % 2 == 1:
switch = False
del initial_iteration
while True:
if switch:
yield CORPUS_TMP_1
else:
yield CORPUS_TMP_2
switch = not switch
def __switch_memory(self, cache_path: Path, initial_iteration: int):
INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"
VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
VOCAB_TMP_2 = cache_path / "voc-tmp2.json"
switch = False
if initial_iteration % 2 == 1:
switch = True
del initial_iteration
while True:
if switch:
yield (INTERNAL_TMP_1, VOCAB_TMP_1)
else:
yield (INTERNAL_TMP_2, VOCAB_TMP_2)
switch = not switch

View File

@ -0,0 +1,280 @@
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
def split(a, n):
k, m = divmod(len(a), n)
return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):
bpe, data = object
NEW_DATA: list[list[int]] = []
memory = NanoSocratesBatchMemoryBPE({}, 0)
while len(data) > 0:
piece = data.pop()
bpe, memory, output = bpe.fit(piece, memory, False)
if len(output) < 2:
continue
# We are sure of its type
NEW_DATA.append(piece) # type: ignore
return (bpe, NEW_DATA, memory)
def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):
bpe, data = object
NEW_DATA: list[list[int]] = []
for index, piece in zip(range(0, len(data)), data):
output = bpe.encode_intermediate(piece)
if len(output) < 2:
continue
# We are sure of its type
NEW_DATA.append(data[index]) # type: ignore
return NEW_DATA
class NanoSocraTrainerPool:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
# TODO: add a resume function
def trainBPE(
self,
path: Path,
cache_file: Path,
bpe: NanoSocratesBPE | None = None,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if not cache_file.is_file():
file = cache_file.open("w")
file.close()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
current_iteration = 0
data = self.__gather_data_from_file(path)
data = self.__encode_from_cache(BPE, data)
while not exit:
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
last_memory = None
start = time.time_ns()
_, data, last_memory = self.__round_train(BPE, data)
end = time.time_ns()
NEW_VOC_SIZE = BPE.vocabulary_size
VOCABULARY = BPE.vocabulary
save_nanos_vocabulary(VOCABULARY, cache_file)
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
f"\tTime elapsed: {(end - start)/1E9}s",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
NEW_DATA: list[list[int]] = []
MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)
fit_funct = split_fit
CPU_COUNT = os.process_cpu_count()
if CPU_COUNT is None:
raise Exception()
VOCABULARY = bpe.vocabulary
data_chunks = split(data, CPU_COUNT)
JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]
JOB_RESULTS: list[
tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
]
with Pool() as pool:
JOB_RESULTS = pool.map(fit_funct, JOBS)
for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
_, job_output, job_memory = res
NEW_DATA.extend(job_output)
for key, value in job_memory.frequencies.items():
frequency = MEMORY.frequencies.get(key)
if frequency is None:
frequency = 0
MEMORY.frequencies[key] = 0
frequency += value
MEMORY.frequencies[key] = frequency
del job_output
del job_memory
print(f"Joined {i + 1} out of {CPU_COUNT}")
# Get new token
bpe.fit([], MEMORY, True)
print(f"Sentences from {len(data)} to {len(NEW_DATA)}")
return (bpe, NEW_DATA, MEMORY)
def __gather_data_from_file(self, path: Path) -> list[list[int]]:
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
DATA: list[list[int]] = []
FILE = open(path, "r", encoding="utf-8")
file_string = FILE.read()
FILE.close()
for piece, type in SPLITTER.split_text(file_string):
if type != TokenType.BPE:
continue
int_list = self.__make_list_ids(piece)
DATA.append(int_list)
return DATA
def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):
NEW_DATA : list[list[int]]= []
CPU_COUNT = os.process_cpu_count()
if CPU_COUNT is None:
raise Exception()
VOCABULARY = bpe.vocabulary
data_chunks = split(data, CPU_COUNT)
JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]
JOB_RESULTS: list[list[list[int]]]
with Pool() as pool:
JOB_RESULTS = pool.map(split_encode, JOBS)
for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
job_output = res
NEW_DATA.extend(job_output)
del job_output
print(f"Joined {i + 1} out of {CPU_COUNT}")
print(f"Sentences from {len(data)} to {len(NEW_DATA)}")
return NEW_DATA
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str):
return list(corpus.encode("utf-8"))

View File

@ -0,0 +1,219 @@
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException
# ABOUT THE DICTIONARY:
# the string is converted into UTF-8 bytes, that is: each char is represented by 1 to 4 bytes.
# each byte is cast to an integer; if an integer is lower than 256
# it represents a raw UTF-8 byte, otherwise it is a token ID.
class NanoSocratesBatchMemoryBPE:
"""Memory to batch training. Keeps token couple frequencies, and merge_treshold"""
def __init__(
self, frequencies: dict[tuple[int, int], int], merge_treshold: int
) -> None:
self.frequencies = frequencies
self.merge_treshold = merge_treshold
class NanoSocratesBPE(Encoder):
def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
super().__init__()
self.__vocabulary: dict[tuple[int, int], int] = {}
self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}
if vocabulary is None:
return
for key, value in vocabulary.items():
if value < 256:
raise OutOfDictionaryException()
# values under 256 are reserved for raw (unmerged) bytes
# TODO: check if they are in order
self.__vocabulary[key] = value
self.__reverse_vocabulary[value] = key
@property
def vocabulary_size(self):
return len(self.__vocabulary) + 256
@property
def vocabulary(self):
return self.__vocabulary
@property
def __next_id(self) -> int:
"""
Gets the next available token ID
Returns:
int:
"""
return self.vocabulary_size
# TODO: implement fit
def fit(
self,
chunk_data: list[int],
memory: NanoSocratesBatchMemoryBPE,
last_batch: bool,
):
ENCODED_CHUNK = self.encode_intermediate(chunk_data)
DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
# update frequency of each couple of element
for i in range(0, DATA_LEN_BEFORE_LAST):
CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])
frequency = memory.frequencies.get(CANDIDATE_COUPLE)
# Initialize frequency
if frequency is None:
frequency = 0
memory.frequencies[CANDIDATE_COUPLE] = 0
frequency += 1
memory.frequencies[CANDIDATE_COUPLE] = frequency
if not last_batch:
return (self, memory, ENCODED_CHUNK)
if len(memory.frequencies) < 1:
return (self, memory, ENCODED_CHUNK)
FREQUENCIES = memory.frequencies
MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
FREQUENCY = FREQUENCIES[MAX_COUPLE]
if FREQUENCY < memory.merge_treshold:
return (self, memory, ENCODED_CHUNK)
self.__learn_word(MAX_COUPLE)
return (self, memory, ENCODED_CHUNK)
def encode(self, piece: str) -> list[int]:
"""Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate()
Args:
piece (str):
Returns:
list[int]:
"""
converted_piece = list(piece.encode("utf-8"))
return self.encode_intermediate(converted_piece)
def encode_intermediate(self, piece: list[int]) -> list[int]:
"""Encode a piece (as list of integer) till its maximum
Args:
piece (list[int]): piece to encode
Returns:
list[int]: piece encoded
"""
current_piece = piece
new_piece = self.__round_encode(current_piece)
# keep encoding while each round still shrinks the piece
while len(current_piece) != len(new_piece):
current_piece = new_piece
new_piece = self.__round_encode(current_piece)
return current_piece
def __round_encode(self, piece: list[int]):
"""A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n
1) "ABAB" -> "XX"
2) "XX" -> "Y"
Args:
piece (list[int]): the object to encode as a list of integer
Returns:
(list[int]): the one time encoded object
"""
if len(piece) == 1:
return piece
PIECE_LENGTH = len(piece) - 1
NEW_PIECE: list[int] = []
index = 0
while index < PIECE_LENGTH:
CANDIDATE_WORD = (
piece[index],
piece[index + 1],
) # take a tuple of consecutive element [int]
CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)
# if no token to substitute the tuple, append the first element
if CANDIDATE_TOKEN is None:
NEW_PIECE.append(piece[index])
index += 1
# if the latter element of the tuple is the last element of the piece, append it
if index == PIECE_LENGTH:
NEW_PIECE.append(piece[index])
continue
# in this case there was a candidate token to substitute the couple of element
NEW_PIECE.append(CANDIDATE_TOKEN)
index += 2
if index == PIECE_LENGTH:
NEW_PIECE.append(piece[index])
return NEW_PIECE
# TODO: Remake decode to take a list of token IDs
def decode(self, token_ids: list[int]) -> str:
# deque: double ended queue
token_stack: deque[int] = deque(token_ids)
UTF_8_STRING_ARR: bytearray = bytearray()
while len(token_stack) > 0:
TOKEN_ID = token_stack.popleft()
if TOKEN_ID < 256:
UTF_8_STRING_ARR.append(TOKEN_ID)
continue
left_token, right_token = self.__token_decode(TOKEN_ID)
token_stack.appendleft(right_token)
token_stack.appendleft(left_token)
return UTF_8_STRING_ARR.decode("utf-8")
def __token_decode(self, token_id: int) -> tuple[int, int]:
CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)
if CANDIDATE_DECODED is None:
raise OutOfDictionaryException()
return CANDIDATE_DECODED
def __learn_word(self, words: tuple[int, int]):
"""learn a new couple of object in the vocabulary
Args:
words (tuple[int, int]): the Pair of element to substitute with a new tokenID
Raises:
DuplicateWordException: it launch if there is a duplicate of the new tokenID in the dictionary
"""
ID = self.__next_id
DUPLICATE = self.__vocabulary.get(words)
if DUPLICATE is not None:
raise DuplicateWordException()
self.__vocabulary[words] = ID
self.__reverse_vocabulary[ID] = words
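To make the byte/token-ID scheme described above concrete, a minimal sketch using the toy vocabulary from the tests further down (IDs below 256 are raw UTF-8 bytes; higher IDs are learned merges):

import Project_Model.Libs.BPE as BPE

# "ab" -> 256, (256, 256) -> 257, (257, 257) -> 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
bpe = BPE.NanoSocratesBPE(VOCABULARY)

assert bpe.encode("abababab") == [258]             # fully merged
assert bpe.decode([258, ord("c")]) == "ababababc"  # raw bytes pass through unchanged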

View File

@ -0,0 +1,70 @@
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException
class NanoSocratesChunker:
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
self.__max_size: int = max_size
self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = ""
# max theoretical size of chars
# between special tokens:
# - min: size - len(longest_token)
# - MAX: size - len(shortest_token)
def chunk(self, file_path: Path):
# read_file
FILE = open(file_path, "r", encoding="utf-8")
exit = False
while not exit:
REMAINING_SIZE = self.__max_size - len(self.__residual)
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
FILE_CHUNK = FILE.read(READ_SIZE)
if len(FILE_CHUNK) == 0:
exit = True
continue
CHUNK = self.__append_residuals(FILE_CHUNK)
boundaries = self.__identify_boundaries(CHUNK)
if boundaries is None:
# boundaries not found in 2 chunks,
if len(CHUNK) > self.__max_size - 1:
raise DelimiterNotFoundException()
if exit:
yield CHUNK
self.__set_residual(0, CHUNK)
continue
start, end = boundaries
self.__set_residual(end, CHUNK)
yield CHUNK[start:end]
def __identify_boundaries(self, corpus: str) -> tuple[int, int] | None:
end = 0
for match in self.__special_token_regex.finditer(corpus):
# print(match)
end = match.end()
if end == 0:
return None
return (0, end)
def __append_residuals(self, corpus: str) -> str:
RESIDUAL = self.__residual
self.__residual = ""
return RESIDUAL + corpus
def __set_residual(self, index: int, corpus: str):
self.__residual = corpus[index:]
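A minimal usage sketch, mirroring TestChunker further down: every yielded chunk ends on a special-token boundary, so re-joining the chunks reproduces the original file.

import re
from pathlib import Path
import Project_Model.Libs.BPE as BPE

SYMBOL_REGEX = re.compile("<(TOKEN|SOT|SEP|EOT)>")
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")

chunker = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
chunks = list(chunker.chunk(FILE_PATH))
assert "".join(chunks) == FILE_PATH.read_text(encoding="utf-8")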

View File

@ -0,0 +1,64 @@
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException
class NanoSocratesSpecial(Encoder):
def __init__(
self, bpe_vocabulary_size: int, special_tokens: list[str] = []
) -> None:
super().__init__()
self.__bpe_offset = bpe_vocabulary_size
self.__vocabulary: dict[str, int] = {}
self.__reverse_vocabulary: dict[int, str] = {}
if len(special_tokens) == 0:
return
for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):
CANDIDATE_ID = self.__bpe_offset + index + 1
self.__vocabulary[TOKEN] = CANDIDATE_ID
self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN
@property
def __next_id(self):
BPE_OFFSET = self.__bpe_offset
VOC_LENGTH = len(self.__vocabulary)
return BPE_OFFSET + VOC_LENGTH + 1
@property
def vocabulary(self) -> dict[str, int]:
return self.__vocabulary
@property
def reverse_vocabulary(self) -> dict[int, str]:
return self.__reverse_vocabulary
def add_special_word_to_vocabulary(self, word: str):
CANDIDATE_INDEX = self.__next_id
self.__vocabulary[word] = CANDIDATE_INDEX
self.__reverse_vocabulary[CANDIDATE_INDEX] = word
def encode(self, word: str) -> list[int]:
ID = self.__vocabulary.get(word)
if ID is None:
raise OutOfDictionaryException()
return [ID]
def decode(self, token_id: list[int]) -> str:
if len(token_id) != 1:
raise OutOfDictionaryException()
ID = token_id[0]
WORD = self.__reverse_vocabulary.get(ID)
if WORD is None:
raise OutOfDictionaryException()
return WORD
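A minimal sketch (import path assumed from the Classes package used elsewhere in this commit): special-token IDs are allocated right after the BPE vocabulary, so they never collide with byte values or merge tokens.

from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial

# with a BPE vocabulary size of 259, "<SOT>" gets ID 260 and "<EOT>" gets ID 261
special = NanoSocratesSpecial(259, ["<SOT>", "<EOT>"])
assert special.encode("<SOT>") == [260]
assert special.decode([261]) == "<EOT>"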

View File

@ -0,0 +1,98 @@
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType
class NanoSocratesSplitter:
def __init__(
self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
) -> None:
# note: the regex is already compiled
self.__special_token_regex = special_token_regex
self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding
def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
"""Split a text using a regex given
Args:
corpus (str): all the corpus string to split
Yields:
Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
"""
bpe_start = 0
bpe_end = len(corpus) # this can be deleted!
for special_token_start, special_token_end in self.__find_boundaries(corpus):
# FIND BPE
bpe_end = special_token_start
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
if BPE_TOKEN_TEXT != "":
for WORD in self.__split_words(BPE_TOKEN_TEXT):
yield (WORD, TokenType.BPE)
# FIND SPECIAL TOKEN
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
if SPECIAL_TOKEN_TEXT != "":
yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
# now save the new bpe start point
# it will be used in the next iteration
bpe_start = special_token_end
def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
"""
Find the start and end (end excluded) of each special token
Args:
corpus (str): the string in which the special tokens are searched
Yields:
Generator[tuple[int, int]]: Note the end is not included
"""
for match in self.__special_token_regex.finditer(corpus):
start = match.start()
end = match.end()
yield (start, end)
# make the last boundary be the end of corpus
# eof = len(corpus)
# yield(eof,eof)
def __split_words(self, bpe_piece: str) -> Generator[str]:
END_OF_STRING = len(bpe_piece)
bound_start = 0
bound_end = END_OF_STRING + 1
for i in range(0, END_OF_STRING):
CANDIDATE_CHAR = bpe_piece[i]
if CANDIDATE_CHAR != " ":
continue
bound_end = i
yield bpe_piece[bound_start:bound_end]
bound_start = bound_end
bound_end = END_OF_STRING + 1
yield bpe_piece[bound_start:bound_end]
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
not_special_token_list: list[int] = []
for token in corpus:
if token > self.__max_bpe_token_id:
if len(not_special_token_list) > 0:
yield (not_special_token_list, TokenType.BPE)
not_special_token_list = []
yield ([token], TokenType.SPECIAL)
continue
not_special_token_list.append(token)
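A minimal sketch, taken from TestSplitter further down: split_text tags each piece of the corpus as SPECIAL or BPE text.

import re
import Project_Model.Libs.BPE as BPE
from Project_Model.Libs.BPE.Enums import TokenType

SYMBOL_REGEX = re.compile("<(TOKEN|SOT|SEP|EOT)>")
splitter = BPE.NanoSocratesSplitter(SYMBOL_REGEX)

assert list(splitter.split_text("<SOT>Lorem <SEP>")) == [
    ("<SOT>", TokenType.SPECIAL),
    ("Lorem", TokenType.BPE),
    (" ", TokenType.BPE),
    ("<SEP>", TokenType.SPECIAL),
]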

View File

@ -0,0 +1,8 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TokeNano:
def __init__(self):
pass

View File

@ -0,0 +1,56 @@
from pathlib import Path
from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType
class TokeNanoCore:
def __init__(
self,
bpe_vocabulary: dict[tuple[int, int], int],
special_token_list: list[str],
# special_vocabulary: dict[str, int]
):
self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
SPECIAL_REGEX = special_regex_maker(special_token_list)
BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
self.__special_encoder = NanoSocratesSpecial(
BPE_VOCABULARY_SIZE, special_token_list
)
def encode(self, corpus: str) -> list[int]:
output: list[int] = []
for piece, token_type in self.__splitter.split_text(corpus):
if token_type == TokenType.SPECIAL:
output.extend(self.__special_encoder.encode(piece))
# slow but clear
if token_type == TokenType.BPE:
output.extend(self.__bpe_encoder.encode(piece))
return output
def decode(self, corpus: list[int]) -> str:
output_str = ""
for token, token_type in self.__splitter.split_tokens(corpus):
# token is a list of IDs in both cases: special tokens arrive as single-element lists
if token_type == TokenType.SPECIAL:
output_str += self.__special_encoder.decode(
token
) # it accepts a single-element list
# slow but clear
if token_type == TokenType.BPE:
output_str += self.__bpe_encoder.decode(
token
) # it accepts a list of integers
return output_str
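A minimal round-trip sketch, mirroring TestTokeNano further down: the splitter routes special tokens and BPE text to their respective encoders, and decode stitches the pieces back together.

from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore

VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
tokenizer = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])

encoded = tokenizer.encode("<SOT>abababab<EOT>")
assert tokenizer.decode(encoded) == "<SOT>abababab<EOT>"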

View File

@ -0,0 +1,16 @@
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
__all__ = [
"NanoSocratesChunker",
"NanoSocratesSplitter",
"NanoSocratesBPE",
"NanoSocraTrainer",
"NanoSocraTraineRam",
"NanoSocraTrainerPool"
]

View File

@ -0,0 +1,21 @@
from enum import Enum
class SpecialToken(Enum):
# (Enum, str) -> throws an error
START_TRIPLE_LIST = "<SOTL>"
START_TRIPLE = "<SOT>"
END_TRIPLE = "<EOT>"
SUBJECT = "<SUBJ>"
RELATIONSHIP = "<PRED>"
OBJECT = "<OBJ>"
ABSTRACT = "<ABS>"
CORPUS_END = "<END>"
## Tasks' Token
RDF_TO_TEXT = "<RDF2TXT>"
TEXT_TO_RDF = "<TEXT2RDF>"
CONTINUE_RDF = "<CONTINUERDF>"
MASK = "<MASK>"
# BPE Training:

View File

@ -0,0 +1,6 @@
from enum import Enum, auto
class TokenType(Enum):
SPECIAL = auto()
BPE = auto()

View File

@ -0,0 +1 @@
from .TokenType import TokenType

View File

@ -0,0 +1,4 @@
class DelimiterNotFoundException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class DuplicateWordException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class OutOfDictionaryException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class SentenceTooLongException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,11 @@
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException
__all__ = [
"DelimiterNotFoundException",
"OutOfDictionaryException",
"DuplicateWordException",
"SentenceTooLongException"
]

View File

@ -0,0 +1,13 @@
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
__all__ = [
"special_regex_maker",
"iterator_with_checks",
"save_nanos_vocabulary",
"load_nanos_vocabulary",
"save_json", "load_json"
]

View File

@ -0,0 +1,18 @@
import json
from pathlib import Path
def save_json(dictionary: dict, path: Path):
json_string = json.dumps(dictionary)
FILE = open(path, "w")
FILE.write(json_string)
FILE.close()
def load_json(path: Path) -> dict:
FILE = open(path, "r")
json_string = FILE.read()
FILE.close()
return json.loads(json_string)

View File

@ -0,0 +1,27 @@
from collections import deque
from typing import Generator, TypeVar
T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")
def iterator_with_checks(
generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:
# no need to catch StopIteration from this first next():
# we let it propagate
last_element = next(generator)
while True:
RETURN_ELEMENT = last_element
try:
element = next(generator)
last_element = element
yield (RETURN_ELEMENT, False)
except StopIteration:
yield (RETURN_ELEMENT, True)
break
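A minimal sketch (import path assumed from the Utils package exports): the wrapper yields each element together with a flag marking the last one, which is how the trainers detect their final batch.

from Project_Model.Libs.BPE.Utils import iterator_with_checks

flagged = list(iterator_with_checks(x for x in [10, 20, 30]))
assert flagged == [(10, False), (20, False), (30, True)]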

View File

@ -0,0 +1,15 @@
import re
def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
"""compile a regex for the special token
Args:
special_tokens (list[str]): the list of special token
Returns:
re.Pattern:
"""
REGEX_STR = "|".join(special_tokens)
return re.compile(REGEX_STR)
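A minimal sketch: the tokens are joined into one alternation pattern (inserted verbatim, without re.escape), so the compiled regex matches any special token.

from Project_Model.Libs.BPE.Utils import special_regex_maker

regex = special_regex_maker(["<SOT>", "<EOT>"])
assert regex.pattern == "<SOT>|<EOT>"
assert [m.group(0) for m in regex.finditer("<SOT>ab<EOT>")] == ["<SOT>", "<EOT>"]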

View File

@ -0,0 +1,49 @@
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException
def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:
JSON: dict[str, int] = {}
for key, item in vocabulary.items():
TUPLE_STR = f"{key}"
JSON[TUPLE_STR] = item
return json.dumps(JSON)
def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:
JSON: dict[str, int] = json.loads(json_string)
VOCABULARY: dict[tuple[int, int], int] = {}
for key, item in JSON.items():
REDUCED_KEY = len(key) - 1
KEY_STR = key[1:REDUCED_KEY]
VOC_KEY = tuple(map(int, KEY_STR.split(",")))
if len(VOC_KEY) != 2:
raise OutOfDictionaryException()
# Checked for weird things above
VOCABULARY[VOC_KEY] = item # type: ignore
return VOCABULARY
def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):
json_string = nanos_vocabulary2json_str(vocabulary)
FILE = open(path, "w")
FILE.write(json_string)
FILE.close()
def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
FILE = open(path, "r")
json_string = FILE.read()
FILE.close()
return nanos_json_str2vocabulary(json_string)
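A minimal round-trip sketch (direct module import assumed, since only the save/load helpers are re-exported by Utils): tuple keys are stringified as "(a, b)" on save and parsed back into int pairs on load.

from Project_Model.Libs.BPE.Utils.vocabulary import (
    nanos_vocabulary2json_str,
    nanos_json_str2vocabulary,
)

VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257}
json_str = nanos_vocabulary2json_str(VOCABULARY)  # '{"(97, 98)": 256, "(256, 256)": 257}'
assert nanos_json_str2vocabulary(json_str) == VOCABULARY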

View File

@ -0,0 +1,9 @@
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *
from . import Classes
from . import Enums
from . import Errors
from . import Utils

View File

@ -0,0 +1 @@
from . import BPE

View File

@ -0,0 +1,74 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
class TestBPE:
def test_bpe_encoding_simple(self):
TEXT = "abababab"
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = [258]
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
ENCODED = BPE_ENCODER.encode(TEXT)
assert len(ENCODED) == len(EXPECTED)
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
def test_bpe_decoding_simple(self):
INPUT = [258]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = "abababab"
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
DECODED = BPE_ENCODER.decode(INPUT)
assert len(DECODED) == len(EXPECTED)
for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected
def test_bpe_decoding_edge_1(self):
INPUT = [258, ord("c")]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = "ababababc"
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
DECODED = BPE_ENCODER.decode(INPUT)
assert len(DECODED) == len(EXPECTED)
for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
# TestBPE().test_bpe_decoding_simple()
TestBPE().test_bpe_encoding_simple()

View File

@ -0,0 +1,41 @@
from pathlib import Path
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")
class TestTrainBPE:
def test_bpe_train_encoding_simple(self):
TRAINER = BPE.NanoSocraTrainerPool(
int(32E3),
["<SOT>", "<EOT>"]
)
TEXT = "abababab"
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt")
EXPECTED = [258]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
BPE_ENCODER = TRAINER.trainBPE(
TEXT_PATH,
CACHE_DIR_PATH
)
ENCODED = BPE_ENCODER.encode(TEXT)
assert len(ENCODED) == len(EXPECTED)
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
TestTrainBPE().test_bpe_train_encoding_simple()

View File

@ -0,0 +1,4 @@
<SOT>Lorem <SEP>ipsu<SEP>m d<SEP>olor s<SEP>it ame<SEP>t,
<SEP>conse<SEP>cte<SEP>tur adip<SEP>iscin<SEP>g elit.
<SEP>Aenean a<SEP>t dui he<SEP>ndrer<SEP>it an<SEP>te soll<SEP>icitud
<SEP>in sce<SEP>lerisque<EOT>

View File

@ -0,0 +1,2 @@
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
<SEP>Aenean at dui <SEP>hendrerit ante <SEP>sollicitudin <SEP>scelerisque<EOT>

View File

@ -0,0 +1,3 @@
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
<SEP>Aenean at dui <SEP>hendrerit an te <SEP>sollicitudin <SEP>scelerisque
<SEP>dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf<EOT>

View File

@ -0,0 +1,89 @@
from pathlib import Path
import re
import pytest
import Project_Model.Libs.BPE as BPE
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)
class TestChunker:
def test_correct_simple(self):
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
LEAST_EXPECTED_CHUNKS = 3
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
CHUNKS = []
for chunk in CHUNKER.chunk(FILE_PATH):
print(chunk)
CHUNKS.append(
chunk
)
NANO_TEXT = "".join(CHUNKS)
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
assert NANO_TEXT == ORIG_TEXT
def test_correct_edge_1(self):
FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt")
LEAST_EXPECTED_CHUNKS = 3
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX)
CHUNKS = []
for chunk in CHUNKER.chunk(FILE_PATH):
print(chunk)
CHUNKS.append(
chunk
)
NANO_TEXT = "".join(CHUNKS)
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
assert NANO_TEXT == ORIG_TEXT
def test_throwing(self):
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)
with pytest.raises(BPE.DelimiterNotFoundException):
for chunk in CHUNKER.chunk(FILE_PATH):
print(chunk)
if __name__ == "__main__":
FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
LEAST_EXPECTED_CHUNKS = 3
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
CHUNKS = []
try:
for chunk in CHUNKER.chunk(FILE_PATH):
print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
CHUNKS.append(
chunk
)
except:
exit(0)
NANO_TEXT = "".join(CHUNKS)
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
assert NANO_TEXT == ORIG_TEXT

View File

@ -0,0 +1,182 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)
class TestSplitter:
def test_split(self):
TEXT = "<SOT>Lorem <SEP>"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("<SOT>", TokenType.SPECIAL),
("Lorem", TokenType.BPE),
(" ", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_trailing_text(self):
TEXT = "ipsu<SEP>m d<SEP>olor"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("ipsu", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
("m", TokenType.BPE),
(" d", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
# ("olor", TokenType.BPE)
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_multi_token(self):
TEXT = "ipsu<SEP>m d<SEP><SEP><SEP>dsg<SEP>olor"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("ipsu", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
("m", TokenType.BPE),
(" d", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
("<SEP>", TokenType.SPECIAL),
("<SEP>", TokenType.SPECIAL),
("dsg", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_malformed_1(self):
TEXT = "<SEP>lerisque"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = [
("<SEP>", TokenType.SPECIAL),
]
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_malformed_2(self):
TEXT = "lerisque"
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
EXPECTED_CHUNKS = []
CHUNKS = list(SPLITTER.split_text(TEXT))
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_token_decode_simple(self):
# to test the token split into special and bpe
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
token_list = [100, 101, 1477]
CHUNKS = list(SPLITTER.split_tokens(token_list))
EXPECTED_CHUNKS = [
([100, 101], TokenType.BPE),
([1477], TokenType.SPECIAL),
]
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_token_decode_simple_malformed(self):
# to test the token split into special and bpe
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
token_list = [100, 101, 1477, 100]
CHUNKS = list(SPLITTER.split_tokens(token_list))
EXPECTED_CHUNKS = [
([100, 101], TokenType.BPE),
([1477], TokenType.SPECIAL),
]
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
# Useful to debug weird cases
if __name__ == "__main__":
TestSplitter().test_split_trailing_text()

View File

@ -0,0 +1,21 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TestTokeNano:
def test_decode_encode_simple(self):
TEXT = "<SOT>abababab<EOT>"
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
# EXPECTED = [258]
TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])
ENCODED = TOKE_NANO.encode(TEXT)
DECODED = TOKE_NANO.decode(ENCODED)
assert TEXT == DECODED

View File

View File

@ -0,0 +1 @@
<SOT>abababab<EOT>

View File

@ -0,0 +1,695 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "EcT-dGsjmfW571ov8Gg4F",
"type": "text",
"x": 425.5,
"y": 132,
"width": 506,
"height": 425,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"4rCC2-N1thmII8_dwNhe1"
],
"frameId": null,
"index": "a3V",
"roundness": null,
"seed": 523521109,
"version": 883,
"versionNonce": 1590682729,
"isDeleted": false,
"boundElements": [
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow"
}
],
"updated": 1758881654155,
"link": null,
"locked": false,
"text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "74i4oK-JpcM4CgAqhz_x_",
"type": "rectangle",
"x": 382.5,
"y": 104.5,
"width": 592.5,
"height": 421,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"4rCC2-N1thmII8_dwNhe1"
],
"frameId": null,
"index": "a4",
"roundness": {
"type": 3
},
"seed": 50827893,
"version": 319,
"versionNonce": 704459557,
"isDeleted": false,
"boundElements": [],
"updated": 1758878226277,
"link": null,
"locked": false
},
{
"id": "s8I1JoKulE3Vnti9a374p",
"type": "text",
"x": 1113.5,
"y": 127,
"width": 517,
"height": 325,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"M6w9efVFwOZHkJGgwkyEw"
],
"frameId": null,
"index": "a5",
"roundness": null,
"seed": 2091174261,
"version": 480,
"versionNonce": 1964948039,
"isDeleted": false,
"boundElements": [],
"updated": 1758881941367,
"link": null,
"locked": false,
"text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "BY_Why7XDNftdMzPcwjVZ",
"type": "rectangle",
"x": 1086.5,
"y": 105.5,
"width": 593.0000000000001,
"height": 325.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"M6w9efVFwOZHkJGgwkyEw"
],
"frameId": null,
"index": "a6",
"roundness": {
"type": 3
},
"seed": 153939611,
"version": 234,
"versionNonce": 2068149129,
"isDeleted": false,
"boundElements": [
{
"id": "WcDks9DR8UqeZEaxAcRf9",
"type": "arrow"
}
],
"updated": 1758881945661,
"link": null,
"locked": false
},
{
"id": "JCPDhuTKRx4MN950Q3jL-",
"type": "text",
"x": 1116.411067193676,
"y": 477.3809288774704,
"width": 416.74578857421875,
"height": 99.70355731225297,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"DbtlKVF_9SjH2-9iMq9zy"
],
"frameId": null,
"index": "a7",
"roundness": null,
"seed": 1326854235,
"version": 479,
"versionNonce": 595084597,
"isDeleted": false,
"boundElements": [],
"updated": 1758902358518,
"link": null,
"locked": false,
"text": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
"fontSize": 19.940711462450594,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "l-O0rMS3SruV22_MPX9Jz",
"type": "rectangle",
"x": 1086.5,
"y": 451.4580039762846,
"width": 593,
"height": 208.0419960474308,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"DbtlKVF_9SjH2-9iMq9zy"
],
"frameId": null,
"index": "a8",
"roundness": {
"type": 3
},
"seed": 1490898171,
"version": 305,
"versionNonce": 587306139,
"isDeleted": false,
"boundElements": [
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow"
}
],
"updated": 1758902358518,
"link": null,
"locked": false
},
{
"id": "WcDks9DR8UqeZEaxAcRf9",
"type": "arrow",
"x": 773.5,
"y": 167,
"width": 297.17936724485867,
"height": 30,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": {
"type": 2
},
"seed": 1681364149,
"version": 303,
"versionNonce": 1262492265,
"isDeleted": false,
"boundElements": [],
"updated": 1758881945661,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
144.5,
-1.5
],
[
177.5,
-30
],
[
297.17936724485867,
-29.020420978562214
]
],
"lastCommittedPoint": null,
"startBinding": null,
"endBinding": {
"elementId": "BY_Why7XDNftdMzPcwjVZ",
"focus": 0.77319587628866,
"gap": 18.25
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow",
"x": 946.0000000000002,
"y": 274.95951048200493,
"width": 130.016707976343,
"height": 209.36808480159067,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aD",
"roundness": {
"type": 2
},
"seed": 1871768059,
"version": 1039,
"versionNonce": 213535035,
"isDeleted": false,
"boundElements": [],
"updated": 1758902358519,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
54.99999999999977,
12.54048951799507
],
[
69.49999999999977,
188.54048951799507
],
[
130.016707976343,
209.36808480159067
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "EcT-dGsjmfW571ov8Gg4F",
"focus": -0.48312180762055096,
"gap": 14.500000000000114
},
"endBinding": {
"elementId": "l-O0rMS3SruV22_MPX9Jz",
"focus": -0.16742658425737647,
"gap": 11.194126334166185
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "snZ__VDsIlri6NTp8M2Gf",
"type": "text",
"x": -245.25,
"y": 103,
"width": 330,
"height": 125,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aE",
"roundness": null,
"seed": 1758461093,
"version": 265,
"versionNonce": 1069481861,
"isDeleted": false,
"boundElements": [],
"updated": 1758879566916,
"link": null,
"locked": false,
"text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "PnbmqwEWYkP8oXElKFyTp",
"type": "text",
"x": -237.75,
"y": 544,
"width": 561,
"height": 125,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 501304683,
"version": 241,
"versionNonce": 1306401003,
"isDeleted": false,
"boundElements": [],
"updated": 1758878748210,
"link": null,
"locked": false,
"text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "xR_11IzgXX5O-m6WoRfCL",
"type": "text",
"x": -233.25,
"y": 366.5,
"width": 165,
"height": 75,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aI",
"roundness": null,
"seed": 2025585125,
"version": 395,
"versionNonce": 1799178985,
"isDeleted": false,
"boundElements": [],
"updated": 1758883940168,
"link": null,
"locked": false,
"text": "enum TokenType:\n + SPECIAL\n + BPE",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "enum TokenType:\n + SPECIAL\n + BPE",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "lgKSd9qCb94-5e8rd9I3r",
"type": "text",
"x": -219.75,
"y": 764.5,
"width": 462,
"height": 275,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aJ",
"roundness": null,
"seed": 1963214021,
"version": 464,
"versionNonce": 1104453739,
"isDeleted": false,
"boundElements": [],
"updated": 1759053302739,
"link": null,
"locked": false,
"text": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "DwFJoUpVT2YAEe9qPYAXa",
"type": "text",
"x": 496.75,
"y": 666,
"width": 440,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 1317596203,
"version": 152,
"versionNonce": 1840679687,
"isDeleted": false,
"boundElements": [],
"updated": 1758880107704,
"link": null,
"locked": false,
"text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "78gC46xatoO1_cRtaN8EC",
"type": "text",
"x": 396.375,
"y": -107.75,
"width": 396,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1187595241,
"version": 130,
"versionNonce": 1273030504,
"isDeleted": false,
"boundElements": [],
"updated": 1759070012771,
"link": null,
"locked": false,
"text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "3j50Ds74uU7oXoJ9kMOYJ",
"type": "text",
"x": 457.375,
"y": 903.75,
"width": 949.7594604492188,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aN",
"roundness": null,
"seed": 1994335529,
"version": 198,
"versionNonce": 1492696519,
"isDeleted": false,
"boundElements": [],
"updated": 1758882694747,
"link": null,
"locked": false,
"text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "yg-TvQvz4MwJZ0y8K7Ix0",
"type": "text",
"x": 435.375,
"y": 1026.25,
"width": 352,
"height": 250,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aP",
"roundness": null,
"seed": 1877486407,
"version": 344,
"versionNonce": 25830153,
"isDeleted": false,
"boundElements": [],
"updated": 1758883468886,
"link": null,
"locked": false,
"text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "2UXjWdE_jMcsCE2oQgTXn",
"type": "text",
"x": -334.75,
"y": 1112.5,
"width": 165,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aQ",
"roundness": null,
"seed": 700532363,
"version": 76,
"versionNonce": 1671597672,
"isDeleted": false,
"boundElements": [],
"updated": 1759070020002,
"link": null,
"locked": false,
"text": "class TokeNano:",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class TokeNano:",
"autoResize": true,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}

View File

@ -17,6 +17,25 @@ Now install dependencies on pip:
pip install -r requirements.txt
Add the following to .vscode/settings.json:
```json
{
// For linux
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
// For OSX
"terminal.integrated.env.osx": {
"PYTHONPATH": "${workspaceFolder}"
},
// For Windows
"terminal.integrated.env.windows": {
"PYTHONPATH": "${workspaceFolder}"
}
}
```
## Troubleshooting
Sometimes when uploading a really large batch of data, git can abort the upload due to a timeout.

View File

@ -0,0 +1,21 @@
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
import pandas as pd
class BPE_corpus():
def __init__(self, output_path :str):
self.output_handler = open(output_path, "w")
def close(self):
# add corpus end before closing
self.output_handler.write(SpecialToken.CORPUS_END.value)
self.output_handler.close()
def write_from_str(self, output: str):
if output == '':
return
self.output_handler.write(output)
def write_from_df(self, df: pd.DataFrame):
self.write_from_str(get_raw_from_dataframe(df))
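For orientation, a minimal usage sketch of the BPE_corpus writer above; the output path and the toy DataFrame values are illustrative assumptions.
```python
# Minimal sketch (assumed path, toy values): stream rows into the BPE corpus file.
import pandas as pd
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus

corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
df = pd.DataFrame({
    "Triple": ["<SOTL><SOT><SUBJ>A<PRED>b<OBJ>C<EOT>"],
    "Abstract": ["<ABS>Some abstract text"],
})
corpus.write_from_df(df)  # concatenates every cell of every row, no separators
corpus.close()            # appends the <END> corpus marker before closing
```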

View File

@ -0,0 +1,26 @@
import pandas as pd
class RDF_completation_task_dataset():
"""
Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
Each RDF is saved as str
CSV Composition: ["MovieID","RDF"]
"""
def __init__(self, output_path:str):
self.output = open(output_path, "w")
# write the header as the first row
header = ["MovieID","RDF"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
"""
Args:
RDF (pd.DataFrame): ["MovieID","RDF"]
"""
RDF.to_csv(self.output, index=False, header=False)
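A short sketch of feeding this writer, assuming a toy DataFrame with the documented ["MovieID","RDF"] columns; the values are made up.
```python
# Minimal sketch (toy values): append rows under the ["MovieID","RDF"] header.
import pandas as pd
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset

writer = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
rows = pd.DataFrame({
    "MovieID": [117248],
    "RDF": ["<SOT><SUBJ>The_Dark_Knight<PRED>director<OBJ>Christopher_Nolan<EOT>"],
})
writer.write(rows)  # column names are ignored on write (header=False); only the header row carries them
writer.close()
```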

View File

@ -0,0 +1,58 @@
import pandas as pd
# do not worry about circular dependencies, this class will never call something else
from Scripts.DataCleaning.filter import PipelineApplier
class RDF_mask_task_dataset():
"""
Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
In the CSV, each RDF produces 3 rows, each one with a different component missing.
CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
"""
def __init__(self, output_path:str):
# these methods are only used by this class, but they belong at a lower level
self._build_triple = PipelineApplier.build_triple
self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
self.output = open(output_path, "w")
# write the header as the first row
header = ["MovieID","IncompleteRDF","Missing","RDF"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
rdf_complete = self._build_triple(RDF)
rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
####
df_subject = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_subject,
"Missing": RDF["SubjectURI"],
"RDF": rdf_complete,
})
df_relationship = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_relationship,
"Missing": RDF["RelationshipURI"],
"RDF": rdf_complete,
})
df_object = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_object,
"Missing": RDF["ObjectURI"],
"RDF": rdf_complete,
})
output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
output_df.to_csv(self.output, index=False, header=False)
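A hedged sketch of the expansion this writer performs: one input triple (toy values, with the special tokens already added upstream) becomes three masked rows.
```python
# Minimal sketch (toy values): one triple is written three times, each with a different component masked.
import pandas as pd
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset

writer = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
rdf = pd.DataFrame({
    "MovieID": [117248],
    "SubjectURI": ["<SUBJ>The_Dark_Knight"],
    "RelationshipURI": ["<PRED>director"],
    "ObjectURI": ["<OBJ>Christopher_Nolan"],
})
writer.write(rdf)  # emits 3 rows: subject, relationship and object replaced by <MASK> in turn
writer.close()
```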

View File

@ -0,0 +1,26 @@
import pandas as pd
class RDF_text_task_dataset():
"""
Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
In the CSV the RDFs are saved together as a single string.
CSV Composition: ["MovieID","RDFs","Abstract"]
"""
def __init__(self, output_path:str):
self.output = open(output_path, "w")
# write the header as the first row
header = ["MovieID","RDFs","Abstract"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
"""
Args:
RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
"""
RDF.to_csv(self.output, index=False, header=False)
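An analogous hedged sketch for the text tasks, assuming one grouped row per movie in the documented ["MovieID","Triple","Abstract"] shape (toy values).
```python
# Minimal sketch (toy values): one grouped row per movie, as rebuild_by_movie produces upstream.
import pandas as pd
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset

writer = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
row = pd.DataFrame({
    "MovieID": [117248],
    "Triple": ["<SOTL><SOT><SUBJ>The_Dark_Knight<PRED>director<OBJ>Christopher_Nolan<EOT>"],
    "Abstract": ["<ABS>The Dark Knight is a 2008 superhero film."],
})
writer.write(row)
writer.close()
```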

View File

@ -0,0 +1,184 @@
# This file removes the unwanted relationships from the pipeline according to different rules
import pandas as pd
import sqlite3
import numpy as np
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class PipelineApplier():
def __init__(self):
self.MOVIE_FILTER = pd.DataFrame()
self.REL_FILTER = pd.DataFrame()
def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
return RDF[RDF["RelationshipURI"]!= uri]
def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
"""Store RelationshipURI filters as a set """
self.relationship_filter_list: set[str] = set(filter_list)
def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
"""
You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
since this method creates that filter
Args:
MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
min_treshold (int):
max_treshold (int):
"""
MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold]
MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold]
self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]
def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold]
REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold]
self.REL_FILTER = REL_COUNT #["RelationshipURI"]
def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
return RDF
def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
return RDF
def rdf_add_special_token(self, RDF: pd.DataFrame):
"""
Adds the RDF special token to each element of the tuple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
It only adds the special tokens for the three elements of the RDF, no other special token.
Args:
RDF (pd.DataFrame):
Returns:
pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
"""
# if a filter run earlier sliced the RDF and created a view, the copy here resolves the problem
# for more context: SettingWithCopyWarning
RDF = RDF.copy()
# at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
return RDF
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
# dataset has SubjectURI RelationshipURI ObjectURI
# want to drop the '' in them
# Replace empty strings with NaN
RDF = RDF.replace('', np.nan)
# Drop rows where any of the key columns are NaN
RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
return RDF
def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""_summary_
Args:
RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
Returns:
pd.DataFrame: ["MovieID","Triple","Abstract"]
"""
# to execute this method you must have iterated by movie_id
# because by design we want one row per movie at the end
# MovieID and abstract can be given as input for a more generic method
# movie_id = RDF["MovieID"].iloc(0)
# abstract = RDF["Abstract"].iloc(0)
# first let's combine each row creating column triple as join of rdf
RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
# combine rows into one
# MovieID and Abstract are unique for each other 1 <-> 1
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
# add special token for: start of triple, end of triple and start of abstract
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
return RDF[["MovieID","Triple","Abstract"]]
def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""
Args:
RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
Returns:
pd.DataFrame: ["MovieID","Triple","Abstract"]
"""
# combine rows into one
# MovieID and Abstract are unique for each other 1 <-> 1
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
# add special token for: start of triple, end of triple and start of abstract
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
return RDF[["MovieID","Triple","Abstract"]]
@staticmethod
def build_triple(RDF: pd.DataFrame):
"""
Obtains the joined RDF triple as one element, together with the START and END special tokens
Args:
RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
pd.DataFrame: RDF["Triple"] (just this column)
"""
# let's combine each row creating column triple as join of rdf
RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
return RDF["Triple"]
@staticmethod
def build_incomplete_triple(RDF: pd.DataFrame):
"""
Helper method used for the third task: "Predicting a masked component within an RDF triple".
Obtains the joined RDF triple as one element, together with the START and END special tokens.
The missing element is replaced by the special token <MASK>.
Args:
RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME)
"""
# let's create a new column "Triple" with the joined RDF
# the following creates a column of MASK tokens with the same length as the dataframe;
# it is not strictly needed since we expect exactly one column to be missing, but it is more robust (AND SLOW)
MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)
RDF["Triple"] = (
RDF.get("SubjectURI", MISSING) +
RDF.get("RelationshipURI", MISSING) +
RDF.get("ObjectURI", MISSING))
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
return RDF["Triple"]
@staticmethod
def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
# currently not used
"""
Helper method used for the third task: "Predicting a masked component within an RDF triple".
Given two DataFrames, the first containing the incomplete RDF and the other only the missing component,
this method applies the special token.
Args:
RDF (pd.DataFrame): _description_
Returns:
pd.DataFrame: _description_
"""
# take an example dataframe as ["SubjectURI",""]
# as input two dataframe, one with 2 column
return None
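A hedged sketch of the intended call order for PipelineApplier; the count DataFrames and thresholds below are toy values (in the repository they come from SqlEndpoint, as the pipeline file further down shows).
```python
# Minimal sketch (toy counts and thresholds): generate the filters once, then apply them per chunk.
import pandas as pd
from Scripts.DataCleaning.filter import PipelineApplier

applier = PipelineApplier()

# the frequency filters MUST be generated before the corresponding filter_by_* calls
movie_counts = pd.DataFrame({"MovieID": [1, 2], "Count": [120, 7]})
rel_counts = pd.DataFrame({"RelationshipURI": ["dbp-dbo:director", "dbp-dbp:image"], "Count": [900, 3]})
applier.generate_frequency_movie_filter(movie_counts, 50, 3000)
applier.generate_frequency_relationship_filter(rel_counts, 50, 2395627)
applier.generate_list_relationship_filter(["dbp-dbp:image"])

chunk = pd.DataFrame({
    "MovieID": [1, 1],
    "SubjectURI": ["The_Dark_Knight", "The_Dark_Knight"],
    "RelationshipURI": ["dbp-dbo:director", "dbp-dbp:image"],
    "ObjectURI": ["Christopher_Nolan", "poster.jpg"],
    "Abstract": ["some text", "some text"],
})
chunk = applier.drop_na_from_dataset(chunk)
chunk = applier.filter_by_frequency_movie_id(chunk)
chunk = applier.filter_by_frequency_relationship(chunk)
chunk = applier.delete_relationship_by_list_filter(chunk)
chunk = applier.rdf_add_special_token(chunk)  # must run after the frequency filters
```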

View File

@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str):
FILE = open(file, "r", encoding="utf-8")
# TODO: Change here so it takes single URI from a CSV file
# The header-name is needed
for row in csv.DictReader(FILE):

View File

@ -0,0 +1,131 @@
import re
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
from Scripts.DataCleaning.filter import PipelineApplier
# tasks dataset builder
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
import pandas as pd
class Pipeline():
def __init__(self):
self.sql_endpoint = SqlEndpoint()
# classes to manage the tasks' datasets
self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
# prepare the filter
# the filter applier needs to know the frequency of Movies and Relationships across the whole Dataset
self.filter_applier = PipelineApplier()
MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
REL_COUNT = self.sql_endpoint.get_relationship_count()
self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
# prepare the filter with the RelationshipURIs you want to delete:
relationship_uri_banned_list = [
"dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
"dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
"w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
"dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)
def execute_task_bpe_corpus(self):
for RDF in self._get_cleaned_movie_rows():
RDF = self.filter_applier.rebuild_by_movie(RDF)
RDF = RDF[["Triple","Abstract"]]
self.task_bpe_corpus.write_from_df(RDF)
self._end_file_handler()
def execute_task_rdf_mask(self):
for RDF in self._get_cleaned_movie_rows():
self.task_rdf_mask.write(RDF)
self._end_file_handler()
def execute_tasks_rdf_text(self):
for RDF in self._get_cleaned_movie_rows():
RDF = self.filter_applier.rebuild_by_movie(RDF)
self.task_rdf_text.write(RDF)
self._end_file_handler()
def execute_task_rdf_completation(self):
for RDF in self._get_cleaned_movie_rows():
RDF["Triple"] = self.filter_applier.build_triple(RDF)
self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
self._end_file_handler()
def execute_all_task(self):
for RDF in self._get_cleaned_movie_rows():
self.task_rdf_mask.write(RDF)
RDF["Triple"] = self.filter_applier.build_triple(RDF)
self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])
self.task_rdf_text.write(RDF)
self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
self._end_file_handler()
def _end_file_handler(self):
self.task_bpe_corpus.close()
self.task_rdf_mask.close()
self.task_rdf_text.close()
self.task_rdf_completation.close()
def _get_cleaned_movie_rows(self):
for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
RDF = self.filter_applier.drop_na_from_dataset(RDF)
RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
# other filter
#
RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
if RDF.empty:
continue
RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTERING BY FREQUENCY
yield RDF
def use_toy_dataset(self):
# CHOSEN MOVIES:
# The Dark Knight : 117248
# Inception : 147074
# The Avengers : 113621
# Cast Away : 1123
# The Departed : 117586
# American Psycho : 90177
# Avatar : 71587
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
self.sql_endpoint.movie_ids = movie_list
# there are a lot of settings to manage
# you only need to change settings:
# in __init__, for file paths, frequency filter limits and the banned RelationshipURIs
# in use_toy_dataset, to change the toy dataset
# in _get_cleaned_movie_rows, to change how the pipeline behaves
pipeline = Pipeline()
# pipeline.use_toy_dataset()
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
pipeline.execute_all_task()

View File

@ -0,0 +1,21 @@
from enum import Enum
class SpecialToken(Enum):
# (Enum, str) -> throws an error
START_TRIPLE_LIST = "<SOTL>"
START_TRIPLE = "<SOT>"
END_TRIPLE = "<EOT>"
SUBJECT = "<SUBJ>"
RELATIONSHIP = "<PRED>"
OBJECT = "<OBJ>"
ABSTRACT = "<ABS>"
CORPUS_END = "<END>"
## Tasks' Token
RDF_TO_TEXT = "<RDF2TXT>"
TEXT_TO_RDF = "<TEXT2RDF>"
CONTINUE_RDF = "<CONTINUERDF>"
MASK = "<MASK>"
# BPE Training:
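For reference, a small sketch of how these tokens compose into the tagged strings used throughout the pipeline (the triple values are made up).
```python
# Minimal sketch (toy values): compose one tagged triple the way the cleaning pipeline does.
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

subject = SpecialToken.SUBJECT.value + "The_Dark_Knight"
predicate = SpecialToken.RELATIONSHIP.value + "director"
obj = SpecialToken.OBJECT.value + "Christopher_Nolan"

triple = SpecialToken.START_TRIPLE.value + subject + predicate + obj + SpecialToken.END_TRIPLE.value
print(triple)
# <SOT><SUBJ>The_Dark_Knight<PRED>director<OBJ>Christopher_Nolan<EOT>
```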

View File

@ -0,0 +1,144 @@
#######################################################
# This file stands as the endpoint to interact with the DB #
#######################################################
# import sqlite3
import pandas as pd
from sqlalchemy import create_engine
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
class SqlEndpoint():
def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
# self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
# /// 3 slash -> relative path
# //// 4 slash -> absolute
# self.conn = self.sql_engine.connect().execution_options(stream_results=True)
# it seems that sqlite doesn't support a streaming cursor
# PRAGMA executes better for writing, not reading
self.chunk_size_row = chunk_size_row # not used now, since each chunk is a movie
self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
def get_RDF(self) -> pd.DataFrame :
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
FROM RDFs
INNER JOIN Subjects USING (SubjectID)
INNER JOIN Relationships USING (RelationshipID)
INNER JOIN Objects USING (ObjectID);
"""
return pd.read_sql_query(QUERY, self.sql_engine)
def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
"""
Returns:
pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
"""
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID);
"""
# return pd.read_sql_query(QUERY, self.CONN, chunksize=500)
# sqlite3
return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
# DEPRECATED !
start_token = SpecialToken()
QUERY = """
SELECT
MovieID,
? || SubjectURI AS SubjectURI,
? || RelationshipURI AS RelationshipURI,
? || ObjectURI AS ObjectURI,
Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID);
"""
return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
"""
Gets each time a DataFrame per movie ( with all its rows in the dataset).
The retrieved RDFs are already abbreviated by the sql parser
Yields:
Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
"""
# chunk by MovieID; the abstract is the same within a chunk and some interesting logic becomes applicable
# movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
# CHOSEN MOVIES:
# The Dark Knight : 117248
# Inception : 147074
# The Avengers : 113621
# Cast Away : 1123
# The Departed : 117586
# American Psycho : 90177
# Avatar : 71587
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
# movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
# movie_ids = movie_list
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
WHERE MovieID = (?);
"""
for movie_id in self.movie_ids:
yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
def get_movies_id_count(self) -> pd.DataFrame:
"""
Gets the count of each Movie in the Dataset
Returns:
Pandas.DataFrame: [MovieID, Count]
"""
QUERY = """
SELECT MovieID, COUNT(*) AS Count
FROM RDFs
GROUP BY MovieID;
"""
return pd.read_sql_query(QUERY, self.sql_engine)
def get_relationship_count(self) -> pd.DataFrame:
"""
Gets the count of each Relationship in the Dataset
Returns:
Pandas.DataFrame: [RelationshipURI, Count]
"""
QUERY = """
SELECT RelationshipURI, COUNT(*) AS Count
FROM RDFs
INNER JOIN ParsedRelationships USING (RelationshipID)
GROUP BY RelationshipURI;
"""
return pd.read_sql_query(QUERY, self.sql_engine)
if __name__ == "__main__" :
sql_endpoint = SqlEndpoint()
for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
print(pandas_row)
# sql_endpoint.get_RDF()
print("done")

View File

@ -0,0 +1,9 @@
import pandas as pd
def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
output = ''
for row in DF.itertuples(index=False, name=None):
output += "".join(map(str, row))
return output
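A tiny example of what this helper produces (toy values): every cell of every row is stringified and concatenated with no separator.
```python
# Minimal sketch (toy values): cells are concatenated row by row, in column order.
import pandas as pd
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe

df = pd.DataFrame({"Triple": ["<SOT>a<EOT>", "<SOT>b<EOT>"], "Abstract": ["<ABS>x", "<ABS>y"]})
print(get_raw_from_dataframe(df))
# <SOT>a<EOT><ABS>x<SOT>b<EOT><ABS>y
```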

View File

@ -0,0 +1,101 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
DEFAULT_CHUNK_SIZE = int(18e4)
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]
class ProgramArgs:
def __init__(
self,
input_file: str,
cache_dir: str,
output_file: str,
resume_at: int,
max_vocabulary: int,
max_iterations: int,
merge_treshold: int,
chunk_size: int,
debug_after: int,
) -> None:
self.input_file = input_file
self.cache_dir = cache_dir
self.output_file = output_file
self.resume_at = resume_at
self.max_vocabulary = max_vocabulary
self.max_iterations = max_iterations
self.merge_treshold = merge_treshold
self.chunk_size = chunk_size
self.debug_after = debug_after
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int)
PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int)
PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.cache_dir,
parsed_args.output_file,
parsed_args.resume_at,
parsed_args.max_vocabulary,
parsed_args.max_iterations,
parsed_args.merge_treshold,
parsed_args.chunk_size,
parsed_args.debug_after,
) # type: ignore
def train(args: ProgramArgs):
TRAINER = BPE.NanoSocraTrainer(
args.max_vocabulary,
TOKEN_LIST,
args.chunk_size,
args.merge_treshold,
args.max_iterations,
args.debug_after
)
DATASET_PATH = Path(args.input_file)
CACHE_DIR = Path(args.cache_dir)
VOCABULARY_PATH = Path(args.output_file)
print(f"Training BPE")
BPE_ENCODER = TRAINER.trainBPE(
DATASET_PATH,
CACHE_DIR,
resume_from_iter=args.resume_at
)
VOCABULARY = BPE_ENCODER.vocabulary
print(f"Saving Vocabulary in {VOCABULARY_PATH}")
BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)
if __name__ == "__main__":
ARGS = get_args(sys.argv)
train(ARGS)
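A hedged sketch of how get_args above resolves an argv list; the paths and values are illustrative, and the definitions from the script are assumed to be in scope.
```python
# Minimal sketch (illustrative argv): exercise the parser without launching a training run.
argv = [
    "--input-file", "./Assets/Dataset/Tmp/corpus.txt",   # assumed path
    "--cache-dir", "./Assets/Dataset/Tmp/cache",         # assumed path
    "--output-file", "./Assets/Dataset/Tmp/vocab.json",  # assumed path
    "--resume-at", "0",
    "--max-voc", "32000",   # alias of --max-vocabulary
    "--tresh", "2",         # alias of --merge-treshold
]
args = get_args(argv)
print(args.max_vocabulary, args.merge_treshold, args.resume_at)
# 32000 2 0
```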

View File

@ -0,0 +1,96 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]
class ProgramArgs:
def __init__(
self,
input_file: str,
output_file: str,
cache_file: str,
max_vocabulary: int,
max_iterations: int,
merge_treshold: int,
debug_after: int,
) -> None:
self.input_file = input_file
self.output_file = output_file
self.cache_file = cache_file
self.max_vocabulary = max_vocabulary
self.max_iterations = max_iterations
self.merge_treshold = merge_treshold
self.debug_after = debug_after
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str)
PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.output_file,
parsed_args.cache_file,
parsed_args.max_vocabulary,
parsed_args.max_iterations,
parsed_args.merge_treshold,
parsed_args.debug_after,
) # type: ignore
def train(args: ProgramArgs):
TRAINER = BPE.NanoSocraTrainerPool(
args.max_vocabulary,
TOKEN_LIST,
args.merge_treshold,
args.max_iterations,
args.debug_after
)
DATASET_PATH = Path(args.input_file)
VOCABULARY_PATH = Path(args.output_file)
CACHE_PATH = Path(args.cache_file)
start_bpe = BPE.NanoSocratesBPE()
if CACHE_PATH.is_file():
voc = BPE.load_nanos_vocabulary(CACHE_PATH)
start_bpe = BPE.NanoSocratesBPE(voc)
print(f"Training BPE")
BPE_ENCODER = TRAINER.trainBPE(
DATASET_PATH,
CACHE_PATH,
start_bpe
)
VOCABULARY = BPE_ENCODER.vocabulary
print(f"Saving Vocabulary in {VOCABULARY_PATH}")
BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)
if __name__ == "__main__":
ARGS = get_args(sys.argv)
train(ARGS)

View File

@ -0,0 +1,84 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]
class ProgramArgs:
def __init__(
self,
input_file: str,
output_file: str,
max_vocabulary: int,
max_iterations: int,
merge_treshold: int,
debug_after: int,
) -> None:
self.input_file = input_file
self.output_file = output_file
self.max_vocabulary = max_vocabulary
self.max_iterations = max_iterations
self.merge_treshold = merge_treshold
self.debug_after = debug_after
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)
parsed_args, _ = PARSER.parse_known_args(args)
return ProgramArgs(
parsed_args.input_file,
parsed_args.output_file,
parsed_args.max_vocabulary,
parsed_args.max_iterations,
parsed_args.merge_treshold,
parsed_args.debug_after,
) # type: ignore
def train(args: ProgramArgs):
TRAINER = BPE.NanoSocraTraineRam(
args.max_vocabulary,
TOKEN_LIST,
args.merge_treshold,
args.max_iterations,
args.debug_after
)
DATASET_PATH = Path(args.input_file)
VOCABULARY_PATH = Path(args.output_file)
print(f"Training BPE")
BPE_ENCODER = TRAINER.trainBPE(
DATASET_PATH
)
VOCABULARY = BPE_ENCODER.vocabulary
print(f"Saving Vocabulary in {VOCABULARY_PATH}")
BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)
if __name__ == "__main__":
ARGS = get_args(sys.argv)
train(ARGS)

View File

@ -0,0 +1,897 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "3zbCui3XtIGozHXTVAGRp",
"type": "rectangle",
"x": 316.5,
"y": 123,
"width": 436.5,
"height": 145.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a0",
"roundness": {
"type": 3
},
"seed": 1698427950,
"version": 35,
"versionNonce": 601575602,
"isDeleted": false,
"boundElements": [
{
"id": "wD66RDbG05HfvRhAtMb0J",
"type": "text"
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow"
}
],
"updated": 1758818588814,
"link": null,
"locked": false
},
{
"id": "wD66RDbG05HfvRhAtMb0J",
"type": "text",
"x": 480.98004150390625,
"y": 183.25,
"width": 107.5399169921875,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a1",
"roundness": null,
"seed": 910769774,
"version": 31,
"versionNonce": 1120989938,
"isDeleted": false,
"boundElements": null,
"updated": 1758818416720,
"link": null,
"locked": false,
"text": "dataset.db",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "3zbCui3XtIGozHXTVAGRp",
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "87-MeaiZGT1wln0nggYPZ",
"type": "rectangle",
"x": 339.5,
"y": 309.5,
"width": 392,
"height": 156,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a2",
"roundness": {
"type": 3
},
"seed": 655550318,
"version": 77,
"versionNonce": 1103939826,
"isDeleted": false,
"boundElements": null,
"updated": 1758818339000,
"link": null,
"locked": false
},
{
"id": "EjUxEhZqEBzwvlw0VE9eJ",
"type": "rectangle",
"x": 355.5,
"y": 327,
"width": 162,
"height": 125.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3",
"roundness": {
"type": 3
},
"seed": 1739846638,
"version": 64,
"versionNonce": 1594290034,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "ogRkV0neHrhEKTE6zlggl"
}
],
"updated": 1758818391415,
"link": null,
"locked": false
},
{
"id": "ogRkV0neHrhEKTE6zlggl",
"type": "text",
"x": 378.7100524902344,
"y": 377.25,
"width": 115.57989501953125,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3V",
"roundness": null,
"seed": 2037675630,
"version": 12,
"versionNonce": 1286472046,
"isDeleted": false,
"boundElements": null,
"updated": 1758818399222,
"link": null,
"locked": false,
"text": "RDF_String",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "EjUxEhZqEBzwvlw0VE9eJ",
"originalText": "RDF_String",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "hoIRMNiMJZl4YDo-hovWy",
"type": "rectangle",
"x": 542.5,
"y": 327,
"width": 173,
"height": 125.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a4",
"roundness": {
"type": 3
},
"seed": 1189796530,
"version": 99,
"versionNonce": 1071057006,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "rsapATFAT5YSBCXzLupgZ"
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow"
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow"
}
],
"updated": 1758818593647,
"link": null,
"locked": false
},
{
"id": "rsapATFAT5YSBCXzLupgZ",
"type": "text",
"x": 585.6800384521484,
"y": 377.25,
"width": 86.63992309570312,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a5",
"roundness": null,
"seed": 829619694,
"version": 12,
"versionNonce": 713902318,
"isDeleted": false,
"boundElements": null,
"updated": 1758818405150,
"link": null,
"locked": false,
"text": "Abstract",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "hoIRMNiMJZl4YDo-hovWy",
"originalText": "Abstract",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "jSx8ApfhtRs_nk37VvDMb",
"type": "rectangle",
"x": 316.5,
"y": 511,
"width": 436.5,
"height": 145.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a6",
"roundness": {
"type": 3
},
"seed": 492582894,
"version": 132,
"versionNonce": 893797614,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "6E23g-rgowNqHsBxX-LuM"
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow"
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow"
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow"
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow"
}
],
"updated": 1758818593647,
"link": null,
"locked": false
},
{
"id": "6E23g-rgowNqHsBxX-LuM",
"type": "text",
"x": 499.9100341796875,
"y": 571.25,
"width": 69.679931640625,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a7",
"roundness": null,
"seed": 267696178,
"version": 132,
"versionNonce": 1668243186,
"isDeleted": false,
"boundElements": null,
"updated": 1758818543211,
"link": null,
"locked": false,
"text": "Pandas",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "jSx8ApfhtRs_nk37VvDMb",
"originalText": "Pandas",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "ohj18N4AOTDz5lJNcV9gi",
"type": "rectangle",
"x": 261,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a8",
"roundness": {
"type": 3
},
"seed": 1446207150,
"version": 279,
"versionNonce": 317375026,
"isDeleted": false,
"boundElements": [
{
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
"type": "text"
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
"type": "text",
"x": 297.0800323486328,
"y": 796.5,
"width": 84.83993530273438,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a9",
"roundness": null,
"seed": 435116270,
"version": 199,
"versionNonce": 1282911218,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "train.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "ohj18N4AOTDz5lJNcV9gi",
"originalText": "train.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "A4Y54Y26fe257U_QU9lxX",
"type": "rectangle",
"x": 464,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aA",
"roundness": {
"type": 3
},
"seed": 186148850,
"version": 232,
"versionNonce": 997119858,
"isDeleted": false,
"boundElements": [
{
"id": "v4TvUlDEjH7EvPDmtbOn2",
"type": "text"
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "v4TvUlDEjH7EvPDmtbOn2",
"type": "text",
"x": 476.3500442504883,
"y": 796.5,
"width": 132.29991149902344,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": null,
"seed": 1131059634,
"version": 171,
"versionNonce": 239540530,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "validation.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "A4Y54Y26fe257U_QU9lxX",
"originalText": "validation.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "mPaYpJ9Xn7tlJPmKPqJKJ",
"type": "rectangle",
"x": 674.5,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aC",
"roundness": {
"type": 3
},
"seed": 1049323314,
"version": 235,
"versionNonce": 330560690,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "kg9nm2rpud6cax5aNPSnu"
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "kg9nm2rpud6cax5aNPSnu",
"type": "text",
"x": 711.4300231933594,
"y": 796.5,
"width": 83.13995361328125,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aD",
"roundness": null,
"seed": 522572142,
"version": 193,
"versionNonce": 1920372338,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "test.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
"originalText": "test.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 195.25,
"height": 99,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aG",
"roundness": null,
"seed": 873266098,
"version": 71,
"versionNonce": 541154738,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
49.5
],
[
-195.25,
49.5
],
[
-195.25,
99
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "ohj18N4AOTDz5lJNcV9gi",
"fixedPoint": [
0.4993630573248406,
-0.05747126436781609
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 218.25,
"height": 99,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 1210817582,
"version": 77,
"versionNonce": 1483392370,
"isDeleted": false,
"boundElements": null,
"updated": 1758818580594,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
49.5
],
[
218.25,
49.5
],
[
218.25,
99
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
"fixedPoint": [
0.4993630573248406,
-0.05747126436781609
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 0.5719232650604908,
"height": 99.07394122590165,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aK",
"roundness": null,
"seed": 1205316658,
"version": 96,
"versionNonce": 1748050674,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
-0.5719232650604908,
99.07394122590165
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "A4Y54Y26fe257U_QU9lxX",
"fixedPoint": [
0.44635717665566554,
-0.056621365219521276
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow",
"x": 539,
"y": 271.5,
"width": 0,
"height": 33.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 763990258,
"version": 17,
"versionNonce": 1028811378,
"isDeleted": false,
"boundElements": null,
"updated": 1758818588814,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
33.5
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "3zbCui3XtIGozHXTVAGRp",
"focus": -0.019473081328751418,
"gap": 3
},
"endBinding": {
"elementId": "hoIRMNiMJZl4YDo-hovWy",
"focus": -1.0404624277456647,
"gap": 30.7545797799829
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow",
"x": 536.5,
"y": 468.5,
"width": 0,
"height": 39,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1489771054,
"version": 33,
"versionNonce": 1828178606,
"isDeleted": false,
"boundElements": null,
"updated": 1758818593647,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
39
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "hoIRMNiMJZl4YDo-hovWy",
"focus": 1.0693641618497107,
"gap": 27.157190169432425
},
"endBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"focus": 0.008018327605956525,
"gap": 3.5
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}

View File

@ -0,0 +1,634 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "JNB9z-PeqZ4s8KDfWaoXe",
"type": "rectangle",
"x": 106,
"y": 27,
"width": 653,
"height": 263,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a2",
"roundness": {
"type": 3
},
"seed": 710740889,
"version": 326,
"versionNonce": 1107631703,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false
},
{
"id": "e13wNTgUpn2flMpmMttqx",
"type": "text",
"x": 200.5943407656526,
"y": 44.07937975075269,
"width": 307.2781467269385,
"height": 23.3097531902191,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3",
"roundness": null,
"seed": 1012740663,
"version": 444,
"versionNonce": 589551257,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false,
"text": "Libs/CleaningPipeline/sql_endpoint",
"fontSize": 18.64780255217528,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Libs/CleaningPipeline/sql_endpoint",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "CgxCElJkKBtIHv-5WQrbo",
"type": "text",
"x": 195,
"y": 80.44259472749451,
"width": 403.64997665852184,
"height": 186.4780255217528,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a4",
"roundness": null,
"seed": 1261951799,
"version": 507,
"versionNonce": 1922906999,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false,
"text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
"fontSize": 18.64780255217528,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"type": "line",
"version": 4979,
"versionNonce": 1473849177,
"isDeleted": false,
"id": "sYReMTdYblr-oJtYYJALU",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -67.14432426259049,
"y": 87.19293561900287,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.09201683999922,
"height": 99.49948667804088,
"seed": 1263944119,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
0.2542098813493443,
75.20117273657175
],
[
0.011896425679918422,
83.76249969444815
],
[
3.970409367559332,
87.46174320643391
],
[
17.75573317066317,
90.59250103325854
],
[
41.05683533152865,
91.56737225214069
],
[
63.319497586673116,
90.01084754868091
],
[
75.14781395923075,
86.28844687220405
],
[
76.81603792670788,
83.15042405259751
],
[
77.05033394391478,
76.25776215104557
],
[
76.86643881413028,
6.3089586511537865
],
[
76.45188016352971,
-0.2999144698665015
],
[
71.50179495549581,
-3.9936571317850627
],
[
61.077971898861186,
-6.132877429442784
],
[
37.32348754161154,
-7.932114425900202
],
[
18.278415656797975,
-6.859225353587373
],
[
3.2995959613238286,
-3.2201165291205287
],
[
-0.04168289608444441,
-0.045185660461322996
],
[
0,
0
]
],
"index": "a6",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2684,
"versionNonce": 952947769,
"isDeleted": false,
"id": "0S6dEWQVqKUVkP6Z5IX1l",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -66.6203948243155,
"y": 144.31921927673278,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 817033943,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a7",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2770,
"versionNonce": 477619481,
"isDeleted": false,
"id": "szGLND7J0nVOvRkNXX9AS",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -67.65225214681931,
"y": 115.35516394150972,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 1704755191,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a8",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 5767,
"versionNonce": 2119031289,
"isDeleted": false,
"id": "O3t2uGktJlDd1_OX_bpV4",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -68.71020112890136,
"y": 80.06066699332126,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 76.59753601865496,
"height": 15.49127539284798,
"seed": 471296279,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [
"bxuMGTzXLn7H-uBCptINx"
],
"index": "a9",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1177,
"versionNonce": 525480665,
"isDeleted": false,
"id": "_SzKlOBOvJgBg7FX0JTTM",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -32.218214023678854,
"y": 104.53733467322485,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1368927799,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aA",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1465,
"versionNonce": 1410887609,
"isDeleted": false,
"id": "oJMl2Kxa3SPaiAY0kxo7A",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -31.867072239745255,
"y": 130.75394896028996,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1627606871,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aB",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1348,
"versionNonce": 314839193,
"isDeleted": false,
"id": "fB6pJBSMA-pRHrpgYKaLL",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 6.239590202363168,
"x": -31.218214023678854,
"y": 159.52267553159635,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1420643447,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aC",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "text",
"version": 846,
"versionNonce": 1091081593,
"isDeleted": false,
"id": "9gZ3Yy1MeP9kEOTLODqLG",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -76.81018163712321,
"y": 181.11281713043917,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 95.63072204589844,
"height": 23.595161071904883,
"seed": 2019206551,
"groupIds": [
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"fontSize": 17.4778970902999,
"fontFamily": 1,
"text": "dataset.db",
"baseline": 16.595161071904883,
"textAlign": "center",
"verticalAlign": "top",
"index": "aD",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false,
"containerId": null,
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.350000000000001
},
{
"id": "3eOw20xMhpB5jf_RMG24P",
"type": "text",
"x": 1131.3333333333335,
"y": 31.333333333333428,
"width": 508.3333333333333,
"height": 550,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aE",
"roundness": null,
"seed": 1535658041,
"version": 821,
"versionNonce": 1630266809,
"isDeleted": false,
"boundElements": null,
"updated": 1759157181677,
"link": null,
"locked": false,
"text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
"autoResize": false,
"lineHeight": 1.25
},
{
"id": "Fbl1gpb5r7QrdRauGUWm2",
"type": "text",
"x": 158.23809523809535,
"y": 502.52380952380935,
"width": 484.2857142857143,
"height": 500,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aF",
"roundness": null,
"seed": 2066618807,
"version": 552,
"versionNonce": 1269344823,
"isDeleted": false,
"boundElements": null,
"updated": 1759158199532,
"link": null,
"locked": false,
"text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
"autoResize": false,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}
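
The two text blocks in the drawing above sketch a class diagram: a PipelineApplier holding frequency filters over triples, and a Pipeline wiring a SQL endpoint to per-task file handlers. Below is a minimal, hypothetical Python skeleton of that structure; the class and method names come from the diagram text, but every signature, column name, and body is an assumption for illustration, not the repository's actual implementation.

```python
# Hypothetical skeleton of the classes shown in the diagram; names follow the
# drawing, while signatures, column names, and bodies are illustrative guesses.
import pandas as pd


class PipelineApplier:
    def __init__(self) -> None:
        self.movie_frequency_filter: pd.DataFrame | None = None
        self.rel_frequency_filter: pd.DataFrame | None = None
        self.rel_banned_list: list[str] = []

    def generate_movie_frequency_filter(self, triples: pd.DataFrame,
                                        min_count: int = 5) -> None:
        # Keep only movies that occur at least `min_count` times (assumed threshold
        # and assumed "movie_id" column name).
        counts = triples["movie_id"].value_counts()
        self.movie_frequency_filter = counts[counts >= min_count].to_frame("count")

    def filter_by_movie_frequency(self, triples: pd.DataFrame) -> pd.DataFrame:
        if self.movie_frequency_filter is None:
            return triples
        return triples[triples["movie_id"].isin(self.movie_frequency_filter.index)]


class Pipeline:
    def __init__(self, sql_endpoint, filter_applier: PipelineApplier) -> None:
        self.sql_endpoint = sql_endpoint            # assumed SqlEndpoint-like object
        self.filter_applier = filter_applier

    def _get_cleaned_movie_rows(self) -> pd.DataFrame:
        # Assumed: the endpoint exposes some query method returning a DataFrame.
        return self.sql_endpoint.query("SELECT * FROM triples")

    def execute_all_task(self) -> None:
        rows = self._get_cleaned_movie_rows()
        filtered = self.filter_applier.filter_by_movie_frequency(rows)
        # ... hand `filtered` to the per-task file handlers (omitted here).
```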

22
docs/BPE.md Normal file
View File

@ -0,0 +1,22 @@
# BPE
## Research Material
- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
- [Implementing a byte pair encoding (BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
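
For orientation alongside these references, below is a minimal, illustrative sketch of the core BPE training loop: count adjacent symbol pairs, merge the most frequent pair, repeat. It follows the textbook formulation rather than any particular library's tokenizer; the toy corpus and merge count are made up for the example.

```python
from collections import Counter


def learn_bpe_merges(word_freqs: dict[str, int], num_merges: int) -> list[tuple[str, str]]:
    """Learn BPE merge rules from a toy {word: frequency} corpus."""
    # Start from single-character symbols for every word.
    vocab = {tuple(word): freq for word, freq in word_freqs.items()}
    merges: list[tuple[str, str]] = []

    for _ in range(num_merges):
        # Count adjacent symbol pairs, weighted by word frequency.
        pair_counts: Counter = Counter()
        for symbols, freq in vocab.items():
            for pair in zip(symbols, symbols[1:]):
                pair_counts[pair] += freq
        if not pair_counts:
            break

        best = max(pair_counts, key=pair_counts.get)
        merges.append(best)
        merged_symbol = best[0] + best[1]

        # Rewrite every word, replacing each occurrence of the best pair.
        new_vocab: dict[tuple[str, ...], int] = {}
        for symbols, freq in vocab.items():
            out, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    out.append(merged_symbol)
                    i += 2
                else:
                    out.append(symbols[i])
                    i += 1
            key = tuple(out)
            new_vocab[key] = new_vocab.get(key, 0) + freq
        vocab = new_vocab

    return merges


print(learn_bpe_merges({"lower": 5, "lowest": 2, "newer": 6, "wider": 3}, num_merges=5))
```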

57
docs/PAPERS.md Normal file
View File

@ -0,0 +1,57 @@
# Research Material
## BPE
- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
- [Implementing a byte pair encoding (BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
## Embedder
- [ROFORMER: ENHANCED TRANSFORMER WITH ROTARY POSITION EMBEDDING](https://arxiv.org/pdf/2104.09864)
- [You could have designed state of the art positional encoding](https://huggingface.co/blog/designing-positional-encoding)
- [Rotary Embeddings: A Relative Revolution](https://blog.eleuther.ai/rotary-embeddings/)
- [Round and Round We Go! What makes Rotary Positional Encodings useful?](https://arxiv.org/html/2410.06205v1)
- [Inside RoPE: Rotary Magic into Position Embeddings](https://learnopencv.com/rope-position-embeddings/)
- [What Rotary Position Embedding Can Tell Us: Identifying Query and Key Weights Corresponding to Basic Syntactic or High-level Semantic Information](https://openreview.net/pdf?id=e5Mv7iWfVW)
- [A gentle introduction to Rotary Position Embedding](https://krasserm.github.io/2022/12/13/rotary-position-embedding/)
- [Context-aware Rotary Position Embedding](https://arxiv.org/pdf/2507.23083)
- [LIERE: GENERALIZING ROTARY POSITION ENCODINGS TO HIGHER DIMENSIONAL INPUTS](https://openreview.net/pdf?id=xHMMt7r3GW)
- [Rotary Positional Embeddings (RoPE)](https://nn.labml.ai/transformers/rope/index.html)
- [Decoding Llama3: An explainer for tinkerers](https://hasgeek.com/simrathanspal/the-llama3-guide/sub/decoding-llama3-part-4-rotary-positional-embedding-3K8ZHpdLi6E56N8ejnaWzm)
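
As a quick companion to the RoPE references above, here is a short, illustrative NumPy sketch of the core operation: each pair of embedding dimensions is rotated by a position-dependent angle, so query and key dot products come to depend on relative offsets. The frequency schedule (base 10000) follows the RoFormer paper; the split-halves pairing convention and the shapes are assumptions chosen for clarity, not a specific library's layout.

```python
import numpy as np


def apply_rope(x: np.ndarray, base: float = 10000.0) -> np.ndarray:
    """Rotate pairs of dimensions of x (shape: seq_len x dim, dim even) by position."""
    seq_len, dim = x.shape
    half = dim // 2
    # One rotation frequency per dimension pair, as in the RoFormer paper.
    inv_freq = base ** (-np.arange(half) / half)
    angles = np.outer(np.arange(seq_len), inv_freq)      # (seq_len, half)
    cos, sin = np.cos(angles), np.sin(angles)
    x1, x2 = x[:, :half], x[:, half:]                    # split-halves pairing convention
    return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)


q = np.random.randn(8, 64)
k = np.random.randn(8, 64)
scores = apply_rope(q) @ apply_rope(k).T   # logits now depend on relative positions
```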
## Attention
- [Standard Self-Attention (Attention is all you need)](https://arxiv.org/pdf/1706.03762)
- [TransMLA: Multi-Head Latent Attention Is All You Need](https://arxiv.org/pdf/2502.07864)
- [A Gentle Introduction to Multi-Head Latent Attention (MLA)](https://machinelearningmastery.com/a-gentle-introduction-to-multi-head-latent-attention-mla/)
- [Understanding Multi-Head Latent Attention](https://planetbanatt.net/articles/mla.html)
- [DeepSeek's Multi-Head Latent Attention](https://liorsinai.github.io/machine-learning/2025/02/22/mla.html)
- [MatchFormer: Interleaving Attention in Transformers for Feature Matching](https://arxiv.org/pdf/2203.09645)
- [FIT: Far-reaching Interleaved Transformers](https://arxiv.org/pdf/2305.12689)
- [Gemma explained: What's new in Gemma 3](https://developers.googleblog.com/en/gemma-explained-whats-new-in-gemma-3/)
- [The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)
- [Attention was never enough: Tracing the rise of hybrid LLMs](https://www.ai21.com/blog/rise-of-hybrid-llms/)
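
Next to the attention papers above, here is a minimal NumPy sketch of standard scaled dot-product attention from "Attention Is All You Need"; the latent-attention and interleaving variants in the other links build on this same primitive. The shapes and the boolean mask convention are illustrative assumptions.

```python
import numpy as np


def scaled_dot_product_attention(q, k, v, mask=None):
    """q: (n, d_k), k: (m, d_k), v: (m, d_v) -> output of shape (n, d_v)."""
    d_k = q.shape[-1]
    scores = q @ k.T / np.sqrt(d_k)               # similarity of each query to each key
    if mask is not None:                          # mask: True where attention is allowed
        scores = np.where(mask, scores, -1e9)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights = weights / weights.sum(axis=-1, keepdims=True)   # row-wise softmax
    return weights @ v


q, k, v = np.random.randn(4, 8), np.random.randn(6, 8), np.random.randn(6, 16)
out = scaled_dot_product_attention(q, k, v)       # shape (4, 16)
```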
## Spanned Masking
- [Salient Span Masking for Temporal Understanding](https://arxiv.org/pdf/2303.12860)
- [PMI-MASKING: PRINCIPLED MASKING OF CORRELATED SPANS](https://arxiv.org/pdf/2010.01825)
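
A small illustrative sketch of span masking (contiguous runs of tokens collapsed into mask tokens), in the spirit of the span-masking papers above; the mask rate, span-length cap, and [MASK] symbol are assumptions for the example, not values taken from these papers.

```python
import random


def mask_spans(tokens: list[str], mask_rate: float = 0.15,
               max_span: int = 3, mask_token: str = "[MASK]") -> list[str]:
    """Collapse random contiguous spans of tokens into single mask tokens (toy sketch)."""
    out, i = [], 0
    budget = max(1, int(len(tokens) * mask_rate))   # rough number of tokens to mask
    while i < len(tokens):
        if budget > 0 and random.random() < mask_rate:
            span = min(random.randint(1, max_span), budget, len(tokens) - i)
            out.append(mask_token)                  # the whole span becomes one mask
            i += span
            budget -= span
        else:
            out.append(tokens[i])
            i += 1
    return out


print(mask_spans("the movie was directed by someone famous in 1994".split()))
```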
## Models
- [What Language Model Architecture and Pretraining Objective Work Best for Zero-Shot Generalization?](https://arxiv.org/pdf/2204.05832)

View File

@ -15,3 +15,4 @@ tzdata==2025.2
urllib3==2.5.0 urllib3==2.5.0
wheel==0.45.1 wheel==0.45.1
Wikipedia-API==0.8.1 Wikipedia-API==0.8.1
SQLAlchemy