Compare commits

113 commits: dev.splitt...dev.bpe
| Author | SHA1 | Date |
|---|---|---|
|  | 1d23b9cc8b |  |
|  | 165290162c |  |
|  | 502016f843 |  |
|  | 845c63dbef |  |
|  | bbadd4c521 |  |
|  | c2f9344c82 |  |
|  | 25f3a5d221 |  |
|  | 149deb407d |  |
|  | 8a21cb1b73 |  |
|  | d2a3dfe90f |  |
|  | 0f95aeb122 |  |
|  | 0ee6e48004 |  |
|  | 55e0d2ac23 |  |
|  | 9c5f42153f |  |
|  | c74689d01d |  |
|  | 51f491d033 |  |
|  | c5c0c61f79 |  |
|  | 6b9cb7cd35 |  |
|  | e8894504c6 |  |
|  | 845d645348 |  |
|  | 09f7b39512 |  |
|  | 070dc1b744 |  |
|  | 8121c75a09 |  |
|  | a5b8692a77 |  |
|  | 7c935d2700 |  |
|  | a1d143187d |  |
|  | 0eef2148a9 |  |
|  | 856bd8909c |  |
|  | 2e595a3a23 |  |
|  | 2194cc7b4f |  |
|  | 1eae8582b2 |  |
|  | eadba1fb82 |  |
|  | aa765b4555 |  |
|  | 17d82f0a4e |  |
|  | 0975c19e69 |  |
|  | 3fe4e45ceb |  |
|  | d19426fa62 |  |
|  | 63baf29805 |  |
|  | b80b4e4112 |  |
|  | 7cfaf601b4 |  |
|  | fbbe6226bb |  |
|  | b3d444979f |  |
|  | 66bcf6e55f |  |
|  | dbf1d99408 |  |
|  | 97bac464f3 |  |
|  | 9a8e726d74 |  |
|  | 7ab9b0358e |  |
|  | 30c2938d29 |  |
|  | 76f24d4eb0 |  |
|  | 89a0a1f4bb |  |
|  | ccacea18d8 |  |
|  | b09bd4acba |  |
|  | c9032cab09 |  |
|  | 7020c9e683 |  |
|  | 2fe1ce9e9a |  |
|  | 18fc2ba9d8 |  |
|  | 5acee1d1a5 |  |
|  | 2e36753da4 |  |
|  | 007f1e9554 |  |
|  | c319398ca0 |  |
|  | 255d8a072d |  |
|  | 8167c9d435 |  |
|  | bd72ad3571 |  |
|  | 6ddb7de9da |  |
|  | 564b0d712e |  |
|  | e433941405 |  |
|  | b46df4f91a |  |
|  | d179e01971 |  |
|  | b071145f6e |  |
|  | ed0255e99b |  |
|  | 3e8b5c5579 |  |
|  | 8db35732f9 |  |
|  | 9552d61f8d |  |
|  | be8a87ce01 |  |
|  | 5801a819e9 |  |
|  | 3f48b5c428 |  |
|  | 9972ab8a51 |  |
|  | 650b37c586 |  |
|  | 90012285b5 |  |
|  | 1bbb4a0999 |  |
|  | e521b0704e |  |
|  | ee0aa583d5 |  |
|  | 0a698e9837 |  |
|  | 9440a562f2 |  |
|  | 5eda131aac |  |
|  | 57884eaf2e |  |
|  | 4548a683c2 |  |
|  | 3eec49ffa5 |  |
|  | 0bc7f4b227 |  |
|  | f28952b0a2 |  |
|  | 0b626a8e09 |  |
|  | b254098532 |  |
|  | ee88ffe4cf |  |
|  | 70b4bd8645 |  |
|  | 6316d2bfc4 |  |
|  | 87ca748f45 |  |
|  | 4315d70109 |  |
|  | 9a5d633b5e |  |
|  | a6760cd52d |  |
|  | a7eb92227d |  |
|  | 9f221e31cd |  |
|  | 47197194d5 |  |
|  | 0cdbf6f624 |  |
|  | 3e30489f86 |  |
|  | 8a22e453e4 |  |
|  | 7feb4eb857 |  |
|  | 70af19d356 |  |
|  | a4b44ab2ee |  |
|  | 74b6b609dd |  |
|  | 59796c37cb |  |
|  | f696f5950b |  |
|  | 605b496da7 |  |
|  | 7d693964dd |  |
.gitignore (vendored, 1 line added)

```diff
@@ -254,4 +254,5 @@ $RECYCLE.BIN/
 # ---> Custom
 **/Tmp/**
+**/cache/**
 !**/.gitkeep
```
.vscode/launch.json (vendored, new file, 16 lines)

```jsonc
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File with Arguments",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}"
        }
    ]
}
```
.vscode/settings.json (vendored, new file, 55 lines)

```jsonc
{
    // Always treat the project root as the working dir for Jupyter
    "jupyter.notebookFileRoot": "${workspaceFolder}",

    // When you click "Run Python File in Terminal", DON'T cd into the file's folder
    "python.terminal.executeInFileDir": false,

    // Start new integrated terminals at the project root
    "terminal.integrated.cwd": "${workspaceFolder}",

    // Make pytest run from the root without needing a pytest.ini
    "python.testing.pytestEnabled": true,
    "python.testing.cwd": "${workspaceFolder}",
    "python.testing.pytestArgs": [
        "src/test"
    ],

    // Help Pylance resolve imports like `from src...` without red squiggles
    "python.analysis.extraPaths": [
        "${workspaceFolder}"
    ],

    // For linux
    "terminal.integrated.env.linux": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    // For OSX
    "terminal.integrated.env.osx": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    // For Windows
    "terminal.integrated.env.windows": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    "python.analysis.typeCheckingMode": "standard"
}
// {
//     // Always treat the project root as the working dir for Jupyter
//     "jupyter.notebookFileRoot": "${workspaceFolder}",
//
//     // When you click "Run Python File in Terminal", DON'T cd into the file's folder
//     "python.terminal.executeInFileDir": false,
//
//     // Start new integrated terminals at the project root
//     "terminal.integrated.cwd": "${workspaceFolder}",
//
//     // Ensure Python can import from the project root no matter which file you run
//     // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
//     "terminal.integrated.env.windows": {
//         "PYTHONPATH": "${workspaceFolder}"
//     },
//
//     // Make pytest run from the root without needing a pytest.ini
//     "python.testing.pytestEnabled": true,
//     "python.testing.cwd": "${workspaceFolder}",
//     "python.testing.pytestArgs": ["src/test"],
//
//     // Help Pylance resolve imports like `from src...` without red squiggles
//     "python.analysis.extraPaths": ["${workspaceFolder}"]
// }
```
BIN Assets/Dataset/1-hop/uri-abbreviations.csv (LFS, new file): binary file not shown
BIN Assets/Model/toy_10/README.md (LFS, new file): binary file not shown
BIN Assets/Model/toy_10/toy_dictionary.json (LFS, new file): binary file not shown
Project_Model/Libs/BPE/Classes/Encoder.py (new file, 4 lines)

```python
from abc import ABC


class Encoder(ABC):
    pass
```
Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py (new file, 164 lines)

```python
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


class NanoSocraTraineRam:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        current_iteration = 0
        data = self.__gather_data_from_file(path)

        while not exit:

            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            _, data, last_memory = self.__round_train(BPE, data)

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",  # type: ignore (pretty sure it's not None)
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        DATA_LEN = len(data)
        NEW_DATA = []

        counter = 0
        memory = NanoSocratesBatchMemoryBPE({}, 0)
        while len(data) > 0:
            counter += 1
            last_batch = len(data) == 1

            piece = data.pop()

            bpe, memory, output = bpe.fit(piece, memory, last_batch)

            if counter % int(1E6) == 0:
                print(f"Fitted: {counter}/{DATA_LEN}")

            if len(output) < 2:
                continue

            NEW_DATA.append(output)

        return (bpe, NEW_DATA, memory)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:

        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        DATA: list[list[int]] = []

        FILE = open(path, "r", encoding="utf-8")
        file_string = FILE.read()
        FILE.close()

        for piece, type in SPLITTER.split_text(file_string):

            if type != TokenType.BPE:
                continue

            int_list = self.__make_list_ids(piece)
            DATA.append(int_list)

        return DATA

    def __increment_counter(self, counter: int):

        # What if it overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str):
        return list(corpus.encode("utf-8"))
```
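For orientation, a minimal usage sketch of the in-RAM trainer defined above. The corpus path and the token list are placeholders, not files from this diff:

```python
from pathlib import Path
import Project_Model.Libs.BPE as BPE

CORPUS = Path("Assets/Dataset/corpus.txt")  # hypothetical corpus file

trainer = BPE.NanoSocraTraineRam(
    max_vocabulary=int(32e3),               # byte and special-token IDs are reserved internally
    special_vocabulary=["<SOT>", "<EOT>"],
    max_iterations=100,                     # safety stop even if the vocabulary is not full
)

encoder = trainer.trainBPE(CORPUS)          # returns a fitted NanoSocratesBPE
print(encoder.vocabulary_size)
```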
Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py (new file, 248 lines)

```python
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


class NanoSocraTrainer:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        chunk_size: int,
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__chunk_size = chunk_size
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        cache_dir: Path,
        bpe: NanoSocratesBPE | None = None,
        resume_from_iter: int = 0,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if not cache_dir.is_dir():
            raise NotADirectoryError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        cached = False
        current_iteration = 0
        input_path = path

        NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0

        PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
        MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)

        if resume_from_iter != 0:
            cached = True
            current_iteration = resume_from_iter
            input_path = next(PATH_GEN)
            # UGLY: fixes a bug immediately, unfortunately
            _, _ = next(MEMORY_PATH_GEN)
            _, voc_cache_path = next(MEMORY_PATH_GEN)
            vocabulary = load_nanos_vocabulary(voc_cache_path)
            BPE = NanoSocratesBPE(vocabulary)

        while not exit:

            out_path = next(PATH_GEN)
            internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)

            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            FILE = open(out_path, "w")

            last_memory = None

            for _, memory, output in self.__round_train(input_path, BPE, cached):
                last_memory = memory
                FILE.write(output)

            FILE.close()

            internal_cache = {
                "finished_iter": current_iteration,
                "read_from": f"{input_path}",
                "wrote_to": f"{out_path}",
                "at": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S.%f"
                )[:-3],
            }

            VOCABULARY = BPE.vocabulary

            save_json(internal_cache, internal_cache_path)
            save_nanos_vocabulary(VOCABULARY, vocabulary_cache)

            cached = True
            input_path = out_path

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",  # type: ignore (pretty sure it's not None)
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):

        CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        BPE = bpe
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))

        for chunk, last_chunk in CHUNKER_GENERATOR:

            PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))

            for piece, last_piece in PIECE_GENERATOR:

                LAST_BATCH = last_chunk and last_piece
                PIECE, TOKEN_TYPE = piece

                if TOKEN_TYPE != TokenType.BPE:
                    _, _, out = BPE.fit([], memory, LAST_BATCH)
                    yield (BPE, memory, PIECE)
                    continue

                PIECE_DATA = self.__make_list_ids(PIECE, cached)

                _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)

                OUT_STRING = f"{out}"
                yield (BPE, memory, OUT_STRING)

    def __increment_counter(self, counter: int):

        # What if it overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str, cached: bool):

        if not cached:
            return list(corpus.encode("utf-8"))

        REDUCED_CORPUS_LEN = len(corpus) - 1

        # Skip the surrounding "[" and "]" chars
        INTS = corpus[1:REDUCED_CORPUS_LEN]
        INT_LIST = list(map(int, INTS.split(",")))
        return INT_LIST

    def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):

        CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
        CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"

        switch = True

        if initial_iteration % 2 == 1:
            switch = False

        del initial_iteration

        while True:
            if switch:
                yield CORPUS_TMP_1
            else:
                yield CORPUS_TMP_2
            switch = not switch

    def __switch_memory(self, cache_path: Path, initial_iteration: int):

        INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
        INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"

        VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
        VOCAB_TMP_2 = cache_path / "voc-tmp2.json"

        switch = False

        if initial_iteration % 2 == 1:
            switch = True

        del initial_iteration

        while True:
            if switch:
                yield (INTERNAL_TMP_1, VOCAB_TMP_1)
            else:
                yield (INTERNAL_TMP_2, VOCAB_TMP_2)
            switch = not switch
```
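The disk-backed variant streams each iteration through a pair of alternating cache files, so a long run can be resumed. A sketch, again with placeholder paths; cache_dir must already exist or trainBPE raises NotADirectoryError:

```python
from pathlib import Path
import Project_Model.Libs.BPE as BPE

CORPUS = Path("Assets/Dataset/corpus.txt")  # hypothetical paths
CACHE_DIR = Path("Assets/Tmp/bpe-cache")

trainer = BPE.NanoSocraTrainer(
    max_vocabulary=int(32e3),
    special_vocabulary=["<SOT>", "<EOT>"],
    chunk_size=int(1e6),                    # characters per chunker read
    max_iterations=50,
)

# Writes corpus-tmp*.txt, voc-tmp*.json and internal-tmp*.json into CACHE_DIR;
# a crashed run can restart from the last finished iteration.
encoder = trainer.trainBPE(CORPUS, CACHE_DIR, resume_from_iter=0)
```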
Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py (new file, 280 lines)

```python
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):

    bpe, data = object

    NEW_DATA: list[list[int]] = []

    memory = NanoSocratesBatchMemoryBPE({}, 0)

    while len(data) > 0:

        piece = data.pop()

        bpe, memory, output = bpe.fit(piece, memory, False)

        if len(output) < 2:
            continue

        # We are sure of its type
        NEW_DATA.append(piece)  # type: ignore

    return (bpe, NEW_DATA, memory)


def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):

    bpe, data = object

    NEW_DATA: list[list[int]] = []

    for index, piece in zip(range(0, len(data)), data):
        output = bpe.encode_intermediate(piece)

        if len(output) < 2:
            continue

        # We are sure of its type
        NEW_DATA.append(data[index])  # type: ignore

    return NEW_DATA


class NanoSocraTrainerPool:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    # TODO: add a resume function
    def trainBPE(
        self,
        path: Path,
        cache_file: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if not cache_file.is_file():
            file = cache_file.open("w")
            file.close()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        current_iteration = 0
        data = self.__gather_data_from_file(path)
        data = self.__encode_from_cache(BPE, data)

        while not exit:

            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            start = time.time_ns()
            _, data, last_memory = self.__round_train(BPE, data)
            end = time.time_ns()
            NEW_VOC_SIZE = BPE.vocabulary_size

            VOCABULARY = BPE.vocabulary

            save_nanos_vocabulary(VOCABULARY, cache_file)

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
                        f"\tTime elapsed: {(end - start)/1E9}s",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        NEW_DATA: list[list[int]] = []

        MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        fit_funct = split_fit
        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise Exception()

        VOCABULARY = bpe.vocabulary

        data_chunks = split(data, CPU_COUNT)
        JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]

        JOB_RESULTS: list[
            tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
        ]

        with Pool() as pool:
            JOB_RESULTS = pool.map(fit_funct, JOBS)

        for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
            _, job_output, job_memory = res
            NEW_DATA.extend(job_output)

            for key, value in job_memory.frequencies.items():
                frequency = MEMORY.frequencies.get(key)

                if frequency is None:
                    frequency = 0
                    MEMORY.frequencies[key] = 0

                frequency += value
                MEMORY.frequencies[key] = frequency

            del job_output
            del job_memory

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        # Get new token
        bpe.fit([], MEMORY, True)

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return (bpe, NEW_DATA, MEMORY)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:

        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        DATA: list[list[int]] = []

        FILE = open(path, "r", encoding="utf-8")
        file_string = FILE.read()
        FILE.close()

        for piece, type in SPLITTER.split_text(file_string):

            if type != TokenType.BPE:
                continue

            int_list = self.__make_list_ids(piece)
            DATA.append(int_list)

        return DATA

    def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        NEW_DATA: list[list[int]] = []

        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise Exception()

        VOCABULARY = bpe.vocabulary

        data_chunks = split(data, CPU_COUNT)
        JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]

        JOB_RESULTS: list[list[list[int]]]

        with Pool() as pool:
            JOB_RESULTS = pool.map(split_encode, JOBS)

        for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
            job_output = res
            NEW_DATA.extend(job_output)

            del job_output

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return NEW_DATA

    def __increment_counter(self, counter: int):

        # What if it overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str):
        return list(corpus.encode("utf-8"))
```
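The module-level `split` helper above is what divides the corpus across worker processes: it yields `n` contiguous slices whose lengths differ by at most one. A standalone check of that behavior (not repository code):

```python
def split(a, n):
    # same definition as in NanoSocraTrainerPool.py
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))

chunks = list(split(list(range(10)), 3))
print(chunks)                    # [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
print([len(c) for c in chunks])  # [4, 3, 3]
```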
Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py (new file, 219 lines)

```python
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException


# ABOUT THE DICTIONARY:
# the string is converted into UTF-8 bytes, that is: each char is represented by 1 to 4 bytes.
# each byte is cast to an integer, such that an integer lower than 256
# represents a raw UTF-8 byte, and anything else is a token ID.
class NanoSocratesBatchMemoryBPE:
    """Memory for batch training. Keeps token-couple frequencies and the merge_treshold"""

    def __init__(
        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
    ) -> None:

        self.frequencies = frequencies
        self.merge_treshold = merge_treshold


class NanoSocratesBPE(Encoder):

    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
        super().__init__()

        self.__vocabulary: dict[tuple[int, int], int] = {}
        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}

        if vocabulary is None:
            return

        for key, value in vocabulary.items():
            if value < 256:
                raise OutOfDictionaryException()
            # values under 256 are used for unpaired chars
            # TODO: check if they are in order
            self.__vocabulary[key] = value
            self.__reverse_vocabulary[value] = key

    @property
    def vocabulary_size(self):
        return len(self.__vocabulary) + 256

    @property
    def vocabulary(self):
        return self.__vocabulary

    @property
    def __next_id(self) -> int:
        """
        Gets the next free token ID

        Returns:
            int:
        """
        return self.vocabulary_size

    # TODO: implement fit
    def fit(
        self,
        chunk_data: list[int],
        memory: NanoSocratesBatchMemoryBPE,
        last_batch: bool,
    ):

        ENCODED_CHUNK = self.encode_intermediate(chunk_data)
        DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1

        # update the frequency of each couple of elements
        for i in range(0, DATA_LEN_BEFORE_LAST):
            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])

            frequency = memory.frequencies.get(CANDIDATE_COUPLE)

            # Initialize frequency
            if frequency is None:
                frequency = 0
                memory.frequencies[CANDIDATE_COUPLE] = 0

            frequency += 1
            memory.frequencies[CANDIDATE_COUPLE] = frequency

        if not last_batch:
            return (self, memory, ENCODED_CHUNK)

        if len(memory.frequencies) < 1:
            return (self, memory, ENCODED_CHUNK)

        FREQUENCIES = memory.frequencies
        MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
        FREQUENCY = FREQUENCIES[MAX_COUPLE]

        if FREQUENCY < memory.merge_treshold:
            return (self, memory, ENCODED_CHUNK)

        self.__learn_word(MAX_COUPLE)

        return (self, memory, ENCODED_CHUNK)

    def encode(self, piece: str) -> list[int]:
        """Encode a string into token IDs: first convert it to UTF-8 bytes, then pass the list of integers to encode_intermediate()

        Args:
            piece (str):

        Returns:
            list[int]:
        """
        converted_piece = list(piece.encode("utf-8"))
        return self.encode_intermediate(converted_piece)

    def encode_intermediate(self, piece: list[int]) -> list[int]:
        """Encode a piece (as a list of integers) until no more merges apply

        Args:
            piece (list[int]): piece to encode

        Returns:
            list[int]: piece encoded
        """
        current_piece = piece
        new_piece = self.__round_encode(current_piece)

        # keep encoding while each round still shrinks the piece
        while len(current_piece) != len(new_piece):
            current_piece = new_piece
            new_piece = self.__round_encode(current_piece)

        return current_piece

    def __round_encode(self, piece: list[int]):
        """A single encoding round that traverses the whole object. Multiple rounds are needed for a full encode: \n
        1) "ABAB" -> "XX"
        2) "XX" -> "Y"
        Args:
            piece (list[int]): the object to encode as a list of integers

        Returns:
            (list[int]): the once-encoded object
        """

        if len(piece) == 1:
            return piece

        PIECE_LENGTH = len(piece) - 1
        NEW_PIECE: list[int] = []

        index = 0
        while index < PIECE_LENGTH:

            CANDIDATE_WORD = (
                piece[index],
                piece[index + 1],
            )  # take a tuple of consecutive elements [int]
            CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)

            # if there is no token to substitute the tuple, append the first element
            if CANDIDATE_TOKEN is None:
                NEW_PIECE.append(piece[index])
                index += 1

                # if the latter element of the tuple is the last element of the piece, append it
                if index == PIECE_LENGTH:
                    NEW_PIECE.append(piece[index])

                continue

            # in this case there was a candidate token to substitute the couple of elements
            NEW_PIECE.append(CANDIDATE_TOKEN)

            index += 2

            if index == PIECE_LENGTH:
                NEW_PIECE.append(piece[index])

        return NEW_PIECE

    # TODO: Remake decode to take a list of token IDs
    def decode(self, token_ids: list[int]) -> str:

        # deque: double-ended queue
        token_stack: deque[int] = deque(token_ids)
        UTF_8_STRING_ARR: bytearray = bytearray()

        while len(token_stack) > 0:
            TOKEN_ID = token_stack.popleft()

            if TOKEN_ID < 256:
                UTF_8_STRING_ARR.append(TOKEN_ID)
                continue

            left_token, right_token = self.__token_decode(TOKEN_ID)

            token_stack.appendleft(right_token)
            token_stack.appendleft(left_token)

        return UTF_8_STRING_ARR.decode("utf-8")

    def __token_decode(self, token_id: int) -> tuple[int, int]:

        CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)

        if CANDIDATE_DECODED is None:
            raise OutOfDictionaryException()

        return CANDIDATE_DECODED

    def __learn_word(self, words: tuple[int, int]):
        """add a new pair to the vocabulary

        Args:
            words (tuple[int, int]): the pair of elements to substitute with a new token ID

        Raises:
            DuplicateWordException: raised if the pair already has a token ID in the dictionary
        """
        ID = self.__next_id

        DUPLICATE = self.__vocabulary.get(words)

        if DUPLICATE is not None:
            raise DuplicateWordException()

        self.__vocabulary[words] = ID
        self.__reverse_vocabulary[ID] = words
```
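The dictionary convention described in the header comment (IDs below 256 are raw UTF-8 bytes, everything above is a merged pair) shows up nicely with the toy vocabulary the tests use further down; a small round-trip sketch:

```python
import Project_Model.Libs.BPE as BPE

# (97, 98) is ("a", "b"); each later entry merges two earlier IDs.
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
encoder = BPE.NanoSocratesBPE(VOCABULARY)

print(encoder.encode("abababab"))  # [258]: ab ab ab ab -> 256 256 256 256 -> 257 257 -> 258
print(encoder.decode([258]))       # "abababab"
print(encoder.vocabulary_size)     # 259 (256 byte IDs + 3 merges)
```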
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py (new file, 70 lines)

```python
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException


class NanoSocratesChunker:

    def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
        self.__max_size: int = max_size
        self.__special_token_regex: re.Pattern = special_token_regex
        self.__residual: str = ""

    # max theoretical size of chars
    # between special tokens:
    # - min: size - len(longest_token)
    # - MAX: size - len(shortest_token)
    def chunk(self, file_path: Path):
        # read file
        FILE = open(file_path, "r", encoding="utf-8")
        exit = False

        while not exit:
            REMAINING_SIZE = self.__max_size - len(self.__residual)
            READ_SIZE = min(self.__max_size, REMAINING_SIZE)
            FILE_CHUNK = FILE.read(READ_SIZE)

            if len(FILE_CHUNK) == 0:
                exit = True
                continue

            CHUNK = self.__append_residuals(FILE_CHUNK)

            boundaries = self.__identify_boudaries(CHUNK)

            if boundaries is None:

                # boundaries not found in 2 chunks
                if len(CHUNK) > self.__max_size - 1:
                    raise DelimiterNotFoundException()

                if exit:
                    yield CHUNK

                self.__set_residual(0, CHUNK)
                continue

            start, end = boundaries
            self.__set_residual(end, CHUNK)
            yield CHUNK[start:end]

    def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:

        end = 0

        for match in self.__special_token_regex.finditer(corpus):
            # print(match)
            end = match.end()

        if end == 0:
            return None

        return (0, end)

    def __append_residuals(self, corpus: str) -> str:
        RESIDUAL = self.__residual
        self.__residual = ""
        return RESIDUAL + corpus

    def __set_residual(self, index: int, corpus: str):
        self.__residual = corpus[index:]
```
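A sketch of how the chunker is driven (hypothetical file path). Each yielded chunk ends right after the last special-token match found in the buffer, so no token is split across chunks; note that, as written, text after the final special token stays in the internal residual:

```python
from pathlib import Path
import Project_Model.Libs.BPE as BPE

PATTERN = BPE.special_regex_maker(["<SOT>", "<EOT>"])
CHUNKER = BPE.NanoSocratesChunker(4096, PATTERN)

for chunk in CHUNKER.chunk(Path("Assets/Dataset/corpus.txt")):  # hypothetical path
    print(len(chunk), chunk.endswith(("<SOT>", "<EOT>")))       # True for every chunk
```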
Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py (new file, 64 lines)

```python
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException


class NanoSocratesSpecial(Encoder):

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
    ) -> None:

        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        if len(special_tokens) == 0:
            return

        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):

            CANDIDATE_ID = self.__bpe_offset + index + 1
            self.__vocabulary[TOKEN] = CANDIDATE_ID
            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

    @property
    def __next_id(self):
        BPE_OFFSET = self.__bpe_offset
        VOC_LENGTH = len(self.__vocabulary)
        return BPE_OFFSET + VOC_LENGTH + 1

    @property
    def vocabulary(self) -> dict[str, int]:
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str):
        CANDIDATE_INDEX = self.__next_id
        self.__vocabulary[word] = CANDIDATE_INDEX
        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

    def encode(self, word: str) -> list[int]:
        ID = self.__vocabulary.get(word)

        if ID is None:
            raise OutOfDictionaryException()

        return [ID]

    def decode(self, token_id: list[int]) -> str:

        if len(token_id) != 1:
            raise OutOfDictionaryException()

        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD
```
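Special-token IDs are allocated right after the BPE range (offset + index + 1). A sketch with an offset of 259, the size of the toy BPE vocabulary used in the tests:

```python
from Project_Model.Libs.BPE.Classes import NanoSocratesSpecial

special = NanoSocratesSpecial(259, ["<SOT>", "<EOT>"])

print(special.encode("<SOT>"))  # [260] = 259 + 0 + 1
print(special.encode("<EOT>"))  # [261] = 259 + 1 + 1
print(special.decode([261]))    # "<EOT>"
```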
Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py (new file, 98 lines)

```python
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType


class NanoSocratesSplitter:

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        # note: the regex is already compiled
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text using the given regex

        Args:
            corpus (str): the whole corpus string to split

        Yields:
            Generator[tuple[str, TokenType]]: each time returns a piece of the split text: a string and its TokenType. \n
            TokenType describes whether the string is for the BPE or a special token [BPE, SPECIAL]
        """

        bpe_start = 0
        bpe_end = len(corpus)  # this can be deleted!

        for special_token_start, special_token_end in self.__find_boundaries(corpus):

            # FIND BPE
            bpe_end = special_token_start
            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # now save the new bpe start point;
            # it will be used in the next iteration
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find, one match at a time, the start and end (not included) of each special token

        Args:
            corpus (str): the string where the special tokens will be searched

        Yields:
            Generator[tuple[int, int]]: Note the end is not included
        """
        for match in self.__special_token_regex.finditer(corpus):
            start = match.start()
            end = match.end()

            yield (start, end)

        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)

    def __split_words(self, bpe_piece: str) -> Generator[str]:

        END_OF_STRING = len(bpe_piece)
        bound_start = 0
        bound_end = END_OF_STRING + 1
        for i in range(0, END_OF_STRING):

            CANDIDATE_CHAR = bpe_piece[i]

            if CANDIDATE_CHAR != " ":
                continue

            bound_end = i

            yield bpe_piece[bound_start:bound_end]

            bound_start = bound_end
            bound_end = END_OF_STRING + 1

        yield bpe_piece[bound_start:bound_end]

    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

        not_special_token_list: list[int] = []
        for token in corpus:
            if token > self.__max_bpe_token_id:

                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)
```
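A sketch of the splitter's contract. Words keep their leading space, and (since the end-of-corpus boundary above is commented out) text after the last special token is not emitted:

```python
import re
from Project_Model.Libs.BPE.Classes import NanoSocratesSplitter
from Project_Model.Libs.BPE.Enums import TokenType

SPLITTER = NanoSocratesSplitter(re.compile("<SOT>|<EOT>"))

for piece, kind in SPLITTER.split_text("<SOT>hello world<EOT>"):
    print(repr(piece), kind)
# '<SOT>'   TokenType.SPECIAL
# 'hello'   TokenType.BPE
# ' world'  TokenType.BPE
# '<EOT>'   TokenType.SPECIAL
```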
Project_Model/Libs/BPE/Classes/TokeNano.py (new file, 8 lines)

```python
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore


class TokeNano:

    def __init__(self):

        pass
```
Project_Model/Libs/BPE/Classes/TokeNanoCore.py (new file, 62 lines)

```python
from pathlib import Path

from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial

from ..Utils import special_regex_maker
from ..Enums import TokenType


class TokeNanoCore:
    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
        # special_vocabulary: dict[str, int]
    ):

        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)

        SPECIAL_REGEX = special_regex_maker(special_token_list)
        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size

        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
        self.__special_encoder = NanoSocratesSpecial(
            BPE_VOCABULARY_SIZE, special_token_list
        )

    def encode(self, corpus: str) -> list[int]:
        output: list[int] = []
        for piece, token_type in self.__splitter.split_text(corpus):

            if token_type == TokenType.SPECIAL:
                ENCODED_PIECE = self.__special_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

        return output

    def decode(self, corpus: list[int]) -> str:
        output_str = ""
        for token, token_type in self.__splitter.split_tokens(corpus):
            # token is a one-element list for specials, a list of BPE IDs otherwise
            if token_type == TokenType.SPECIAL:
                output_str += self.__special_encoder.decode(token)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                output_str += self.__bpe_encoder.decode(token)
                continue
        return output_str
```
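Putting the two encoders together: with the toy vocabulary from the tests, BPE IDs end at 258, so the constructor passes the BPE vocabulary size (259) as the offset and the two special tokens should land on 260 and 261. A round-trip sketch:

```python
import Project_Model.Libs.BPE as BPE

VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
tokenizer = BPE.TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])

ids = tokenizer.encode("<SOT>abababab<EOT>")
print(ids)                    # [260, 258, 261]
print(tokenizer.decode(ids))  # "<SOT>abababab<EOT>"
```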
Project_Model/Libs/BPE/Classes/__init__.py (new file, 18 lines)

```python
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
from .TokeNanoCore import TokeNanoCore


__all__ = [
    "NanoSocratesChunker",
    "NanoSocratesSplitter",
    "NanoSocratesBPE",
    "NanoSocraTrainer",
    "NanoSocraTraineRam",
    "NanoSocraTrainerPool",
    "TokeNanoCore"
]
```
Project_Model/Libs/BPE/Enums/SpecialToken.py (new file, 21 lines)

```python
from enum import Enum


class SpecialToken(Enum):
    # (Enum, str) -> throws an error
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    CORPUS_END = "<END>"

    ## Tasks' Token
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"

    # BPE Training:
```
Project_Model/Libs/BPE/Enums/TokenType.py (new file, 6 lines)

```python
from enum import Enum, auto


class TokenType(Enum):

    SPECIAL = auto()
    BPE = auto()
```
Project_Model/Libs/BPE/Enums/__init__.py (new file, 1 line)

```python
from .TokenType import TokenType
```
Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py (new file, 4 lines; path recovered from the Errors/__init__.py imports below)

```python
class DelimiterNotFoundException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/DuplicateWordException.py (new file, 4 lines)

```python
class DuplicateWordException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py (new file, 4 lines; path recovered from the Errors/__init__.py imports below)

```python
class OutOfDictionaryException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/SentenceTooLongException.py (new file, 4 lines; path recovered from the Errors/__init__.py imports below)

```python
class SentenceTooLongException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/__init__.py (new file, 11 lines)

```python
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException


__all__ = [
    "DelimiterNotFoundException",
    "OutOfDictionaryException",
    "DuplicateWordException",
    "SentenceTooLongException"
]
```
Project_Model/Libs/BPE/Utils/__init__.py (new file, 13 lines)

```python
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
from .special_regex_maker import special_regex_maker


__all__ = [
    "special_regex_maker",
    "iterator_with_checks",
    "save_nanos_vocabulary",
    "load_nanos_vocabulary",
    "save_json", "load_json"
]
```
Project_Model/Libs/BPE/Utils/json_utils.py (new file, 18 lines)

```python
import json
from pathlib import Path


def save_json(dictionary: dict, path: Path):

    json_string = json.dumps(dictionary)
    FILE = open(path, "w")
    FILE.write(json_string)
    FILE.close()


def load_json(path: Path) -> dict:
    FILE = open(path, "r")
    json_string = FILE.read()
    FILE.close()

    return json.loads(json_string)
```
Project_Model/Libs/BPE/Utils/lag_checker_iterator.py (new file, 27 lines)

```python
from collections import deque
from typing import Generator, TypeVar

T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")


def iterator_with_checks(
    generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:

    # Here we do not need to catch StopIteration:
    # we let it propagate
    last_element = next(generator)

    while True:

        RETURN_ELEMENT = last_element
        try:
            element = next(generator)
            last_element = element
            yield (RETURN_ELEMENT, False)

        except StopIteration:
            yield (RETURN_ELEMENT, True)
            break
```
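The wrapper looks one element ahead so the caller knows it is on the last item while still holding it; this is what lets the trainers pass `last_batch` into `fit`. A quick sketch:

```python
from Project_Model.Libs.BPE.Utils import iterator_with_checks

letters = (c for c in "abc")
for value, is_last in iterator_with_checks(letters):
    print(value, is_last)
# a False
# b False
# c True
```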
Project_Model/Libs/BPE/Utils/special_regex_maker.py (new file, 15 lines)

```python
import re


def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
    """compile a regex for the special tokens

    Args:
        special_tokens (list[str]): the list of special tokens

    Returns:
        re.Pattern:
    """

    REGEX_STR = "|".join(special_tokens)

    return re.compile(REGEX_STR)
```
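The pattern is a plain alternation of the tokens joined as-is, which is fine for the angle-bracket tokens used in this project but would need `re.escape` for tokens containing regex metacharacters. A quick check:

```python
from Project_Model.Libs.BPE.Utils import special_regex_maker

PATTERN = special_regex_maker(["<SOT>", "<EOT>"])
print(PATTERN.pattern)                         # <SOT>|<EOT>
print(PATTERN.findall("<SOT>ab<EOT>cd<SOT>"))  # ['<SOT>', '<EOT>', '<SOT>']
```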
Project_Model/Libs/BPE/Utils/vocabulary.py (new file, 49 lines)

```python
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException


def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:

    JSON: dict[str, int] = {}

    for key, item in vocabulary.items():
        TUPLE_STR = f"{key}"
        JSON[TUPLE_STR] = item

    return json.dumps(JSON)


def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:

    JSON: dict[str, int] = json.loads(json_string)
    VOCABULARY: dict[tuple[int, int], int] = {}

    for key, item in JSON.items():
        REDUCED_KEY = len(key) - 1
        KEY_STR = key[1:REDUCED_KEY]
        VOC_KEY = tuple(map(int, KEY_STR.split(",")))

        if len(VOC_KEY) != 2:
            raise OutOfDictionaryException()

        # Checked for weird things above
        VOCABULARY[VOC_KEY] = item  # type: ignore

    return VOCABULARY


def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):

    json_string = nanos_vocabulary2json_str(vocabulary)
    FILE = open(path, "w")
    FILE.write(json_string)
    FILE.close()


def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
    FILE = open(path, "r")
    json_string = FILE.read()
    FILE.close()

    return nanos_json_str2vocabulary(json_string)
```
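JSON has no tuple keys, so each pair is stringified as `"(97, 98)"` on save and parsed back by stripping the parentheses and splitting on the comma. A round-trip sketch with a hypothetical path:

```python
from pathlib import Path
from Project_Model.Libs.BPE.Utils import save_nanos_vocabulary, load_nanos_vocabulary

VOCABULARY = {(97, 98): 256, (256, 256): 257}
CACHE = Path("voc.json")  # hypothetical location

save_nanos_vocabulary(VOCABULARY, CACHE)  # file: {"(97, 98)": 256, "(256, 256)": 257}
assert load_nanos_vocabulary(CACHE) == VOCABULARY
```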
Project_Model/Libs/BPE/__init__.py (new file, 9 lines)

```python
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *

from . import Classes
from . import Enums
from . import Errors
from . import Utils
```
Project_Model/Libs/__init__.py (new file, 1 line)

```python
from . import BPE
```
74
Project_Model/Tests/bpe_test.py
Normal file
@@ -0,0 +1,74 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE

import re


class TestBPE:

    def test_bpe_encoding_simple(self):

        TEXT = "abababab"

        # ab = 256
        # 256, 256 = 257
        # 257, 257 = 258

        VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
        EXPECTED = [258]

        BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)

        ENCODED = BPE_ENCODER.encode(TEXT)

        assert len(ENCODED) == len(EXPECTED)

        for encoded, expected in zip(ENCODED, EXPECTED):
            assert encoded == expected

    def test_bpe_decoding_simple(self):

        INPUT = [258]

        # ab = 256
        # 256, 256 = 257
        # 257, 257 = 258

        VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
        EXPECTED = "abababab"

        BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)

        DECODED = BPE_ENCODER.decode(INPUT)

        assert len(DECODED) == len(EXPECTED)

        for decoded, expected in zip(DECODED, EXPECTED):
            assert decoded == expected

    def test_bpe_decoding_edge_1(self):

        INPUT = [258, ord("c")]

        # ab = 256
        # 256, 256 = 257
        # 257, 257 = 258

        VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
        EXPECTED = "ababababc"

        BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)

        DECODED = BPE_ENCODER.decode(INPUT)

        assert len(DECODED) == len(EXPECTED)

        for decoded, expected in zip(DECODED, EXPECTED):
            assert decoded == expected


# Useful to debug weird cases
if __name__ == "__main__":
    # TestBPE().test_bpe_decoding_simple()
    TestBPE().test_bpe_encoding_simple()
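The three tests above share one vocabulary. As a rough mental model of what they assume (a sketch, not the library's actual implementation, whose merge order may differ), encoding can be pictured as repeated pair-merging passes over the byte stream:

```python
def toy_bpe_encode(text: str, vocabulary: dict[tuple[int, int], int]) -> list[int]:
    """Toy merge loop: replace known pairs with their token id until stable."""
    tokens = [ord(c) for c in text]
    changed = True
    while changed:
        changed = False
        out: list[int] = []
        i = 0
        while i < len(tokens):
            pair = tuple(tokens[i:i + 2])
            if len(pair) == 2 and pair in vocabulary:
                out.append(vocabulary[pair])  # merge the pair into one token
                i += 2
                changed = True
            else:
                out.append(tokens[i])
                i += 1
        tokens = out
    return tokens

# "abababab" collapses to the single token 258, as the test expects
assert toy_bpe_encode(
    "abababab", {(97, 98): 256, (256, 256): 257, (257, 257): 258}
) == [258]
```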
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
@@ -0,0 +1,77 @@
from pathlib import Path
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE

import re

CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")


class TestTrainBPE:

    def test_bpe_train_encoding_simple(self):

        TRAINER = BPE.NanoSocraTrainerPool(
            int(32E3),
            ["<SOT>", "<EOT>"]
        )

        TEXT = "abababab"
        TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt")

        EXPECTED = [258]

        # ab = 256
        # 256, 256 = 257
        # 257, 257 = 258

        BPE_ENCODER = TRAINER.trainBPE(
            TEXT_PATH,
            CACHE_DIR_PATH
        )

        ENCODED = BPE_ENCODER.encode(TEXT)

        assert len(ENCODED) == len(EXPECTED)

        for encoded, expected in zip(ENCODED, EXPECTED):
            assert encoded == expected

    def test_bpe_train_encoding_and_decoding(self):

        SPECIAL_LIST = ["<ABS>", "<SOTL>"]
        TRAINER = BPE.NanoSocraTrainerPool(
            int(32E3),
            SPECIAL_LIST
        )

        TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
        with open(TEXT_PATH) as file:
            TEXT = file.read()

        EXPECTED = TEXT

        BPE_ENCODER = TRAINER.trainBPE(
            TEXT_PATH,
            CACHE_DIR_PATH
        )
        VOCABULARY = BPE_ENCODER.vocabulary
        TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)

        ENCODED = TOKENANO.encode(TEXT)
        DECODED = TOKENANO.decode(ENCODED)

        assert len(DECODED) == len(EXPECTED)

        for decoded, expected in zip(DECODED, EXPECTED):
            assert decoded == expected


# Useful to debug weird cases
if __name__ == "__main__":
    # TestTrainBPE().test_bpe_train_encoding_simple()
    TestTrainBPE().test_bpe_train_encoding_and_decoding()
4
Project_Model/Tests/chunker_files/edge-1.txt
Normal file
@@ -0,0 +1,4 @@
<SOT>Lorem <SEP>ipsu<SEP>m d<SEP>olor s<SEP>it ame<SEP>t,
<SEP>conse<SEP>cte<SEP>tur adip<SEP>iscin<SEP>g elit.
<SEP>Aenean a<SEP>t dui he<SEP>ndrer<SEP>it an<SEP>te soll<SEP>icitud
<SEP>in sce<SEP>lerisque<EOT>
2
Project_Model/Tests/chunker_files/simple.txt
Normal file
@@ -0,0 +1,2 @@
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
<SEP>Aenean at dui <SEP>hendrerit ante <SEP>sollicitudin <SEP>scelerisque<EOT>
3
Project_Model/Tests/chunker_files/stress.txt
Normal file
@@ -0,0 +1,3 @@
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
<SEP>Aenean at dui <SEP>hendrerit an te <SEP>sollicitudin <SEP>scelerisque
<SEP>dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf<EOT>
89
Project_Model/Tests/chunker_test.py
Normal file
@@ -0,0 +1,89 @@
from pathlib import Path
import re
import pytest
import Project_Model.Libs.BPE as BPE

PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)


class TestChunker:

    def test_correct_simple(self):

        FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
        LEAST_EXPECTED_CHUNKS = 3
        ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")

        CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)

        CHUNKS = []

        for chunk in CHUNKER.chunk(FILE_PATH):
            print(chunk)
            CHUNKS.append(chunk)

        NANO_TEXT = "".join(CHUNKS)

        assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
        assert NANO_TEXT == ORIG_TEXT

    def test_correct_edge_1(self):

        FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt")
        LEAST_EXPECTED_CHUNKS = 3
        ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")

        CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX)

        CHUNKS = []

        for chunk in CHUNKER.chunk(FILE_PATH):
            print(chunk)
            CHUNKS.append(chunk)

        NANO_TEXT = "".join(CHUNKS)

        assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
        assert NANO_TEXT == ORIG_TEXT

    def test_throwing(self):

        FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")

        CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)

        with pytest.raises(BPE.DelimiterNotFoundException):
            for chunk in CHUNKER.chunk(FILE_PATH):
                print(chunk)


if __name__ == "__main__":

    FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
    LEAST_EXPECTED_CHUNKS = 3
    ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")

    CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)

    CHUNKS = []

    try:
        for chunk in CHUNKER.chunk(FILE_PATH):
            print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
            CHUNKS.append(chunk)
    except Exception:
        # a bare `except:` would also swallow SystemExit / KeyboardInterrupt
        exit(0)

    NANO_TEXT = "".join(CHUNKS)

    assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
    assert NANO_TEXT == ORIG_TEXT
182
Project_Model/Tests/splitter_test.py
Normal file
@@ -0,0 +1,182 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE

import re


PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)


class TestSplitter:

    def test_split(self):

        TEXT = "<SOT>Lorem <SEP>"

        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)

        EXPECTED_CHUNKS = [
            ("<SOT>", TokenType.SPECIAL),
            ("Lorem", TokenType.BPE),
            (" ", TokenType.BPE),
            ("<SEP>", TokenType.SPECIAL),
        ]

        CHUNKS = list(SPLITTER.split_text(TEXT))

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        # zip received chunks with expected ones, in that order,
        # so the RECEIVED_*/EXPECTED_* names below stay accurate
        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE

    def test_split_trailing_text(self):

        TEXT = "ipsu<SEP>m d<SEP>olor"

        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)

        EXPECTED_CHUNKS = [
            ("ipsu", TokenType.BPE),
            ("<SEP>", TokenType.SPECIAL),
            ("m", TokenType.BPE),
            (" d", TokenType.BPE),
            ("<SEP>", TokenType.SPECIAL),
            # ("olor", TokenType.BPE)
        ]

        CHUNKS = list(SPLITTER.split_text(TEXT))

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE

    def test_split_multi_token(self):

        TEXT = "ipsu<SEP>m d<SEP><SEP><SEP>dsg<SEP>olor"

        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)

        EXPECTED_CHUNKS = [
            ("ipsu", TokenType.BPE),
            ("<SEP>", TokenType.SPECIAL),
            ("m", TokenType.BPE),
            (" d", TokenType.BPE),
            ("<SEP>", TokenType.SPECIAL),
            ("<SEP>", TokenType.SPECIAL),
            ("<SEP>", TokenType.SPECIAL),
            ("dsg", TokenType.BPE),
            ("<SEP>", TokenType.SPECIAL),
        ]

        CHUNKS = list(SPLITTER.split_text(TEXT))

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE

    def test_split_malformed_1(self):

        TEXT = "<SEP>lerisque"

        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)

        EXPECTED_CHUNKS = [
            ("<SEP>", TokenType.SPECIAL),
        ]

        CHUNKS = list(SPLITTER.split_text(TEXT))

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE

    def test_split_malformed_2(self):

        TEXT = "lerisque"

        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)

        EXPECTED_CHUNKS = []

        CHUNKS = list(SPLITTER.split_text(TEXT))

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE

    def test_split_token_decode_simple(self):

        # test splitting a token stream into special and BPE runs
        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
        token_list = [100, 101, 1477]

        CHUNKS = list(SPLITTER.split_tokens(token_list))
        EXPECTED_CHUNKS = [
            ([100, 101], TokenType.BPE),
            ([1477], TokenType.SPECIAL),
        ]

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE

    def test_split_token_decode_simple_malformed(self):

        # test splitting a token stream into special and BPE runs
        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
        token_list = [100, 101, 1477, 100]

        CHUNKS = list(SPLITTER.split_tokens(token_list))
        EXPECTED_CHUNKS = [
            ([100, 101], TokenType.BPE),
            ([1477], TokenType.SPECIAL),
        ]

        assert len(CHUNKS) == len(EXPECTED_CHUNKS)

        for chunk, expected_chunk in zip(CHUNKS, EXPECTED_CHUNKS):
            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk

            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE


# Useful to debug weird cases
if __name__ == "__main__":
    TestSplitter().test_split_trailing_text()
21
Project_Model/Tests/tokenano_test.py
Normal file
@@ -0,0 +1,21 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore


class TestTokeNano:

    def test_decode_encode_simple(self):
        TEXT = "<SOT>abababab<EOT>"

        # ab = 256
        # 256, 256 = 257
        # 257, 257 = 258

        VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
        # EXPECTED = [258]

        TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])

        ENCODED = TOKE_NANO.encode(TEXT)
        DECODED = TOKE_NANO.decode(ENCODED)

        assert TEXT == DECODED
0
Project_Model/Tests/trainer_files/cache/.gitkeep
vendored
Normal file
1
Project_Model/Tests/trainer_files/train_encode_decode.txt
Normal file
@@ -0,0 +1 @@
<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>
1
Project_Model/Tests/trainer_files/train_simple.txt
Normal file
@@ -0,0 +1 @@
<SOT>abababab<EOT>
695
Project_Model/UML/bpe.excalidraw.json
Normal file
@@ -0,0 +1,695 @@
[Excalidraw UML sketch of the BPE module. The diagram's class boxes read:]

class Encoder(ABC):
    + encode(corpus: str) -> [int]
    + decode(encoded: [int]) -> str

class NanoSocratesBPE(Encoder):
    - vocabulary: Vocabulary
    + fit(data: [[int]], memory: NanoSocratesBPE_BatchMemory, last_sentence_chunk: bool, last_batch: bool) -> (Self, NanoSocratesBPE_BatchMemory)
    + encode(word: [byte]) -> [int]
    + decode(token: [int]) -> [byte]
    + get_vocabulary_size() -> int

class Vocabulary:
    - vocabulary: dict<(int, int), int>
    - reverse_vocabulary: dict<int, (int, int)>
    + size -> int
    + add_word(int) -> int
    + encode(int) -> int
    + decode(int) -> int

class NanoSocrateBPE_BatchMemory:
    + frequency: dict<(int, int), int>
    + merge_treshold: int

class NanoSocratesBPETrainer:
    - max_iterations: int
    - max_vocabulary_size: int
    - merge_treshold: int

class NanoSocratesSplitter:
    + splitter_regex: regex
    + split_text(corpus: str) -> [(str, TokenType)]

enum TokenType:
    + SPECIAL
    + BPE

class TokeNanoCore:
    - splitter: NanoSocratesSplitter
    - bpe_encoder: NanoSocratesBPE
    - special_encoder: NanoSocratesSpecial
    + encode(corpus: str) -> [int]
    - encode_special(piece: str) -> int
    - encode_bpe(piece: str) -> [int]

class NanoSocratesSpecial(Encoder):
    + vocabulary: dict<str, int>
    + reverse_vocabulary: dict<int, str>

class NanoSocratesChunker:
    - max_bytes: int
    - max_special_length: int
    - special_token_regex: regex
    - residuals: str
    # This must be an iterator
    + read(path: Path) -> str

class TokeNano:

[The diagram also contains one scratch text element with sample @@/^^ marker strings.]
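Read as Python, the Encoder box in the diagram corresponds to a skeleton like the following. This is a sketch of the interface the UML describes, not the repository's actual code:

```python
from abc import ABC, abstractmethod


class Encoder(ABC):
    """Abstract base for the tokenizer encoders sketched in the UML."""

    @abstractmethod
    def encode(self, corpus: str) -> list[int]:
        """Turn a corpus string into a list of token ids."""

    @abstractmethod
    def decode(self, encoded: list[int]) -> str:
        """Turn a list of token ids back into text."""
```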
19
README.md
@@ -17,6 +17,25 @@ Now install dependencies on pip:
pip install -r requirements.txt

Add the following to .vscode/settings.json:

```json
{
    // For Linux
    "terminal.integrated.env.linux": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    // For OSX
    "terminal.integrated.env.osx": {
        "PYTHONPATH": "${workspaceFolder}"
    },
    // For Windows
    "terminal.integrated.env.windows": {
        "PYTHONPATH": "${workspaceFolder}"
    }
}
```

## TroubleShooting

Sometimes, when uploading a really large batch of data, git can abort the upload because of a timeout.
30
Scripts/DataBaseQueries/dataset.sql
Normal file
@@ -0,0 +1,30 @@
-- To pass to Pandas
SELECT *
FROM RDFs
INNER JOIN Subjects USING (SubjectID)
INNER JOIN Relationships USING (RelationshipID)
INNER JOIN Objects USING (ObjectID);

-- To pass to Pandas for abstracts
SELECT *
FROM RDFs
INNER JOIN WikipediaAbstracts USING (MovieID);

-- To pass to Pandas for abbreviations
SELECT *
FROM Abbreviations;

-- More complex query, to get a clean dataset
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
FROM RDFs
INNER JOIN SubjectsCountInRDFs USING (SubjectID)
INNER JOIN RelationshipsCountInRDFs USING (RelationshipID)
INNER JOIN ObjectsCountInRDFs USING (ObjectID)
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
-- insert WHERE here
-- WHERE SubjectID = 134626
GROUP BY MovieID;
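Since the comments say these queries are meant to be passed to Pandas, a minimal sketch of that hand-off (the database path is hypothetical, mirroring the one used in the notebook below):

```python
import sqlite3

import pandas as pd

QUERY = """
SELECT *
FROM RDFs
INNER JOIN Subjects USING (SubjectID)
INNER JOIN Relationships USING (RelationshipID)
INNER JOIN Objects USING (ObjectID);
"""

# hypothetical path; adjust to your checkout
conn = sqlite3.connect("Assets/Dataset/Tmp/dataset2.db")
rdf = pd.read_sql_query(QUERY, conn)
conn.close()

print(rdf.head())
```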
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
@@ -0,0 +1,174 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);


CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);

CREATE TABLE IF NOT EXISTS Abbreviations (
    AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
    URI TEXT UNIQUE NOT NULL,
    Abbreviation TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
    SubjectID INTEGER NOT NULL,
    AbbreviationID INTEGER NOT NULL,
    PRIMARY KEY(SubjectID, AbbreviationID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);

CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
    RelationshipID INTEGER NOT NULL,
    AbbreviationID INTEGER NOT NULL,
    PRIMARY KEY(RelationshipID, AbbreviationID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);

CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
    ObjectID INTEGER NOT NULL,
    AbbreviationID INTEGER NOT NULL,
    PRIMARY KEY(ObjectID, AbbreviationID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);

CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);

-- Views
-- Subjects
CREATE VIEW IF NOT EXISTS ParsedSubjects
AS
SELECT
    SubjectID,
    CASE WHEN Abbreviation IS NULL
        THEN SubjectURI
        ELSE Abbreviation || ':' || replace(SubjectURI, URI, '') END
        AS SubjectURI
FROM Subjects
LEFT JOIN Subjects_Abbreviations USING (SubjectID)
LEFT JOIN Abbreviations USING (AbbreviationID);

-- Relationships
CREATE VIEW IF NOT EXISTS ParsedRelationships
AS
SELECT
    RelationshipID,
    CASE WHEN Abbreviation IS NULL
        THEN RelationshipURI
        ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '') END
        AS RelationshipURI
FROM Relationships
LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
LEFT JOIN Abbreviations USING (AbbreviationID);

-- Objects
CREATE VIEW IF NOT EXISTS ParsedObjects
AS
SELECT
    ObjectID,
    CASE WHEN Abbreviation IS NULL
        THEN ObjectURI
        ELSE Abbreviation || ':' || replace(ObjectURI, URI, '') END
        AS ObjectURI
FROM Objects
LEFT JOIN Objects_Abbreviations USING (ObjectID)
LEFT JOIN Abbreviations USING (AbbreviationID);


-- Subject Count
CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs
AS
SELECT SubjectID, count(SubjectID) as Sub_Count
FROM RDFs
GROUP BY SubjectID;


-- Relationship Count
CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs
AS
SELECT RelationshipID, count(RelationshipID) as Rel_Count
FROM RDFs
GROUP BY RelationshipID;


-- Object Count
CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs
AS
SELECT ObjectID, count(ObjectID) as Obj_Count
FROM RDFs
GROUP BY ObjectID;
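A small sketch of applying this schema from Python (paths hypothetical); the standard library's `executescript` runs every CREATE TABLE / INDEX / VIEW statement in the file:

```python
import sqlite3
from pathlib import Path

# hypothetical paths; adjust to your checkout
schema = Path("Scripts/DataBaseQueries/db_creation.sql").read_text(encoding="utf-8")

conn = sqlite3.connect("Assets/Dataset/Tmp/dataset2.db")
conn.executescript(schema)  # executes all statements in the script
conn.close()
```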
@@ -33,3 +33,23 @@ SELECT ObjectID FROM Objects WHERE ObjectURI = ?;

INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);

-- Prefixes
INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);

-- Please be sure it is a URI before running this query,
-- and take at least up to the domain and the first path segment
SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;

-- Query to retrieve data
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
-- insert WHERE here
GROUP BY MovieID;
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
@@ -0,0 +1,186 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b9081b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This notebook removes unwanted relationships from the pipeline according to several rules\n",
    "import pandas as pd\n",
    "import sqlite3\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
    "\n",
    "def get_RDF() -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    QUERY = \"SELECT * FROM RDFs \" \\\n",
    "        \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
    "        \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
    "        \"INNER JOIN Objects USING (ObjectID);\"\n",
    "    RDF = pd.read_sql_query(QUERY, CONN)\n",
    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
    "    RDF = RDF.dropna()\n",
    "    \"\"\"\n",
    "    Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
    "    Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
    "    Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
    "    RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
    "\n",
    "    # drop '' values\n",
    "    Subjects = Subjects.replace('', np.nan)  # .dropna()\n",
    "    Relationships = Relationships.replace('', np.nan)  # .dropna()\n",
    "    Objects = Objects.replace('', np.nan)  # .dropna()\n",
    "\n",
    "    # join RDF with its components\n",
    "    RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
    "    RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
    "    RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
    "    return RDF\n",
    "\n",
    "\n",
    "# def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
    "\n",
    "def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
    "    return RDF[RDF[\"RelationshipURI\"] != uri]\n",
    "\n",
    "\n",
    "RDF = get_RDF()\n",
    "# RDF = RDF.dropna()\n",
    "# print(RDF)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "644690bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
    "    counts = RDF[\"RelationshipURI\"].value_counts()\n",
    "    RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
    "    RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
    "    # counts is a series as key: relationship, value: count\n",
    "    # counts = counts[counts > count_treshold]\n",
    "    # relationships = counts.index\n",
    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
    "    return RDF\n",
    "\n",
    "RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
    "# print(new_RDF)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34525be6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                SubjectURI \\\n",
      "0            http://dbpedia.org/resource/Nights_of_Cabiria \n",
      "1         http://dbpedia.org/resource/California_Science... \n",
      "2                 http://dbpedia.org/resource/China_Captain \n",
      "3         http://dbpedia.org/resource/Caravan_of_Courage... \n",
      "4               http://dbpedia.org/resource/WHIH_Newsfront \n",
      "...                                                    ... \n",
      "12725500  http://dbpedia.org/resource/I_Will_Follow_(film) \n",
      "12725501  http://dbpedia.org/resource/I_Will_Follow_(film) \n",
      "12725502  http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
      "12725503  http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
      "12725504          http://dbpedia.org/resource/I_Won't_Play \n",
      "\n",
      "                                       RelationshipURI \\\n",
      "0         http://www.w3.org/2002/07/owl#differentFrom \n",
      "1         http://www.w3.org/2002/07/owl#differentFrom \n",
      "2         http://www.w3.org/2002/07/owl#differentFrom \n",
      "3         http://www.w3.org/2002/07/owl#differentFrom \n",
      "4         http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
      "...                                                ... \n",
      "12725500          http://dbpedia.org/ontology/producer \n",
      "12725501          http://dbpedia.org/ontology/producer \n",
      "12725502          http://dbpedia.org/ontology/producer \n",
      "12725503          http://dbpedia.org/ontology/producer \n",
      "12725504          http://dbpedia.org/ontology/producer \n",
      "\n",
      "                                                 ObjectURI  MovieID \\\n",
      "0                      http://dbpedia.org/resource/Cabiria       26 \n",
      "1         http://dbpedia.org/resource/California_Academy...      185 \n",
      "2                http://dbpedia.org/resource/Captain_China      614 \n",
      "3         http://dbpedia.org/resource/Caravan_of_Courage...      740 \n",
      "4         http://dbpedia.org/resource/Captain_America:_C...      594 \n",
      "...                                                    ...      ... \n",
      "12725500          http://dbpedia.org/resource/Ava_DuVernay   145854 \n",
      "12725501          http://dbpedia.org/resource/Molly_Mayeux   145854 \n",
      "12725502       http://dbpedia.org/resource/Headlines_Today   145861 \n",
      "12725503            http://dbpedia.org/resource/Billy_Zane   145862 \n",
      "12725504    http://dbpedia.org/resource/Gordon_Hollingshead   145864 \n",
      "\n",
      "          RelationshipFreq  MovieFreq \n",
      "0                     2132        216 \n",
      "1                     2132        264 \n",
      "2                     2132         66 \n",
      "3                     2132        131 \n",
      "4                     1653        133 \n",
      "...                    ...        ... \n",
      "12725500             80077         95 \n",
      "12725501             80077         95 \n",
      "12725502             80077         41 \n",
      "12725503             80077         98 \n",
      "12725504             80077         91 \n",
      "\n",
      "[12725505 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
    "    counts = RDF[\"MovieID\"].value_counts()\n",
    "    RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
    "    RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
    "    RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
    "    # counts is a series as key: relationship, value: count\n",
    "    # counts = counts[counts > count_treshold]\n",
    "    # relationships = counts.index\n",
    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
    "    return RDF\n",
    "\n",
    "RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
    "print(RDF)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deep_learning",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
21	Scripts/DataCleaning/data_output_models/bpe_corpus.py	Normal file
@@ -0,0 +1,21 @@
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
import pandas as pd


class BPE_corpus():

    def __init__(self, output_path: str):
        self.output_handler = open(output_path, "w")

    def close(self):
        # add corpus end before closing
        self.output_handler.write(SpecialToken.CORPUS_END.value)
        self.output_handler.close()

    def write_from_str(self, output: str):
        if output == '':
            return
        self.output_handler.write(output)

    def write_from_df(self, df: pd.DataFrame):
        self.write_from_str(get_raw_from_dataframe(df))
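A minimal usage sketch for BPE_corpus — the output path and the example rows below are illustrative assumptions, not part of the diff:

# Hypothetical usage of BPE_corpus (path and example data are assumptions)
import pandas as pd
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus

corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
df = pd.DataFrame({"Triple": ["<SOTL><SOT><SUBJ>s<PRED>p<OBJ>o<EOT>"],
                   "Abstract": ["<ABS>An example abstract."]})
corpus.write_from_df(df)  # serialized through get_raw_from_dataframe
corpus.close()            # writes <END> before closing the handle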
26	Scripts/DataCleaning/data_output_models/rdf_completation_task.py	Normal file
@@ -0,0 +1,26 @@
import pandas as pd


class RDF_completation_task_dataset():
    """
    Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
    Each RDF is saved as a str.
    CSV Composition: ["MovieID","RDF"]
    """
    def __init__(self, output_path: str):

        self.output = open(output_path, "w")
        # then the first row as header
        header = ["MovieID", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","RDF"]
        """
        RDF.to_csv(self.output, index=False, header=False)
58	Scripts/DataCleaning/data_output_models/rdf_mask_task.py	Normal file
@@ -0,0 +1,58 @@
import pandas as pd

# do not worry about circular dependencies, this class will never call something else
from Scripts.DataCleaning.filter import PipelineApplier


class RDF_mask_task_dataset():
    """
    Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
    The CSV is built so that for each RDF there will be 3 rows, where each time one of the components is missing.
    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
    """
    def __init__(self, output_path: str):

        # these methods will only be used by this class, but they belong at a lower level
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        self.output = open(output_path, "w")
        # then the first row as header
        header = ["MovieID", "IncompleteRDF", "Missing", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        rdf_complete = self._build_triple(RDF)

        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
        ####
        df_subject = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_subject,
            "Missing": RDF["SubjectURI"],
            "RDF": rdf_complete,
        })

        df_relationship = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_relationship,
            "Missing": RDF["RelationshipURI"],
            "RDF": rdf_complete,
        })

        df_object = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_object,
            "Missing": RDF["ObjectURI"],
            "RDF": rdf_complete,
        })


        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)
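To make the three-row expansion concrete, here is a small self-contained sketch of what write() produces for a single triple — the URIs are made up, and the tokens mirror special_token.py:

# Worked example of the mask-task expansion (URIs are illustrative)
import pandas as pd

row = {"MovieID": 1,
       "SubjectURI": "<SUBJ>dbp-dbr:Inception",
       "RelationshipURI": "<PRED>dbp-dbo:director",
       "ObjectURI": "<OBJ>dbp-dbr:Christopher_Nolan"}
full = "<SOT>" + row["SubjectURI"] + row["RelationshipURI"] + row["ObjectURI"] + "<EOT>"

records = []
for part in ("SubjectURI", "RelationshipURI", "ObjectURI"):
    masked = {k: ("<MASK>" if k == part else row[k])
              for k in ("SubjectURI", "RelationshipURI", "ObjectURI")}
    incomplete = "<SOT>" + masked["SubjectURI"] + masked["RelationshipURI"] + masked["ObjectURI"] + "<EOT>"
    records.append({"MovieID": row["MovieID"], "IncompleteRDF": incomplete,
                    "Missing": row[part], "RDF": full})

print(pd.DataFrame(records))  # three rows, one per masked component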
26	Scripts/DataCleaning/data_output_models/rdf_text_tasks.py	Normal file
@@ -0,0 +1,26 @@
import pandas as pd


class RDF_text_task_dataset():
    """
    Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
    In the CSV the RDFs will be saved together as a string.
    CSV Composition: ["MovieID","RDFs","Abstract"]
    """
    def __init__(self, output_path: str):

        self.output = open(output_path, "w")
        # then the first row as header
        header = ["MovieID", "RDFs", "Abstract"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
77	Scripts/DataCleaning/dbpedia-uri.py	Normal file
@@ -0,0 +1,77 @@
import argparse
import sys


class ProgramArgs:

    def __init__(self, file: str, output: str, treshold: int):
        self.file = file
        self.output = output
        self.treshold = treshold


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--treshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)

    # print(parsed_args.input_file)

    return ProgramArgs(parsed_args.input_file, parsed_args.output_file, parsed_args.treshold)  # type: ignore


def print_dbpedia(file: str, out: str):

    FILE = open(file, "r", encoding="utf-8")
    OUT = open(out, mode="w", encoding="utf-8")

    DOMAIN_PART = "dbpedia"

    already_parsed: set[str] = set()


    for row in FILE:

        sections = row.split("/")
        sections = list(filter(lambda item: item != "", sections))

        # print(sections)

        if len(sections) < 3:
            continue

        URI = "/".join(sections[1:3])
        URI = "//".join([sections[0], URI])

        if URI in already_parsed:
            continue

        DOMAIN = sections[1]
        SUBDOMAINS = DOMAIN.split(".")
        TYPE = sections[2][0]

        if DOMAIN_PART not in SUBDOMAINS:
            continue

        already_parsed.add(URI)

        SUB_ID = SUBDOMAINS[0]

        if len(SUB_ID) > 3:
            SUB_ID = SUB_ID[:3]

        OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")


    FILE.close()
    OUT.close()


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    print_dbpedia(ARGS.file, ARGS.output)
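A worked trace of the abbreviation rule above, using one example DBpedia URI (the input line is illustrative):

# Trace of print_dbpedia for one row (input is an example)
row = "http://dbpedia.org/resource/Inception"
sections = [s for s in row.split("/") if s != ""]        # ["http:", "dbpedia.org", "resource", "Inception"]
uri = "//".join([sections[0], "/".join(sections[1:3])])  # "http://dbpedia.org/resource"
sub_id = sections[1].split(".")[0][:3]                   # "dbpedia" -> "dbp"
print(f"\"{uri}/\", \"{sub_id}-db{sections[2][0]}\"")    # "http://dbpedia.org/resource/", "dbp-dbr"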
188	Scripts/DataCleaning/filter.py	Normal file
@@ -0,0 +1,188 @@
# This file deletes in the pipeline the unwanted relationships by different rules
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier():

    def __init__(self):

        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()


    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store RelationshipURI filters as a set"""
        self.relationship_filter_list: set[str] = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove rows whose RelationshipURI is in the stored filter. Generate it first by calling generate_list_relationship_filter."""
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]


    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
        since this method creates that filter.
        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int):
            max_treshold (int):
        """
        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold]
        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold]
        self.MOVIE_FILTER = MOVIE_COUNT  # ["MovieID"]

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold]
        REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold]
        self.REL_FILTER = REL_COUNT  # ["RelationshipURI"]

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
        return RDF

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
        return RDF

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Adds the RDF special token to each element of the tuple, i.e.: SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
        It only adds the special tokens of the three elements of the RDF, no other special token.
        Args:
            RDF (pd.DataFrame):
        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # if a filter that ran before sliced the RDF and created a view, the problem is resolved here
        # for more context: SettingWithCopyWarning
        RDF = RDF.copy()
        # at the beginning of SubjectURI, RelationshipURI and ObjectURI, add their special token
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF


    def reduce_movie_list(self, starting_offset: int, ending_offset: int):
        end = min(len(self.MOVIE_FILTER), ending_offset)
        self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # the dataset has SubjectURI RelationshipURI ObjectURI
        # we want to drop the '' in them
        # Replace empty strings with NaN
        RDF = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # to execute this method you have to have iterated by movie_id,
        # because by design we want one row per movie at the end
        # MovieID and Abstract can be given as input for a more generic method
        # movie_id = RDF["MovieID"].iloc(0)
        # abstract = RDF["Abstract"].iloc(0)
        # first let's combine each row, creating column Triple as the join of the RDF
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # combine rows into one
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add special tokens for: start of triple list and start of abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # combine rows into one
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add special tokens for: start of triple list and start of abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]


    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains the joined RDF triple in one element, together with the START and END special tokens.
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.DataFrame: RDF["Triple"] (just this column)
        """
        # let's combine each row, creating column Triple as the join of the RDF
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper method used for the third task: "Predicting a masked component within an RDF triple".
        Obtains the joined RDF triple in one element, together with the START and END special tokens.
        The MISSING element will be replaced by the special token <MASK>.
        Args:
            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME)
        """
        # let's create a new column "Triple" with the joined RDF

        # the following creates a column of MASK tokens of the length of the dataframe;
        # it is not needed, since we expect a dataframe of just one missing column, but it is more robust (AND SLOW)
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)

        RDF["Triple"] = (
            RDF.get("SubjectURI", MISSING) +
            RDF.get("RelationshipURI", MISSING) +
            RDF.get("ObjectURI", MISSING))
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        # currently not used
        """
        Helper method used for the third task: "Predicting a masked component within an RDF triple".
        Given two DataFrames, the first containing the incomplete RDF and the other only the missing component,
        this method applies the special token.
        Args:
            RDF (pd.DataFrame): _description_

        Returns:
            pd.DataFrame: _description_
        """
        # take an example dataframe as ["SubjectURI",""]
        # as input two dataframes, one with 2 columns
        return None
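A minimal sketch of how the frequency filters compose, assuming filter.py is importable and using made-up counts and thresholds:

# Hypothetical wiring of PipelineApplier frequency filters (all data is an assumption)
import pandas as pd
from Scripts.DataCleaning.filter import PipelineApplier

applier = PipelineApplier()
movie_counts = pd.DataFrame({"MovieID": [1, 2, 3], "Count": [10, 500, 5000]})
applier.generate_frequency_movie_filter(movie_counts, 50, 3000)

rdf = pd.DataFrame({"MovieID": [1, 2, 3],
                    "SubjectURI": ["s1", "s2", "s3"],
                    "RelationshipURI": ["p1", "p2", "p3"],
                    "ObjectURI": ["o1", "o2", "o3"]})
print(applier.filter_by_frequency_movie_id(rdf))  # keeps only MovieID 2 (50 <= 500 < 3000)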
@@ -6,8 +6,16 @@ from typing import Self

 class ProgramArgs:

-    def __init__(self, file: str, output: str, treshold: int):
+    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
+        """
+        Args:
+            file (str):
+            csv_header (str): The name of the CSV column from which the program will get the URIs
+            output (str):
+            treshold (int):
+        """
         self.file = file
+        self.csv_uri_header = csv_uri_header
         self.output = output
         self.treshold = treshold

@@ -33,11 +41,15 @@ class Node:
         KEY = child[0]

         if not self.children.get(KEY):
+            # if the key has no value, it means we are traversing this branch for the first time:
+            # create another node for the key
             self.children[KEY] = Node(KEY, 0)

+        # take the node for the key
         CHILD = self.children[KEY]
         self.quantity += 1

+        # if the child list to enter has only one element, which is KEY, no more nodes will be created
         if len(child) == 1:
             return

@@ -53,27 +65,32 @@ def get_args(args: list[str]) -> ProgramArgs:

     PARSER = argparse.ArgumentParser()
     PARSER.add_argument("--input-file", "-i", required=True, type=str)
+    PARSER.add_argument("--header-name", "-c", required=True, type=str)  # c stands for column
     PARSER.add_argument("--output-file", "-o", required=True, type=str)
     PARSER.add_argument("--treshold", "-t", type=int, default=1)
     parsed_args, _ = PARSER.parse_known_args(args)

     # print(parsed_args.input_file)

-    return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore
+    return ProgramArgs(parsed_args.input_file, parsed_args.header_name, parsed_args.output_file, parsed_args.treshold)  # type: ignore


 def get_debug_args() -> ProgramArgs:
+    # -i ./Assets/Dataset/1-hop/movies.csv -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
-    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
+    FILE = "./Assets/Dataset/1-hop/movies.csv"
+    CSV_HEADER = "subject"
+    OUTPUT = "./Assets/Dataset/Tmp/prova.csv"
     TRESHOLD = 1

     return ProgramArgs(
         FILE,
+        CSV_HEADER,
+        OUTPUT,
         TRESHOLD
     )


-def tree_like(file: str, out: str):
+def tree_like(file: str, csv_uri_header: str, out: str):

     INDENTATION = "  "

@@ -84,9 +101,11 @@ def tree_like(file: str, out: str):

     FILE = open(file, "r", encoding="utf-8")

-    for row in FILE:
+    # the header-name is needed here
+    for row in csv.DictReader(FILE):

-        sections = row.split("/")
+        uri_element = row[csv_uri_header]
+        sections = uri_element.split("/")
         sections = list(filter(lambda item: item != "", sections))

         # print(sections)

@@ -115,7 +134,9 @@ def tree_like(file: str, out: str):

         INDENT: str = INDENTATION * DEPTH

-        if NODE.quantity < ARGS.treshold:
+        # Leaf nodes have quantity 0, so for them to appear the threshold has to be 0
+        # if NODE.quantity < ARGS.treshold:
+        if ARGS.treshold > NODE.quantity:
             continue

         OUT.write(f"{INDENT}- {NODE}\n")

@@ -133,7 +154,8 @@ def tree_like(file: str, out: str):
     OUT.close()



 if __name__ == "__main__":
     ARGS = get_args(sys.argv)
     # ARGS = get_debug_args()
-    tree_like(ARGS.file, ARGS.output)
+    tree_like(ARGS.file, ARGS.csv_uri_header, ARGS.output)
140	Scripts/DataCleaning/pipeline.py	Normal file
@@ -0,0 +1,140 @@
import re
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
from Scripts.DataCleaning.filter import PipelineApplier
# tasks' dataset builders
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset

import pandas as pd


class Pipeline():
    def __init__(self,
                 mask_task_dataset_path: str = "./Assets/Dataset/Tmp/rdf_mask.csv",
                 bpe_corpus_path: str = "./Assets/Dataset/Tmp/corpus.txt",
                 text_to_rdf_task_dataset_path: str = "./Assets/Dataset/Tmp/rdf_text.csv",
                 completation_rdf_task_dataset_path: str = "./Assets/Dataset/Tmp/rdf_completation.csv",
                 ):
        self.sql_endpoint = SqlEndpoint()
        # classes to manage the tasks' datasets
        self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
        self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
        self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
        self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path)

        # prepare the filter
        # the filter applier needs to know the frequency of Movies and Relationships across the whole dataset
        self.filter_applier = PipelineApplier()
        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
        REL_COUNT = self.sql_endpoint.get_relationship_count()
        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT, 50, 3000)
        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
        # prepare the filter on the RelationshipURIs you want to delete:
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate", "w3:2000/01/rdf-schema#label", "dbp-dbo:abstract",
            "dbp-dbo:wikiPageID", "dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs", "dbp-dbp:image", "dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)


    def execute_task_bpe_corpus(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF = self.filter_applier.rebuild_by_movie(RDF)
            RDF = RDF[["Triple", "Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()


    def execute_task_rdf_mask(self):
        for RDF in self._get_cleaned_movie_rows():
            self.task_rdf_mask.write(RDF)
        self._end_file_handler()


    def execute_tasks_rdf_text(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF = self.filter_applier.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()


    def execute_task_rdf_completation(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self.filter_applier.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID", "Triple"]])
        self._end_file_handler()


    def execute_all_task(self):
        for RDF in self._get_cleaned_movie_rows():
            self.task_rdf_mask.write(RDF)

            RDF["Triple"] = self.filter_applier.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID", "Triple"]])

            RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID", "Triple", "Abstract"]])

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple", "Abstract"]])

        self._end_file_handler()


    def _end_file_handler(self):
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()


    def _get_cleaned_movie_rows(self):
        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            RDF = self.filter_applier.drop_na_from_dataset(RDF)
            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
            # other filters
            #
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
            if RDF.empty:
                continue
            RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING: THIS MUST BE DONE AFTER FILTERING BY FREQUENCY
            yield RDF


    def use_toy_dataset(self):
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

    def reduce_movie_list(self, starting_offset: int, ending_offset: int):
        self.filter_applier.reduce_movie_list(starting_offset, ending_offset)



# there are a lot of settings to manage
# you only need to change settings:
# in __init__ for file paths, frequency filter limits, and banned RelationshipURIs
# in use_toy_dataset, to change the toy dataset
# in _get_cleaned_movie_rows, to change how the pipeline behaves

# pipeline = Pipeline()

# pipeline.use_toy_dataset()
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
# pipeline.execute_all_task()
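A minimal driver for the pipeline above — it assumes the SQLite datawarehouse and the ./Assets/Dataset/Tmp/ directory already exist, and it simply mirrors the commented usage at the bottom of pipeline.py:

# Hypothetical driver (uses only what pipeline.py already exposes)
from Scripts.DataCleaning.pipeline import Pipeline

pipeline = Pipeline()
pipeline.use_toy_dataset()   # restrict to the 10 hand-picked movies
pipeline.execute_all_task()  # one pass writes the corpus and the three task CSVs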
@@ -1,65 +0,0 @@
-CREATE TABLE IF NOT EXISTS Movies (
-    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
-    MovieURI TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS WikiPageIDs (
-    MovieID INTEGER PRIMARY KEY,
-    PageID INTEGER UNIQUE NOT NULL,
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
-);
-
-
-CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
-    MovieID INTEGER PRIMARY KEY,
-    Abstract TEXT NOT NULL,
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
-);
-
-
-CREATE TABLE IF NOT EXISTS Origins (
-    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
-    OriginName TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS Subjects (
-    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
-    SubjectURI TEXT UNIQUE NOT NULL,
-    OriginID BIGINT NOT NULL,
-    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
-);
-
-
-CREATE TABLE IF NOT EXISTS Relationships (
-    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
-    RelationshipURI TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS Objects (
-    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
-    ObjectURI TEXT UNIQUE NOT NULL,
-    OriginID BIGINT NOT NULL,
-    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
-);
-
-CREATE TABLE IF NOT EXISTS RDFs (
-    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
-    MovieID INTEGER NOT NULL,
-    SubjectID INTEGER NOT NULL,
-    RelationshipID INTEGER NOT NULL,
-    ObjectID INTEGER NOT NULL,
-    UNIQUE(SubjectID, RelationshipID, ObjectID),
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
-    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
-    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
-    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
-);
-
-CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
-CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
-CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
-CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
@@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
 SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
 DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
 REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
+URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"

-MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
-PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
-SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
-DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
-REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")
+MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
+PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
+SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
+DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
+REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
+URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")

 CONN = sqlite3.connect(DB_NAME)
 CURS = CONN.cursor()

@@ -30,6 +33,7 @@ CURS = CONN.cursor()
 # MARK: SQL Definitions
 # Insert MovieURI

+
 def insertOrigin(curs: sqlite3.Cursor) -> bool:

     QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"

@@ -39,6 +43,7 @@ def insertOrigin(curs: sqlite3.Cursor) -> bool:
     except sqlite3.IntegrityError:
         return False

+
 def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:

     QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"

@@ -51,6 +56,7 @@ def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
     # in this case the real id is the first element of the tuple
     return originId[0]

+
 def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:

     QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"

@@ -82,6 +88,7 @@ def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
     except sqlite3.IntegrityError:
         return False

+
 def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:

     QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"

@@ -94,6 +101,7 @@ def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None
     # in this case the real id is the first element of the tuple
     return movieId[0]

+
 def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
     QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
     try:

@@ -102,6 +110,7 @@ def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> boo
     except sqlite3.IntegrityError:
         return False

+
 def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
     QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
     try:

@@ -110,6 +119,7 @@ def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
     except sqlite3.IntegrityError:
         return False

+
 def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
     QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
     try:

@@ -118,6 +128,7 @@ def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
     except sqlite3.IntegrityError:
         return False

+
 def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
     QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
     try:

@@ -126,6 +137,7 @@ def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
     except sqlite3.IntegrityError:
         return False

+
 def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:

     QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"

@@ -138,6 +150,7 @@ def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
     # in this case the real id is the first element of the tuple
     return subjectId[0]

+
 def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:

     QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"

@@ -150,6 +163,7 @@ def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | No
     # in this case the real id is the first element of the tuple
     return relationshipId[0]

+
 def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:

     QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"

@@ -162,12 +176,13 @@ def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
     # in this case the real id is the first element of the tuple
     return objectId[0]

+
 def insertRDF(
     curs: sqlite3.Cursor,
     movieId: int,
     subjectId: int,
     relationshipId: int,
-    objectId: int
+    objectId: int,
 ) -> bool:
     QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
     try:

@@ -176,6 +191,56 @@ def insertRDF(
     except sqlite3.IntegrityError:
         return False

+
+# UGLY: correct method to add cursor
+def insert_abbreviation(uri, abbreviation) -> bool:
+    QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
+    try:
+        CURS.execute(QUERY, [uri, abbreviation])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+
+# UGLY: correct method to add cursor
+def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
+    QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
+    try:
+        CURS.execute(QUERY, [object_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+
+# UGLY: correct method to add cursor
+def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
+    QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
+    try:
+        CURS.execute(QUERY, [relationship_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+
+# UGLY: correct method to add cursor
+def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
+    QUERY = (
+        "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
+    )
+    try:
+        CURS.execute(QUERY, [subject_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+
+# UGLY: correct method to add cursor
+def select_abbreviation_id(uri) -> int | None:
+    QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
+    CURS.execute(QUERY, [uri])
+    abbreviation_id = CURS.fetchone()
+    if not abbreviation_id:
+        return None
+
+    # in this case the real id is the first element of the tuple
+    return abbreviation_id[0]
+
+
 # MARK: Parsing
 def parseMovies():

@@ -208,7 +273,6 @@ def parseAbstract():
         ABSTRACT = row["text"]
         MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
-
         if MOVIE_ID is None:
             print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
             continue

@@ -216,10 +280,24 @@ def parseAbstract():
         insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)


+def parseAbbreviations():
+    URI_CSV = csv.DictReader(URI_ABBR_CSV_HANDLER)
+    for row in URI_CSV:
+
+        URI = row["uri"]
+        ABBREVIATION = row["abbreviation"]
+
+        insert_abbreviation(URI, ABBREVIATION)
+
+
 def parseRDF_Reverse():

     REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
-    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
+    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
+
+    if REVERSE_ORIGIN_ID is None:
+        return

     total = 0

     for row in REVERSE_CSV_READER:

@@ -236,7 +314,6 @@ def parseRDF_Reverse():
         RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
         MOVIE_ID = selectMovieId(CURS, OBJECT)
-
         skip = False

         # guard

@@ -259,17 +336,19 @@ def parseRDF_Reverse():
         if skip:
             continue

-        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
+        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
             total += 1

     print(total)



 def parseRDF_Dataset():

     DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
-    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
+    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
+
+    if DATASET_ORIGIN_ID is None:
+        return

     total = 0
     rdf_idx = 0

@@ -293,7 +372,6 @@ def parseRDF_Dataset():
         RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
         MOVIE_ID = selectMovieId(CURS, SUBJECT)
-
         skip = False

         # guard

@@ -316,31 +394,211 @@ def parseRDF_Dataset():
         if skip:
             continue

-        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
+        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
             total += 1

     print(total)


+def parseAbbr_Reverse():
+
+    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
+    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
+
+    if REVERSE_ORIGIN_ID is None:
+        return
+
+    total = 0
+
+    for row in REVERSE_CSV_READER:
+        SUBJECT = row["subject"]
+        RELATIONSHIP = row["relationship"]
+        OBJECT = row["object"]
+
+        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
+        OBJECT_ID = selectObjectId(CURS, OBJECT)
+        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
+
+        SUB_SECTIONS = SUBJECT.split("/")
+        REL_SECTIONS = RELATIONSHIP.split("/")
+        OBJ_SECTIONS = OBJECT.split("/")
+
+        SUB_ABBR_ID = None
+        REL_ABBR_ID = None
+        OBJ_ABBR_ID = None
+
+        skip = False
+
+        # guard
+        if SUBJECT_ID is None:
+            print(f"No SubjectId for {SUBJECT}")
+            skip = True
+
+        if OBJECT_ID is None:
+            print(f"No ObjectId for {OBJECT}")
+            skip = True
+
+        if RELATIONSHIP_ID is None:
+            print(f"No RelationshipId for {RELATIONSHIP}")
+            skip = True
+
+
+        if skip:
+            continue
+
+        if len(SUB_SECTIONS) > 4:
+            index = min(len(SUB_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
+                SUB_ABBR_ID = select_abbreviation_id(PATH)
+
+                if SUB_ABBR_ID is not None:
+                    if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(REL_SECTIONS) > 4:
+            index = min(len(REL_SECTIONS), 7)
+            while index > 2:
+                PATH = "/".join(REL_SECTIONS[0:index]) + "%"
+                REL_ABBR_ID = select_abbreviation_id(PATH)
+
+                if REL_ABBR_ID is not None:
+                    if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(OBJ_SECTIONS) > 4:
+            index = min(len(OBJ_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
+                OBJ_ABBR_ID = select_abbreviation_id(PATH)
+
+                if OBJ_ABBR_ID is not None:
+                    if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+    print(total)
+
+
+def parseAbbr_Dataset():
+
+    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
+    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
+
+    if DATASET_ORIGIN_ID is None:
+        return
+
+    total = 0
+    rdf_idx = 0
+    for row in DATASET_CSV_READER:
+        SUBJECT = row["subject"]
+        RELATIONSHIP = row["relationship"]
+        OBJECT = row["object"]
+
+        rdf_idx += 1
+
+        if rdf_idx % 100000 == 0:
+            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
+
+        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
+        OBJECT_ID = selectObjectId(CURS, OBJECT)
+        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
+
+        SUB_SECTIONS = SUBJECT.split("/")
+        REL_SECTIONS = RELATIONSHIP.split("/")
+        OBJ_SECTIONS = OBJECT.split("/")
+
+        SUB_ABBR_ID = None
+        REL_ABBR_ID = None
+        OBJ_ABBR_ID = None
+
+        skip = False
+
+        # guard
+        if SUBJECT_ID is None:
+            print(f"No SubjectId for {SUBJECT}")
+            skip = True
+
+        if OBJECT_ID is None:
+            print(f"No ObjectId for {OBJECT}")
+            skip = True
+
+        if RELATIONSHIP_ID is None:
+            print(f"No RelationshipId for {RELATIONSHIP}")
+            skip = True
+
+
+        if skip:
+            continue
+
+        if len(SUB_SECTIONS) > 4:
+            index = min(len(SUB_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
+                SUB_ABBR_ID = select_abbreviation_id(PATH)
+
+                if SUB_ABBR_ID is not None:
+                    if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(REL_SECTIONS) > 4:
+            index = min(len(REL_SECTIONS), 7)
+            while index > 2:
+                PATH = "/".join(REL_SECTIONS[0:index]) + "%"
+                REL_ABBR_ID = select_abbreviation_id(PATH)
+
+                if REL_ABBR_ID is not None:
+                    if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(OBJ_SECTIONS) > 4:
+            index = min(len(OBJ_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
+                OBJ_ABBR_ID = select_abbreviation_id(PATH)
+
+                if OBJ_ABBR_ID is not None:
+                    if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+    print(total)
+
+
 # MARK: Actual Code
 # parseMovies()
 # parseWikiPageId()
 # parseAbstract()
 # insertOrigin(CURS)
+# parseAbbreviations()
 # parseRDF_Reverse()
 # parseRDF_Dataset()
+# parseAbbr_Reverse()
+parseAbbr_Dataset()


 CONN.commit()
 CONN.close()



 MOVIES_CSV_HANDLER.close()
 PAGEID_CSV_HANDLER.close()
 SUMMARY_CSV_HANDLER.close()
 DATASET_CSV_HANDLER.close()
 REVERSE_CSV_HANDLER.close()
+URI_ABBR_CSV_HANDLER.close()


 """
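A standalone illustration of the prefix lookup that parseAbbr_Reverse and parseAbbr_Dataset rely on: the URI is shortened section by section until a LIKE pattern hits a stored abbreviation. The in-memory table and its single row are assumptions for the example:

# Sketch of the LIKE-based abbreviation lookup (example data, in-memory DB)
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE Abbreviations (AbbreviationID INTEGER PRIMARY KEY, URI TEXT, Abbreviation TEXT)")
conn.execute("INSERT INTO Abbreviations(URI, Abbreviation) VALUES ('http://dbpedia.org/resource/', 'dbp-dbr')")

uri = "http://dbpedia.org/resource/The_Dark_Knight"
sections = uri.split("/")  # the empty section after "http:" is kept, as in the diff
index = min(len(sections), 7)
while index > 3:
    pattern = "/".join(sections[0:index]) + "%"
    hit = conn.execute("SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?", [pattern]).fetchone()
    if hit is not None:
        print(pattern, "->", hit[0])  # matches on 'http://dbpedia.org/resource%'
        break
    index -= 1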
21
Scripts/Experiments/change_me/use_bpe_pipeline.py
Normal file
21
Scripts/Experiments/change_me/use_bpe_pipeline.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
from pathlib import Path
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
|
||||||
|
VOCABULARY_path = "Assets/Model/toy_10/toy_dictionary.json"
|
||||||
|
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
|
||||||
|
SPECIAL_TOKEN_LIST = [token.value for token in SpecialToken]

# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>dbp-dbp:title<OBJ>dbp-dbr:The_Dark_Knight<EOT>"
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
# INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan, from a screenplay co-written with his brother Jonathan. Based on the DC Comics superhero Batman, it is the sequel to Batman Begins (2005), and the second installment in The Dark Knight trilogy. The plot follows the vigilante Batman, police lieutenant James Gordon, and district attorney Harvey Dent, who form an alliance to dismantle organized crime in Gotham City. Their efforts are derailed by the Joker, an anarchistic mastermind who seeks to test how far Batman will go to save the city from chaos. The ensemble cast includes Christian Bale, Michael Caine, Heath Ledger, Gary Oldman, Aaron Eckhart, Maggie Gyllenhaal, and Morgan Freeman.Warner Bros. Pictures prioritized a sequel following the successful reinvention of the Batman film series with Batman Begins. Christopher and Batman Begins co-writer David S. Goyer developed the story elements, making Dent the central protagonist caught up in the battle between Batman and the Joker. In writing the screenplay, the Nolans were influenced by 1980s Batman comics and crime drama films, and sought to continue Batman Begins' heightened sense of realism. From April to November 2007, filming took place with a $185 million budget in Chicago and Hong Kong, and on sets in England. The Dark Knight was the first major motion picture to be filmed with high-resolution IMAX cameras. Christopher avoided using computer-generated imagery unless necessary, insisting on practical stunts such as flipping an 18-wheel truck and blowing up a factory.The Dark Knight was marketed with an innovative interactive viral campaign that initially focused on countering criticism of Ledger's casting by those who believed he was a poor choice to portray the Joker. Ledger died from an accidental prescription drug overdose in January 2008, leading to widespread interest from the press and public regarding his performance. When it was released in July, The Dark Knight received acclaim for its mature tone and themes, visual style, and performances—particularly that of Ledger, who received many posthumous awards including Academy, BAFTA, and Golden Globe awards for Best Supporting Actor, making The Dark Knight the first comic-book film to receive major industry awards. It broke several box-office records and became the highest-grossing 2008 film, the fourth-highest-grossing film to that time, and the highest-grossing superhero film of the time.Since its release, The Dark Knight has been assessed as one of the greatest superhero films ever, one of the best movies of the 2000s, and one of the best films ever made. It is considered the \"blueprint\" for many modern superhero films, particularly for its rejection of a typical comic-book movie style in favor of a crime film that features comic-book characters. Many filmmakers sought to repeat its success by emulating its gritty, realistic tone to varying degrees of success. The Dark Knight has been analyzed for its themes of terrorism and the limitations of morality and ethics. The United States Library of Congress selected it for preservation in the National Film Registry in 2020. A sequel, The Dark Knight Rises, concluded The Dark Knight trilogy in 2012.<SOTL>"
INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>"
# INPUT = "<ABS> Nolan,<SOTL>"

# 32: " "  (byte 32 is the space character)
TOKENANO = BPE.Classes.TokeNanoCore(VOCABULARY, SPECIAL_TOKEN_LIST)

print(f"input: {INPUT} \ninput length: {len(INPUT)}")
encoded = TOKENANO.encode(INPUT)
print(f"encode: {encoded} \nencode length: {len(encoded)}")
decoded = TOKENANO.decode(encoded)
print(f"decode: {decoded} \ndecode length: {len(decoded)}")
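A round-trip check would make this demo self-verifying; a minimal sketch, assuming TOKENANO.decode inverts TOKENANO.encode for inputs fully covered by VOCABULARY:

# Hypothetical addition: encode() followed by decode() should give back
# the original string whenever every piece of INPUT is in the vocabulary.
assert TOKENANO.decode(TOKENANO.encode(INPUT)) == INPUT, "round-trip mismatch"
print("round-trip OK")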
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
21
Scripts/Libs/CleaningPipeline/special_token.py
Normal file
@@ -0,0 +1,21 @@
from enum import Enum


class SpecialToken(Enum):
    # NOTE: subclassing as (Enum, str) throws an error; a plain Enum
    # with string values is used instead
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    CORPUS_END = "<END>"

    ## Task tokens
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"

    # BPE Training:
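Because SpecialToken is a plain Enum whose values are the literal marker strings, the token lists used by the trainers below fall out of a one-line comprehension; a minimal sketch of that pattern:

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

# Declaration order is preserved, so the list starts
# ["<SOTL>", "<SOT>", "<EOT>", "<SUBJ>", ...]
special_tokens = [token.value for token in SpecialToken]
print(special_tokens)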
144
Scripts/Libs/CleaningPipeline/sql_endpoint.py
Normal file
@@ -0,0 +1,144 @@
#######################################################
# This file stands as the endpoint to interact       #
# with the DB                                         #
#######################################################

# import sqlite3
import pandas as pd
from sqlalchemy import create_engine
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken


class SqlEndpoint():

    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
        # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
        # sqlite:///  -> 3 slashes: relative path
        # sqlite://// -> 4 slashes: absolute path
        # self.conn = self.sql_engine.connect().execution_options(stream_results=True)
        # it seems that SQLite doesn't support streaming cursors
        # PRAGMA tuning helps writes more than reads
        self.chunk_size_row = chunk_size_row  # not used for per-movie reads, since each chunk is a movie
        self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]

    def get_RDF(self) -> pd.DataFrame:

        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
        FROM RDFs
        INNER JOIN Subjects USING (SubjectID)
        INNER JOIN Relationships USING (RelationshipID)
        INNER JOIN Objects USING (ObjectID);
        """

        # self.CONN is deprecated; read through the SQLAlchemy engine instead
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame:
        """
        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        """

        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """

        # return pd.read_sql_query(QUERY, self.CONN, chunksize=500)  # sqlite3
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_chunked_abbreviated_dataset_with_start_token(self) -> pd.DataFrame:
        # DEPRECATED! SpecialToken() cannot be instantiated (it is an Enum)
        # and the "?" placeholders below are never bound to parameters.
        start_token = SpecialToken()
        QUERY = """
        SELECT
            MovieID,
            ? || SubjectURI AS SubjectURI,
            ? || RelationshipURI AS RelationshipURI,
            ? || ObjectURI AS ObjectURI,
            Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_abbreviated_dataset_by_movie_id(self):  # -> iter[pd.DataFrame]
        """
        Yields one DataFrame per movie (with all of its rows in the dataset).
        The retrieved RDFs are already abbreviated by the SQL parser.

        Yields:
            pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
        """
        # chunk by MovieID: the abstract is identical within a movie,
        # so some interesting per-movie logic becomes applicable
        # movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        # movie_ids = movie_list

        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?);
        """

        for movie_id in self.movie_ids:
            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))

    def get_movies_id_count(self) -> pd.DataFrame:
        """
        Gets the count of each movie in the dataset.

        Returns:
            pandas.DataFrame: [MovieID, Count]
        """
        QUERY = """
        SELECT MovieID, COUNT(*) AS Count
        FROM RDFs
        GROUP BY MovieID;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_relationship_count(self) -> pd.DataFrame:
        """
        Gets the count of each relationship in the dataset.

        Returns:
            pandas.DataFrame: [RelationshipURI, Count]
        """
        QUERY = """
        SELECT RelationshipURI, COUNT(*) AS Count
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        GROUP BY RelationshipURI;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)


if __name__ == "__main__":
    sql_endpoint = SqlEndpoint()
    for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
        print(pandas_row)
    # sql_endpoint.get_RDF()
    print("done")
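Note that pd.read_sql_query returns an iterator of DataFrames when chunksize is set, so the chunked getters stream the join instead of materializing it; a minimal consumption sketch, assuming the module path used in this branch:

from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

endpoint = SqlEndpoint()
# Each chunk is a DataFrame of at most chunk_size_row rows,
# so memory stays bounded even for the full joined dataset.
for chunk in endpoint.get_chunked_abbreviated_dataset():
    print(len(chunk), chunk["MovieID"].nunique())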
0
Scripts/Libs/Utils/.gitkeep
Normal file
9
Scripts/Libs/Utils/dataframe_interaction.py
Normal file
@@ -0,0 +1,9 @@
import pandas as pd


def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
    # Concatenate every cell of every row into one raw string.
    # Joining a list once avoids quadratic string appends on large frames.
    parts = []
    for row in DF.itertuples(index=False, name=None):
        parts.append("".join(map(str, row)))
    return "".join(parts)
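A tiny usage sketch for the helper above (the DataFrame contents are hypothetical):

import pandas as pd
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe

df = pd.DataFrame({"s": ["<SUBJ>a", "<SUBJ>b"], "o": ["<OBJ>x", "<OBJ>y"]})
# Cells are concatenated left-to-right, row by row:
print(get_raw_from_dataframe(df))  # "<SUBJ>a<OBJ>x<SUBJ>b<OBJ>y"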
101
Scripts/Training/bpe_trainer.py
Normal file
@@ -0,0 +1,101 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken


DEFAULT_CHUNK_SIZE = int(18e4)
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        cache_dir: str,
        output_file: str,
        resume_at: int,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        chunk_size: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.cache_dir = cache_dir
        self.output_file = output_file
        self.resume_at = resume_at
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.chunk_size = chunk_size
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.cache_dir,
        parsed_args.output_file,
        parsed_args.resume_at,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.chunk_size,
        parsed_args.debug_after,
    )  # type: ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTrainer(
        args.max_vocabulary,
        TOKEN_LIST,
        args.chunk_size,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    CACHE_DIR = Path(args.cache_dir)
    VOCABULARY_PATH = Path(args.output_file)

    print("Training BPE")

    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH,
        CACHE_DIR,
        resume_from_iter=args.resume_at
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")

    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
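Since get_args takes an argv-style list and uses parse_known_args, the trainer can also be driven programmatically, as mad_traininng.py does further down; a sketch with placeholder paths:

from Scripts.Training.bpe_trainer import get_args, train

argv = [
    "--input-file", "Assets/Dataset/Tmp/toy_corpus.txt",     # placeholder
    "--cache-dir", "Assets/Dataset/Tmp/cache",               # placeholder
    "--output-file", "Assets/Dataset/Tmp/vocabulary.json",   # placeholder
]
train(get_args(argv))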
96
Scripts/Training/bpe_trainer_pool.py
Normal file
@@ -0,0 +1,96 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken


DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        output_file: str,
        cache_file: str,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.output_file = output_file
        self.cache_file = cache_file
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.cache_file,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.debug_after,
    )  # type: ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTrainerPool(
        args.max_vocabulary,
        TOKEN_LIST,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    VOCABULARY_PATH = Path(args.output_file)
    CACHE_PATH = Path(args.cache_file)

    # Warm start: if a cached vocabulary exists, resume from it
    start_bpe = BPE.NanoSocratesBPE()
    if CACHE_PATH.is_file():
        voc = BPE.load_nanos_vocabulary(CACHE_PATH)
        start_bpe = BPE.NanoSocratesBPE(voc)

    print("Training BPE")

    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH,
        CACHE_PATH,
        start_bpe
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")

    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
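The warm start above assumes save_nanos_vocabulary and load_nanos_vocabulary are inverses; a quick round-trip sketch using only names already imported in this file (the temporary path is hypothetical, and so is the assumption that a fresh NanoSocratesBPE exposes a comparable .vocabulary):

from pathlib import Path
import Project_Model.Libs.BPE as BPE

tmp = Path("Assets/Dataset/Tmp/voc_roundtrip.json")  # hypothetical path
voc = BPE.NanoSocratesBPE().vocabulary
BPE.save_nanos_vocabulary(voc, tmp)
# A vocabulary that survives the round trip can seed the next training run.
assert BPE.load_nanos_vocabulary(tmp) == voc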
84
Scripts/Training/bpe_trainer_ram.py
Normal file
@@ -0,0 +1,84 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken


DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        output_file: str,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.output_file = output_file
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.debug_after,
    )  # type: ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTraineRam(
        args.max_vocabulary,
        TOKEN_LIST,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    VOCABULARY_PATH = Path(args.output_file)

    print("Training BPE")

    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")

    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
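One subtlety shared by all three trainer scripts: they pass sys.argv, program name included, into get_args. That works only because parse_known_args shunts unrecognized tokens into a leftover list instead of raising an error; a self-contained demonstration:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--input-file", "-i", required=True, type=str)
# argv[0] ("prog.py") is an unknown positional and lands in `rest`.
ns, rest = parser.parse_known_args(["prog.py", "--input-file", "corpus.txt"])
print(ns.input_file, rest)  # corpus.txt ['prog.py']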
12
Scripts/Training/dictionary_adjuster.py
Normal file
@@ -0,0 +1,12 @@
# Trim the "mad"-trained dictionary down to a short one
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
from pathlib import Path

DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"


big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
# Keep the first 31744 entries (dicts preserve insertion order in Python 3.7+)
big_dict = dict(list(big_dict.items())[:31744])

save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
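The trim relies on dict insertion order, so the earliest-learned merges survive the cut; why exactly 31744 entries (= 31 × 1024) is not stated in the source, presumably to leave headroom under the 32k vocabulary cap used by the trainers. A sketch that makes the ordering assumption explicit:

from pathlib import Path
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary

voc = load_nanos_vocabulary(Path("Assets/Dataset/Tmp/mad_cache.json"))
# Keep the first (oldest) entries; insertion order makes this deterministic.
trimmed = dict(list(voc.items())[:31744])
assert len(trimmed) <= 31744 and all(key in voc for key in trimmed)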
48
Scripts/Training/mad_traininng.py
Normal file
@@ -0,0 +1,48 @@
# Generate a corpus bigger than the last each time, without the old data,
# then let the BPE keep training on the same vocabulary.

from Scripts.DataCleaning.pipeline import Pipeline
from Scripts.Training.bpe_trainer_pool import train, get_args
from pathlib import Path
import os, shutil

CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"


def mad_corpus_generator(corpus_size: int, corpus_offset: int):
    print("New Corpus")
    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
    print("Pipeline Created")
    corpus_ending_offset = corpus_size + corpus_offset
    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
    print("Starting building corpus")
    pipe.execute_task_bpe_corpus()
    print("Corpus created")


def mad_bpe_trainer():
    argv = [
        "--input-file", CORPUS_PATH,
        "--output-file", VOCABULARY_PATH,
        "--cache-file", CACHE_PATH,
    ]
    args = get_args(argv)
    train(args)


def mad_hatter():
    # 10,100,500,1000,1500,2000,3000,4000,5000,10000
    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in film_list:

        # mad_corpus_generator(corpus_size, starting_offset)
        # starting_offset = starting_offset + corpus_size

        mad_bpe_trainer()
        # put the dict into the cache so the next round resumes from it
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))


mad_hatter()
897
Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
Normal file
@@ -0,0 +1,897 @@
[Excalidraw diagram data omitted: machine-generated JSON for the BPE pipeline sketch. Recoverable structure: a "dataset.db" box feeds a container holding "RDF_String" and "Abstract" boxes; that flows into a "Pandas" box, which fans out via arrows to three output boxes, "train.txt", "validation.txt", and "test.txt".]
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
@@ -0,0 +1,826 @@
[Excalidraw diagram data omitted: machine-generated JSON for the cleaning-pipeline class sketch. Recoverable structure: a database-cylinder icon labeled "dataset.db" and a UML-style text block reading "class Extract(Action): # Static: + type : ActionTypes = Extract; # Properties: - db_connection: Path, - query: str, - query_parameters: [str], - output_mapper: [str]".]
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "G1xIRcJgm34_NMEWQFFlW",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1419.5,
|
||||||
|
"y": 110,
|
||||||
|
"width": 253,
|
||||||
|
"height": 75,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aA",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 651981400,
|
||||||
|
"version": 256,
|
||||||
|
"versionNonce": 138082856,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758646570344,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Pipeline\n - actions: [Action]\n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Pipeline\n - actions: [Action]\n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TBVy3JbJCkbA9kjVEJ8lv",
|
||||||
|
"type": "text",
|
||||||
|
"x": 694,
|
||||||
|
"y": 100,
|
||||||
|
"width": 495,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 680960040,
|
||||||
|
"version": 560,
|
||||||
|
"versionNonce": 85012520,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649442239,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "an7KRTzWpCytKNKgHftKC",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1528.5,
|
||||||
|
"y": 365.5,
|
||||||
|
"width": 187,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aC",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1974317656,
|
||||||
|
"version": 306,
|
||||||
|
"versionNonce": 1574962264,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648154009,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "2pQ5EULirrWs_QZPbClhh",
|
||||||
|
"type": "text",
|
||||||
|
"x": 785,
|
||||||
|
"y": 332.5,
|
||||||
|
"width": 418,
|
||||||
|
"height": 375,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1402251560,
|
||||||
|
"version": 742,
|
||||||
|
"versionNonce": 680432168,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649532881,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "O0fso8DJqFfwJEzmpUikM",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1289,
|
||||||
|
"y": 195,
|
||||||
|
"width": 594,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aI",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1582329944,
|
||||||
|
"version": 459,
|
||||||
|
"versionNonce": 1080077144,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758647067031,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "v0kzO6vlBWOdJCV3yoG69",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1379.5,
|
||||||
|
"y": 718.5,
|
||||||
|
"width": 286,
|
||||||
|
"height": 175,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1462407976,
|
||||||
|
"version": 635,
|
||||||
|
"versionNonce": 1012998696,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649495598,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "WK34n9xeVxntypCtrlK6p",
|
||||||
|
"type": "text",
|
||||||
|
"x": 256.5,
|
||||||
|
"y": 787.5,
|
||||||
|
"width": 517,
|
||||||
|
"height": 175,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1166526296,
|
||||||
|
"version": 318,
|
||||||
|
"versionNonce": 1042162520,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649002604,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "NY9jyUFLFFCNPE2sh00SX",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1639,
|
||||||
|
"y": 606.5,
|
||||||
|
"width": 407,
|
||||||
|
"height": 200,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aP",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 20345896,
|
||||||
|
"version": 168,
|
||||||
|
"versionNonce": 627282472,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649426380,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "SkhaoW-3TTKDZzEii3Lf6",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1457.5,
|
||||||
|
"y": 955.5,
|
||||||
|
"width": 121,
|
||||||
|
"height": 50,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aQ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2071523672,
|
||||||
|
"version": 37,
|
||||||
|
"versionNonce": 105260376,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648834435,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Dump:\n -",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Dump:\n -",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
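The labels above sketch an Action/Pipeline design: a Pipeline holds a list of Action subclasses (Extract, Aggregate, Filter, Map, Dump), each of which reads its dependencies from a shared memory and may return rows. A minimal Python sketch of that shape follows; the Memory class, the Inline seed action, and all method bodies are illustrative assumptions, not the repository's implementation.

```python
# Minimal sketch of the Action/Pipeline design in the diagram above.
# Memory, Inline, and the method bodies are assumptions for illustration.
from __future__ import annotations

from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Any, Callable, Dict, List, Optional


class ActionTypes(Enum):
    EXTRACT = auto()
    AGGREGATE = auto()
    FILTER = auto()
    MAP = auto()
    DUMP = auto()


class Memory:
    """Hypothetical store for intermediate results, keyed by action name."""

    def __init__(self) -> None:
        self._tables: Dict[str, List[Dict[str, Any]]] = {}

    def put(self, name: str, rows: List[Dict[str, Any]]) -> None:
        self._tables[name] = rows

    def gather(self, names: List[str]) -> List[List[Dict[str, Any]]]:
        return [self._tables[n] for n in names]


@dataclass
class Action:
    type: ActionTypes
    name: str
    depends_on: List[str] = field(default_factory=list)

    def execute(self, mem: Memory) -> Optional[List[Dict[str, Any]]]:
        raise NotImplementedError


@dataclass
class Inline(Action):
    """Stand-in for Extract: returns fixed rows so the demo can run."""

    rows: List[Dict[str, Any]] = field(default_factory=list)

    def execute(self, mem: Memory) -> List[Dict[str, Any]]:
        return self.rows


@dataclass
class Filter(Action):
    compare: Callable[[Dict[str, Any]], bool] = lambda row: True
    output_mapper: List[str] = field(default_factory=list)

    def execute(self, mem: Memory) -> List[Dict[str, Any]]:
        (rows,) = mem.gather(self.depends_on)  # a Filter has one upstream table
        kept = [r for r in rows if self.compare(r)]
        if self.output_mapper:                 # optionally project columns
            return [{k: r[k] for k in self.output_mapper} for r in kept]
        return kept


@dataclass
class Pipeline:
    actions: List[Action] = field(default_factory=list)

    def run(self) -> Memory:
        mem = Memory()
        for action in self.actions:  # assumes actions are listed in dependency order
            out = action.execute(mem)
            if out is not None:      # execute may return rows or Void, as in the UML
                mem.put(action.name, out)
        return mem


if __name__ == "__main__":
    pipe = Pipeline(actions=[
        Inline(ActionTypes.EXTRACT, "people",
               rows=[{"name": "Ada", "age": 36}, {"name": "Kim", "age": 12}]),
        Filter(ActionTypes.FILTER, "adults",
               depends_on=["people"], compare=lambda r: r["age"] >= 18),
    ])
    print(pipe.run().gather(["adults"]))  # [[{'name': 'Ada', 'age': 36}]]
```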
634
Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
Normal file
@@ -0,0 +1,634 @@
[Excalidraw geometry omitted: an Excalidraw v2 document (source: https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor) whose ~630 lines are element metadata and coordinates for a second UML sheet. The recoverable text labels are:]

- Title: "Libs/CleaningPipeline/sql_endpoint"
- "dataset.db" (database icon)
- Class SqlEndpoint: - sql_engine; + movie_ids: list[int]; + get_abbreviated_dataset_by_movie_id
- Class PipelineApplier: - movie_frequence_filter: pd.DataFrame(); - rel_Frequence_Filter: pd.DataFrame(); - rel_banned_list: list[str]; + generate_movie_frequency_filter(); + generate_rel_frequency_filter(); + generate_list_relationship_filter(); + filter_by_movie_frequency(); + filter_by_relationship_frequency(); + delete_relationship_by_list_filter(); + delete_relationship_by_str(); + drop_na(); + rdf_add_special_token(); + group_triple_by_movie(); + build_by_movie(); static: + build_triple(), + build_incomplete_triple()
- Class Pipeline: - sql_endpoint: SqlEndpoint(); - task_rdf_mask_file_handler; - task_bpe_corpus_file_handler; - task_rdf_text_file_handler; - task_rdf_completation_file_handler; - Filter_applier: PipelineApplier(); - get_cleaned_movie_rows(); + execute_task_bpe_corpus(); + execute_task_rdf_mask(); + execute_task_rdf_text(); + execute_task_rdf_completation(); + execute_all_task(); + use_toy_dataset()
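Read together, these labels describe the cleaning pipeline's wiring: a Pipeline pulls rows per movie through SqlEndpoint (backed by dataset.db) and hands them to a PipelineApplier for frequency and banned-list filtering before the per-task corpus builds. Below is a rough Python sketch under those assumptions; the sqlite URL, table and column names, and all method bodies are invented for illustration, not the repository's code.

```python
# Hypothetical sketch of how SqlEndpoint, PipelineApplier, and Pipeline could
# fit together. The sqlite URL, table/column names, and method bodies are
# assumptions for illustration, not the repository's code.
from dataclasses import dataclass, field
from typing import List

import pandas as pd
from sqlalchemy import create_engine, text


class SqlEndpoint:
    """Wraps the SQLAlchemy engine over dataset.db (URL assumed)."""

    def __init__(self, url: str = "sqlite:///dataset.db") -> None:
        self._engine = create_engine(url)

    def get_abbreviated_dataset_by_movie_id(self, movie_id: int) -> pd.DataFrame:
        query = text("SELECT * FROM triples WHERE movie_id = :mid")  # table name assumed
        return pd.read_sql(query, self._engine, params={"mid": movie_id})


@dataclass
class PipelineApplier:
    rel_banned_list: List[str] = field(default_factory=list)

    def delete_relationship_by_list_filter(self, df: pd.DataFrame) -> pd.DataFrame:
        # Drop rows whose relationship is banned ("relationship" column assumed).
        return df[~df["relationship"].isin(self.rel_banned_list)]

    def drop_na(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.dropna()


class Pipeline:
    def __init__(self, endpoint: SqlEndpoint, applier: PipelineApplier) -> None:
        self.sql_endpoint = endpoint
        self.filter_applier = applier

    def execute_task_bpe_corpus(self, movie_ids: List[int]) -> pd.DataFrame:
        frames = [self.sql_endpoint.get_abbreviated_dataset_by_movie_id(m)
                  for m in movie_ids]
        df = pd.concat(frames, ignore_index=True)
        df = self.filter_applier.delete_relationship_by_list_filter(df)
        return self.filter_applier.drop_na(df)  # writing via the file handlers omitted
```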
22
docs/BPE.md
Normal file
@@ -0,0 +1,22 @@
# BPE

## Research Material

- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
- [Implementing a byte pair encoding (BPE) tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
||||||
|
|
||||||
@@ -15,3 +15,4 @@ tzdata==2025.2
|
|||||||
urllib3==2.5.0
|
urllib3==2.5.0
|
||||||
wheel==0.45.1
|
wheel==0.45.1
|
||||||
Wikipedia-API==0.8.1
|
Wikipedia-API==0.8.1
|
||||||
|
SQLAlchemy
|
||||||
|