Compare commits
74 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 1d23b9cc8b | |
| | 165290162c | |
| | 502016f843 | |
| | 845c63dbef | |
| | bbadd4c521 | |
| | c2f9344c82 | |
| | 25f3a5d221 | |
| | 149deb407d | |
| | 8a21cb1b73 | |
| | d2a3dfe90f | |
| | 0f95aeb122 | |
| | 0ee6e48004 | |
| | 55e0d2ac23 | |
| | 9c5f42153f | |
| | c74689d01d | |
| | 51f491d033 | |
| | c5c0c61f79 | |
| | 6b9cb7cd35 | |
| | e8894504c6 | |
| | 845d645348 | |
| | 09f7b39512 | |
| | 070dc1b744 | |
| | 8121c75a09 | |
| | a5b8692a77 | |
| | 7c935d2700 | |
| | a1d143187d | |
| | 0eef2148a9 | |
| | 856bd8909c | |
| | 2e595a3a23 | |
| | 2194cc7b4f | |
| | 1eae8582b2 | |
| | eadba1fb82 | |
| | aa765b4555 | |
| | 17d82f0a4e | |
| | 0975c19e69 | |
| | 3fe4e45ceb | |
| | d19426fa62 | |
| | 63baf29805 | |
| | b80b4e4112 | |
| | 7cfaf601b4 | |
| | fbbe6226bb | |
| | b3d444979f | |
| | 66bcf6e55f | |
| | dbf1d99408 | |
| | 97bac464f3 | |
| | 9a8e726d74 | |
| | 7ab9b0358e | |
| | 30c2938d29 | |
| | 76f24d4eb0 | |
| | 89a0a1f4bb | |
| | ccacea18d8 | |
| | b09bd4acba | |
| | c9032cab09 | |
| | 7020c9e683 | |
| | 2fe1ce9e9a | |
| | 18fc2ba9d8 | |
| | 5acee1d1a5 | |
| | 2e36753da4 | |
| | 564b0d712e | |
| | e433941405 | |
| | b46df4f91a | |
| | d179e01971 | |
| | b071145f6e | |
| | ed0255e99b | |
| | 3e8b5c5579 | |
| | 8db35732f9 | |
| | 9552d61f8d | |
| | be8a87ce01 | |
| | 5801a819e9 | |
| | 3f48b5c428 | |
| | 9972ab8a51 | |
| | 90012285b5 | |
| | 1bbb4a0999 | |
| | ee0aa583d5 | |
.gitignore (vendored, 1 line added)

```diff
@@ -254,4 +254,5 @@ $RECYCLE.BIN/
 # ---> Custom
 **/Tmp/**
+**/cache/**
 !**/.gitkeep
```
.vscode/launch.json (vendored, new file, 16 lines)

```jsonc
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File with Arguments",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}"
        }
    ]
}
```
.vscode/settings.json (vendored, 24 lines replaced by 55)

@@ -1,24 +1,55 @@

Content after the change (the previous settings are kept as a commented-out block at the end of the file):

```jsonc
{
    // Always treat the project root as the working dir for Jupyter
    "jupyter.notebookFileRoot": "${workspaceFolder}",

    // When you click "Run Python File in Terminal", DON'T cd into the file's folder
    "python.terminal.executeInFileDir": false,

    // Start new integrated terminals at the project root
    "terminal.integrated.cwd": "${workspaceFolder}",

    // Make pytest run from the root without needing a pytest.ini
    "python.testing.pytestEnabled": true,
    "python.testing.cwd": "${workspaceFolder}",
    "python.testing.pytestArgs": [
        "src/test"
    ],

    // Help Pylance resolve imports like `from src...` without red squiggles
    "python.analysis.extraPaths": [
        "${workspaceFolder}"
    ],

    // For linux
    "terminal.integrated.env.linux": {
        "PYTHONPATH": "${workspaceFolder}"
    },

    // For OSX
    "terminal.integrated.env.osx": {
        "PYTHONPATH": "${workspaceFolder}"
    },

    // For Windows
    "terminal.integrated.env.windows": {
        "PYTHONPATH": "${workspaceFolder}"
    },

    "python.analysis.typeCheckingMode": "standard"
}

// {
//     // Always treat the project root as the working dir for Jupyter
//     "jupyter.notebookFileRoot": "${workspaceFolder}",
//
//     // When you click "Run Python File in Terminal", DON'T cd into the file's folder
//     "python.terminal.executeInFileDir": false,
//
//     // Start new integrated terminals at the project root
//     "terminal.integrated.cwd": "${workspaceFolder}",
//
//     // Ensure Python can import from the project root no matter which file you run
//     // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
//     "terminal.integrated.env.windows": {
//         "PYTHONPATH": "${workspaceFolder}"
//     },
//
//     // Make pytest run from the root without needing a pytest.ini
//     "python.testing.pytestEnabled": true,
//     "python.testing.cwd": "${workspaceFolder}",
//     "python.testing.pytestArgs": ["src/test"],
//
//     // Help Pylance resolve imports like `from src...` without red squiggles
//     "python.analysis.extraPaths": ["${workspaceFolder}"]
// }
```
Assets/Model/toy_10/README.md (BIN, LFS, new file): binary file not shown.

Assets/Model/toy_10/toy_dictionary.json (BIN, LFS, new file): binary file not shown.
Project_Model/Libs/BPE/Classes/Encoder.py (new file, 4 lines)

```python
from abc import ABC


class Encoder(ABC):
    pass
```
Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py (new file, 164 lines)

```python
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


class NanoSocraTraineRam:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        current_iteration = 0
        data = self.__gather_data_from_file(path)

        while not exit:

            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            _, data, last_memory = self.__round_train(BPE, data)

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",  # type: ignore (pretty sure it's not None)
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        DATA_LEN = len(data)
        NEW_DATA = []

        counter = 0
        memory = NanoSocratesBatchMemoryBPE({}, 0)
        while len(data) > 0:
            counter += 1
            last_batch = len(data) == 1

            piece = data.pop()

            bpe, memory, output = bpe.fit(piece, memory, last_batch)

            if counter % int(1E6) == 0:
                print(f"Fitted: {counter}/{DATA_LEN}")

            if len(output) < 2:
                continue

            NEW_DATA.append(output)

        return (bpe, NEW_DATA, memory)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:

        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        DATA: list[list[int]] = []

        FILE = open(path, "r", encoding="utf-8")
        file_string = FILE.read()
        FILE.close()

        for piece, type in SPLITTER.split_text(file_string):

            if type != TokenType.BPE:
                continue

            int_list = self.__make_list_ids(piece)
            DATA.append(int_list)

        return DATA

    def __increment_counter(self, counter: int):

        # What if overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str):
        return list(corpus.encode("utf-8"))
```
Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py (new file, 248 lines)

```python
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


class NanoSocraTrainer:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        chunk_size: int,
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__chunk_size = chunk_size
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        cache_dir: Path,
        bpe: NanoSocratesBPE | None = None,
        resume_from_iter: int = 0,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if not cache_dir.is_dir():
            raise NotADirectoryError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        cached = False
        current_iteration = 0
        input_path = path

        NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0

        PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
        MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)

        if resume_from_iter != 0:
            cached = True
            current_iteration = resume_from_iter
            input_path = next(PATH_GEN)
            # UGLY: fixes a bug immediately, unfortunately
            _, _ = next(MEMORY_PATH_GEN)
            _, voc_cache_path = next(MEMORY_PATH_GEN)
            vocabulary = load_nanos_vocabulary(voc_cache_path)
            BPE = NanoSocratesBPE(vocabulary)

        while not exit:

            out_path = next(PATH_GEN)
            internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)

            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            FILE = open(out_path, "w")

            last_memory = None

            for _, memory, output in self.__round_train(input_path, BPE, cached):
                last_memory = memory
                FILE.write(output)

            FILE.close()

            internal_cache = {
                "finished_iter": current_iteration,
                "read_from": f"{input_path}",
                "wrote_to": f"{out_path}",
                "at": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S.%f"
                )[:-3],
            }

            VOCABULARY = BPE.vocabulary

            save_json(internal_cache, internal_cache_path)
            save_nanos_vocabulary(VOCABULARY, vocabulary_cache)

            cached = True
            input_path = out_path

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",  # type: ignore (pretty sure it's not None)
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):

        CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        BPE = bpe
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))

        for chunk, last_chunk in CHUNKER_GENERATOR:

            PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))

            for piece, last_piece in PIECE_GENERATOR:

                LAST_BATCH = last_chunk and last_piece
                PIECE, TOKEN_TYPE = piece

                if TOKEN_TYPE != TokenType.BPE:
                    _, _, out = BPE.fit([], memory, LAST_BATCH)
                    yield (BPE, memory, PIECE)
                    continue

                PIECE_DATA = self.__make_list_ids(PIECE, cached)

                _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)

                OUT_STRING = f"{out}"
                yield (BPE, memory, OUT_STRING)

    def __increment_counter(self, counter: int):

        # What if overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str, cached: bool):

        if not cached:
            return list(corpus.encode("utf-8"))

        REDUCED_CORPUS_LEN = len(corpus) - 1

        # Skip these chars: "[" "]"
        INTS = corpus[1:REDUCED_CORPUS_LEN]
        INT_LIST = list(map(int, INTS.split(",")))
        return INT_LIST

    def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):

        CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
        CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"

        switch = True

        if initial_iteration % 2 == 1:
            switch = False

        del initial_iteration

        while True:
            if switch:
                yield CORPUS_TMP_1
            else:
                yield CORPUS_TMP_2
            switch = not switch

    def __switch_memory(self, cache_path: Path, initial_iteration: int):

        INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
        INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"

        VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
        VOCAB_TMP_2 = cache_path / "voc-tmp2.json"

        switch = False

        if initial_iteration % 2 == 1:
            switch = True

        del initial_iteration

        while True:
            if switch:
                yield (INTERNAL_TMP_1, VOCAB_TMP_1)
            else:
                yield (INTERNAL_TMP_2, VOCAB_TMP_2)
            switch = not switch
```
Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py (new file, 280 lines)

```python
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):

    bpe, data = object

    NEW_DATA: list[list[int]] = []

    memory = NanoSocratesBatchMemoryBPE({}, 0)

    while len(data) > 0:

        piece = data.pop()

        bpe, memory, output = bpe.fit(piece, memory, False)

        if len(output) < 2:
            continue

        # We are sure of its type
        NEW_DATA.append(piece)  # type: ignore

    return (bpe, NEW_DATA, memory)


def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):

    bpe, data = object

    NEW_DATA: list[list[int]] = []

    for index, piece in zip(range(0, len(data)), data):
        output = bpe.encode_intermediate(piece)

        if len(output) < 2:
            continue

        # We are sure of its type
        NEW_DATA.append(data[index])  # type: ignore

    return NEW_DATA


class NanoSocraTrainerPool:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    # TODO: add a resume function
    def trainBPE(
        self,
        path: Path,
        cache_file: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if not cache_file.is_file():
            file = cache_file.open("w")
            file.close()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        current_iteration = 0
        data = self.__gather_data_from_file(path)
        data = self.__encode_from_cache(BPE, data)

        while not exit:

            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            start = time.time_ns()
            _, data, last_memory = self.__round_train(BPE, data)
            end = time.time_ns()
            NEW_VOC_SIZE = BPE.vocabulary_size

            VOCABULARY = BPE.vocabulary

            save_nanos_vocabulary(VOCABULARY, cache_file)

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
                        f"\tTime elapsed: {(end - start)/1E9}s",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        NEW_DATA: list[list[int]] = []

        MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        fit_funct = split_fit
        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise Exception()

        VOCABULARY = bpe.vocabulary

        data_chunks = split(data, CPU_COUNT)
        JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]

        JOB_RESULTS: list[
            tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
        ]

        with Pool() as pool:
            JOB_RESULTS = pool.map(fit_funct, JOBS)

        for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
            _, job_output, job_memory = res
            NEW_DATA.extend(job_output)

            for key, value in job_memory.frequencies.items():
                frequency = MEMORY.frequencies.get(key)

                if frequency is None:
                    frequency = 0
                    MEMORY.frequencies[key] = 0

                frequency += value
                MEMORY.frequencies[key] = frequency

            del job_output
            del job_memory

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        # Get new token
        bpe.fit([], MEMORY, True)

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return (bpe, NEW_DATA, MEMORY)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:

        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        DATA: list[list[int]] = []

        FILE = open(path, "r", encoding="utf-8")
        file_string = FILE.read()
        FILE.close()

        for piece, type in SPLITTER.split_text(file_string):

            if type != TokenType.BPE:
                continue

            int_list = self.__make_list_ids(piece)
            DATA.append(int_list)

        return DATA

    def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        NEW_DATA: list[list[int]] = []

        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise Exception()

        VOCABULARY = bpe.vocabulary

        data_chunks = split(data, CPU_COUNT)
        JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]

        JOB_RESULTS: list[list[list[int]]]

        with Pool() as pool:
            JOB_RESULTS = pool.map(split_encode, JOBS)

        for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
            job_output = res
            NEW_DATA.extend(job_output)

            del job_output

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return NEW_DATA

    def __increment_counter(self, counter: int):

        # What if overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str):
        return list(corpus.encode("utf-8"))
```
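The `split` helper above distributes the corpus across worker processes in nearly equal slices. A minimal standalone sketch of its behaviour (the definition is copied verbatim from the file above):

```python
# Copied from NanoSocraTrainerPool.py: divide a sequence into n nearly equal slices.
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


# 10 items over 3 workers: the first slice absorbs the remainder.
print(list(split(list(range(10)), 3)))
# [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```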
Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py (new file, 219 lines)

```python
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException


# ABOUT THE DICTIONARY:
# the string is converted into UTF-8 bytes, that is: each char is represented with 1 to 4 bytes.
# each byte gets cast to an integer, such that, if an integer has a value lower than 256,
# then it represents a UTF-8 byte, otherwise it is a token ID.
class NanoSocratesBatchMemoryBPE:
    """Memory for batch training. Keeps token-pair frequencies and the merge threshold."""

    def __init__(
        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
    ) -> None:

        self.frequencies = frequencies
        self.merge_treshold = merge_treshold


class NanoSocratesBPE(Encoder):

    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
        super().__init__()

        self.__vocabulary: dict[tuple[int, int], int] = {}
        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}

        if vocabulary is None:
            return

        for key, value in vocabulary.items():
            if value < 256:
                raise OutOfDictionaryException()
            # values under 256 are reserved for single (unpaired) bytes
            # TODO: check if they are in order
            self.__vocabulary[key] = value
            self.__reverse_vocabulary[value] = key

    @property
    def vocabulary_size(self):
        return len(self.__vocabulary) + 256

    @property
    def vocabulary(self):
        return self.__vocabulary

    @property
    def __next_id(self) -> int:
        """
        Gets the next free token ID.
        Returns:
            int:
        """
        return self.vocabulary_size

    # TODO: implement fit
    def fit(
        self,
        chunk_data: list[int],
        memory: NanoSocratesBatchMemoryBPE,
        last_batch: bool,
    ):

        ENCODED_CHUNK = self.encode_intermediate(chunk_data)
        DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1

        # update the frequency of each pair of elements
        for i in range(0, DATA_LEN_BEFORE_LAST):
            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])

            frequency = memory.frequencies.get(CANDIDATE_COUPLE)

            # Initialize frequency
            if frequency is None:
                frequency = 0
                memory.frequencies[CANDIDATE_COUPLE] = 0

            frequency += 1
            memory.frequencies[CANDIDATE_COUPLE] = frequency

        if not last_batch:
            return (self, memory, ENCODED_CHUNK)

        if len(memory.frequencies) < 1:
            return (self, memory, ENCODED_CHUNK)

        FREQUENCIES = memory.frequencies
        MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
        FREQUENCY = FREQUENCIES[MAX_COUPLE]

        if FREQUENCY < memory.merge_treshold:
            return (self, memory, ENCODED_CHUNK)

        self.__learn_word(MAX_COUPLE)

        return (self, memory, ENCODED_CHUNK)

    def encode(self, piece: str) -> list[int]:
        """Encode a string into token IDs: first converts it to UTF-8 bytes, then passes the list of integers to encode_intermediate()
        Args:
            piece (str):
        Returns:
            list[int]:
        """
        converted_piece = list(piece.encode("utf-8"))
        return self.encode_intermediate(converted_piece)

    def encode_intermediate(self, piece: list[int]) -> list[int]:
        """Encode a piece (as a list of integers) until no further merges apply
        Args:
            piece (list[int]): piece to encode
        Returns:
            list[int]: encoded piece
        """
        current_piece = piece
        new_piece = self.__round_encode(current_piece)

        # keep encoding until the piece stops shrinking
        while len(current_piece) != len(new_piece):
            current_piece = new_piece
            new_piece = self.__round_encode(current_piece)

        return current_piece

    def __round_encode(self, piece: list[int]):
        """A single encoding pass that traverses the whole piece. Multiple rounds are needed for a full encode: \n
        1) "ABAB" -> "XX"
        2) "XX" -> "Y"
        Args:
            piece (list[int]): the piece to encode, as a list of integers

        Returns:
            (list[int]): the piece after one encoding pass
        """

        if len(piece) == 1:
            return piece

        PIECE_LENGTH = len(piece) - 1
        NEW_PIECE: list[int] = []

        index = 0
        while index < PIECE_LENGTH:

            CANDIDATE_WORD = (
                piece[index],
                piece[index + 1],
            )  # take a tuple of consecutive elements [int]
            CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)

            # if there is no token to substitute the tuple, append the first element
            if CANDIDATE_TOKEN is None:
                NEW_PIECE.append(piece[index])
                index += 1

                # if the latter element of the tuple is the last element of the piece, append it
                if index == PIECE_LENGTH:
                    NEW_PIECE.append(piece[index])

                continue

            # in this case there was a candidate token to substitute the pair of elements
            NEW_PIECE.append(CANDIDATE_TOKEN)

            index += 2

            if index == PIECE_LENGTH:
                NEW_PIECE.append(piece[index])

        return NEW_PIECE

    # TODO: Remake decode to take a list of token IDs
    def decode(self, token_ids: list[int]) -> str:

        # deque: double ended queue
        token_stack: deque[int] = deque(token_ids)
        UTF_8_STRING_ARR: bytearray = bytearray()

        while len(token_stack) > 0:
            TOKEN_ID = token_stack.popleft()

            if TOKEN_ID < 256:
                UTF_8_STRING_ARR.append(TOKEN_ID)
                continue

            left_token, right_token = self.__token_decode(TOKEN_ID)

            token_stack.appendleft(right_token)
            token_stack.appendleft(left_token)

        return UTF_8_STRING_ARR.decode("utf-8")

    def __token_decode(self, token_id: int) -> tuple[int, int]:

        CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)

        if CANDIDATE_DECODED is None:
            raise OutOfDictionaryException()

        return CANDIDATE_DECODED

    def __learn_word(self, words: tuple[int, int]):
        """Learn a new pair in the vocabulary
        Args:
            words (tuple[int, int]): the pair of elements to substitute with a new token ID

        Raises:
            DuplicateWordException: raised if the pair already has a token ID in the dictionary
        """
        ID = self.__next_id

        DUPLICATE = self.__vocabulary.get(words)

        if DUPLICATE is not None:
            raise DuplicateWordException()

        self.__vocabulary[words] = ID
        self.__reverse_vocabulary[ID] = words
```
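A minimal usage sketch of `NanoSocratesBPE`, built on the toy vocabulary that the tests further down in this diff use; token IDs below 256 are raw UTF-8 bytes, higher IDs are learned merges:

```python
import Project_Model.Libs.BPE as BPE

# (97, 98) is ("a", "b"); 256, 257, 258 are merge tokens learned on top of the byte range.
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}

bpe = BPE.NanoSocratesBPE(VOCABULARY)

print(bpe.encode("abababab"))       # [258]: "ab" -> 256, (256, 256) -> 257, (257, 257) -> 258
print(bpe.decode([258, ord("c")]))  # "ababababc": IDs below 256 pass through as plain bytes
```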
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py (new file, 70 lines)

```python
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException


class NanoSocratesChunker:

    def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
        self.__max_size: int = max_size
        self.__special_token_regex: re.Pattern = special_token_regex
        self.__residual: str = ""

    # max theoretical size of chars
    # between special tokens:
    # - min: size - len(longest_token)
    # - MAX: size - len(shortest_token)
    def chunk(self, file_path: Path):
        # read_file
        FILE = open(file_path, "r", encoding="utf-8")
        exit = False

        while not exit:
            REMAINING_SIZE = self.__max_size - len(self.__residual)
            READ_SIZE = min(self.__max_size, REMAINING_SIZE)
            FILE_CHUNK = FILE.read(READ_SIZE)

            if len(FILE_CHUNK) == 0:
                exit = True
                continue

            CHUNK = self.__append_residuals(FILE_CHUNK)

            boundaries = self.__identify_boudaries(CHUNK)

            if boundaries is None:

                # boundaries not found in 2 chunks,
                if len(CHUNK) > self.__max_size - 1:
                    raise DelimiterNotFoundException()

                if exit:
                    yield CHUNK

                self.__set_residual(0, CHUNK)
                continue

            start, end = boundaries
            self.__set_residual(end, CHUNK)
            yield CHUNK[start:end]

    def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:

        end = 0

        for match in self.__special_token_regex.finditer(corpus):
            # print(match)
            end = match.end()

        if end == 0:
            return None

        return (0, end)

    def __append_residuals(self, corpus: str) -> str:
        RESIDUAL = self.__residual
        self.__residual = ""
        return RESIDUAL + corpus

    def __set_residual(self, index: int, corpus: str):
        self.__residual = corpus[index:]
```
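A small sketch of how the chunker behaves, assuming an `<EOT>` delimiter and a throwaway corpus file (the file path here is illustrative). As written, each yielded chunk ends at the last special token found in the current window, and text after the final delimiter stays in the internal residual:

```python
import re
import tempfile
from pathlib import Path

import Project_Model.Libs.BPE as BPE

# Hypothetical scratch corpus: two segments terminated by <EOT>, plus a trailing "cc".
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("aa<EOT>bb<EOT>cc")
    corpus_path = Path(f.name)

chunker = BPE.NanoSocratesChunker(8, re.compile("<EOT>"))

for chunk in chunker.chunk(corpus_path):
    print(repr(chunk))
# 'aa<EOT>'
# 'bb<EOT>'
# the trailing 'cc' has no closing delimiter and is not yielded
```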
Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py (new file, 64 lines)

```python
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException


class NanoSocratesSpecial(Encoder):

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
    ) -> None:

        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        if len(special_tokens) == 0:
            return

        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):

            CANDIDATE_ID = self.__bpe_offset + index + 1
            self.__vocabulary[TOKEN] = CANDIDATE_ID
            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

    @property
    def __next_id(self):
        BPE_OFFSET = self.__bpe_offset
        VOC_LENGTH = len(self.__vocabulary)
        return BPE_OFFSET + VOC_LENGTH + 1

    @property
    def vocabulary(self) -> dict[str, int]:
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str):
        CANDIDATE_INDEX = self.__next_id
        self.__vocabulary[word] = CANDIDATE_INDEX
        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

    def encode(self, word: str) -> list[int]:
        ID = self.__vocabulary.get(word)

        if ID is None:
            raise OutOfDictionaryException()

        return [ID]

    def decode(self, token_id: list[int]) -> str:

        if len(token_id) != 1:
            raise OutOfDictionaryException()

        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD
```
Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py (new file, 98 lines)

```python
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType


class NanoSocratesSplitter:

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        # attention: the regex is already compiled
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text using the given regex
        Args:
            corpus (str): the whole corpus string to split
        Yields:
            Generator[tuple[str, TokenType]]: each step returns a piece of the split text: the string and its TokenType. \n
            TokenType describes whether the string goes to the BPE or is a special token [BPE, SPECIAL]
        """

        bpe_start = 0
        bpe_end = len(corpus)  # this can be deleted!

        for special_token_start, special_token_end in self.__find_boundaries(corpus):

            # FIND BPE
            bpe_end = special_token_start
            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # now save the new bpe start point
            # it will be used in the next iteration
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find the start and end (exclusive) of each special token
        Args:
            corpus (str): the string where the special tokens will be searched
        Yields:
            Generator[tuple[int, int]]: note the end is exclusive
        """
        for match in self.__special_token_regex.finditer(corpus):
            start = match.start()
            end = match.end()

            yield (start, end)

        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)

    def __split_words(self, bpe_piece: str) -> Generator[str]:

        END_OF_STRING = len(bpe_piece)
        bound_start = 0
        bound_end = END_OF_STRING + 1
        for i in range(0, END_OF_STRING):

            CANDIDATE_CHAR = bpe_piece[i]

            if CANDIDATE_CHAR != " ":
                continue

            bound_end = i

            yield bpe_piece[bound_start:bound_end]

            bound_start = bound_end
            bound_end = END_OF_STRING + 1

        yield bpe_piece[bound_start:bound_end]

    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

        not_special_token_list: list[int] = []
        for token in corpus:
            if token > self.__max_bpe_token_id:

                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)
```
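A short sketch of `split_text` in action, assuming the special-token regex is built with `special_regex_maker` from this same package:

```python
import Project_Model.Libs.BPE as BPE

REGEX = BPE.special_regex_maker(["<ABS>", "<END>"])
splitter = BPE.NanoSocratesSplitter(REGEX)

# Special tokens come out whole; the text between them is split on spaces,
# with the separating space kept on the following word.
for piece, kind in splitter.split_text("<ABS>hello world<END>"):
    print(repr(piece), kind)
# '<ABS>' TokenType.SPECIAL
# 'hello' TokenType.BPE
# ' world' TokenType.BPE
# '<END>' TokenType.SPECIAL
```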
Project_Model/Libs/BPE/Classes/TokeNano.py (new file, 8 lines)

```python
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore


class TokeNano:

    def __init__(self):

        pass
```
Project_Model/Libs/BPE/Classes/TokeNanoCore.py (new file, 62 lines)

```python
from pathlib import Path

from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial

from ..Utils import special_regex_maker
from ..Enums import TokenType


class TokeNanoCore:
    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
        # special_vocabulary: dict[str, int]
    ):

        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)

        SPECIAL_REGEX = special_regex_maker(special_token_list)
        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size

        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
        self.__special_encoder = NanoSocratesSpecial(
            BPE_VOCABULARY_SIZE, special_token_list
        )

    def encode(self, corpus: str) -> list[int]:
        output: list[int] = []
        for piece, token_type in self.__splitter.split_text(corpus):

            if token_type == TokenType.SPECIAL:
                ENCODED_PIECE = self.__special_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

        return output

    def decode(self, corpus: list[int]) -> str:
        output_str = ""
        for token, token_type in self.__splitter.split_tokens(corpus):
            # token is a single-element list if special, a list of integers otherwise
            if token_type == TokenType.SPECIAL:
                output_str += self.__special_encoder.decode(token)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                output_str += self.__bpe_encoder.decode(token)
                continue
        return output_str
```
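A hedged round-trip sketch with `TokeNanoCore`, reusing the toy vocabulary from the tests. Special-token IDs are allocated right after the BPE range, and the input ends with a special token because `split_tokens` only flushes a pending BPE run when it reaches one:

```python
import Project_Model.Libs.BPE as BPE

VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
SPECIALS = ["<RDF2TXT>", "<END>"]

tokenizer = BPE.TokeNanoCore(VOCABULARY, SPECIALS)

ids = tokenizer.encode("<RDF2TXT>ab ab<END>")
print(ids)                    # e.g. [260, 256, 32, 256, 261] with this toy vocabulary
print(tokenizer.decode(ids))  # "<RDF2TXT>ab ab<END>"
```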
Project_Model/Libs/BPE/Classes/__init__.py (new file, 18 lines)

```python
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
from .TokeNanoCore import TokeNanoCore


__all__ = [
    "NanoSocratesChunker",
    "NanoSocratesSplitter",
    "NanoSocratesBPE",
    "NanoSocraTrainer",
    "NanoSocraTraineRam",
    "NanoSocraTrainerPool",
    "TokeNanoCore"
]
```
Project_Model/Libs/BPE/Enums/SpecialToken.py (new file, 21 lines)

```python
from enum import Enum


class SpecialToken(Enum):
    # (Enum, str) -> throws an error
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    CORPUS_END = "<END>"

    ## Tasks' Token
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"

    # BPE Training:
```
Project_Model/Libs/BPE/Enums/TokenType.py (new file, 6 lines)

```python
from enum import Enum, auto


class TokenType(Enum):

    SPECIAL = auto()
    BPE = auto()
```
Project_Model/Libs/BPE/Enums/__init__.py (new file, 1 line)

```python
from .TokenType import TokenType
```
Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py (new file, 4 lines)

```python
class DelimiterNotFoundException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/DuplicateWordException.py (new file, 4 lines)

```python
class DuplicateWordException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py (new file, 4 lines)

```python
class OutOfDictionaryException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/SentenceTooLongException.py (new file, 4 lines)

```python
class SentenceTooLongException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/__init__.py (new file, 11 lines)

```python
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException


__all__ = [
    "DelimiterNotFoundException",
    "OutOfDictionaryException",
    "DuplicateWordException",
    "SentenceTooLongException"
]
```
Project_Model/Libs/BPE/Utils/__init__.py (new file, 13 lines)

```python
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
from .special_regex_maker import special_regex_maker


__all__ = [
    "special_regex_maker",
    "iterator_with_checks",
    "save_nanos_vocabulary",
    "load_nanos_vocabulary",
    "save_json", "load_json"
]
```
Project_Model/Libs/BPE/Utils/json_utils.py (new file, 18 lines)

```python
import json
from pathlib import Path


def save_json(dictionary: dict, path: Path):

    json_string = json.dumps(dictionary)
    FILE = open(path, "w")
    FILE.write(json_string)
    FILE.close()


def load_json(path: Path) -> dict:
    FILE = open(path, "r")
    json_string = FILE.read()
    FILE.close()

    return json.loads(json_string)
```
Project_Model/Libs/BPE/Utils/lag_checker_iterator.py (new file, 27 lines)

```python
from collections import deque
from typing import Generator, TypeVar

T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")


def iterator_with_checks(
    generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:

    # Here we can ignore catching StopIteration:
    # we will propagate it
    last_element = next(generator)

    while True:

        RETURN_ELEMENT = last_element
        try:
            element = next(generator)
            last_element = element
            yield (RETURN_ELEMENT, False)

        except StopIteration:
            yield (RETURN_ELEMENT, True)
            break
```
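`iterator_with_checks` wraps a generator and flags its final element, which is how the trainers know when a batch is the last one. A minimal sketch:

```python
from Project_Model.Libs.BPE.Utils import iterator_with_checks

# The wrapper looks one element ahead, so the True flag arrives with the last item.
for value, is_last in iterator_with_checks(x for x in [1, 2, 3]):
    print(value, is_last)
# 1 False
# 2 False
# 3 True
```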
Project_Model/Libs/BPE/Utils/special_regex_maker.py (new file, 15 lines)

```python
import re


def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
    """Compile a regex for the special tokens
    Args:
        special_tokens (list[str]): the list of special tokens

    Returns:
        re.Pattern:
    """

    REGEX_STR = "|".join(special_tokens)

    return re.compile(REGEX_STR)
```
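A quick sketch of the regex builder. Note that the tokens are joined with `|` verbatim rather than passed through `re.escape`, which is fine for markers like `<SOT>` that contain no regex metacharacters:

```python
from Project_Model.Libs.BPE.Utils import special_regex_maker

REGEX = special_regex_maker(["<SOT>", "<EOT>"])

print(REGEX.pattern)                                           # <SOT>|<EOT>
print([m.group() for m in REGEX.finditer("<SOT>a b<EOT>")])    # ['<SOT>', '<EOT>']
```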
Project_Model/Libs/BPE/Utils/vocabulary.py (new file, 49 lines)

```python
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException


def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:

    JSON: dict[str, int] = {}

    for key, item in vocabulary.items():
        TUPLE_STR = f"{key}"
        JSON[TUPLE_STR] = item

    return json.dumps(JSON)


def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:

    JSON: dict[str, int] = json.loads(json_string)
    VOCABULARY: dict[tuple[int, int], int] = {}

    for key, item in JSON.items():
        REDUCED_KEY = len(key) - 1
        KEY_STR = key[1:REDUCED_KEY]
        VOC_KEY = tuple(map(int, KEY_STR.split(",")))

        if len(VOC_KEY) != 2:
            raise OutOfDictionaryException()

        # Checked for weird things above
        VOCABULARY[VOC_KEY] = item  # type: ignore

    return VOCABULARY


def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):

    json_string = nanos_vocabulary2json_str(vocabulary)
    FILE = open(path, "w")
    FILE.write(json_string)
    FILE.close()


def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
    FILE = open(path, "r")
    json_string = FILE.read()
    FILE.close()

    return nanos_json_str2vocabulary(json_string)
```
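The vocabulary serializer stringifies each `(left, right)` tuple key, and the loader parses it back by stripping the parentheses and splitting on the comma. A small round-trip sketch using a temporary file (the path is illustrative):

```python
from pathlib import Path
import tempfile

from Project_Model.Libs.BPE.Utils import save_nanos_vocabulary, load_nanos_vocabulary

VOCABULARY = {(97, 98): 256, (256, 256): 257}

# Hypothetical scratch location; any writable path works.
cache_path = Path(tempfile.gettempdir()) / "nanos-voc-demo.json"

save_nanos_vocabulary(VOCABULARY, cache_path)
print(load_nanos_vocabulary(cache_path))  # {(97, 98): 256, (256, 256): 257}
```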
Project_Model/Libs/BPE/__init__.py (new file, 9 lines)

```python
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *

from . import Classes
from . import Enums
from . import Errors
from . import Utils
```
Project_Model/Libs/__init__.py (new file, 1 line)

```python
from . import BPE
```
74
Project_Model/Tests/bpe_test.py
Normal file
74
Project_Model/Tests/bpe_test.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
from Project_Model.Libs.BPE.Enums import TokenType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class TestBPE:
|
||||||
|
|
||||||
|
def test_bpe_encoding_simple(self):
|
||||||
|
|
||||||
|
TEXT = "abababab"
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
EXPECTED = [258]
|
||||||
|
|
||||||
|
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
|
||||||
|
|
||||||
|
ENCODED = BPE_ENCODER.encode(TEXT)
|
||||||
|
|
||||||
|
assert len(ENCODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(ENCODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
def test_bpe_decoding_simple(self):
|
||||||
|
|
||||||
|
|
||||||
|
INPUT = [258]
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
EXPECTED = "abababab"
|
||||||
|
|
||||||
|
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
|
||||||
|
|
||||||
|
DECODED = BPE_ENCODER.decode(INPUT)
|
||||||
|
|
||||||
|
assert len(DECODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(DECODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
def test_bpe_decoding_edge_1(self):
|
||||||
|
|
||||||
|
|
||||||
|
INPUT = [258, ord("c")]
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
EXPECTED = "ababababc"
|
||||||
|
|
||||||
|
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
|
||||||
|
|
||||||
|
DECODED = BPE_ENCODER.decode(INPUT)
|
||||||
|
|
||||||
|
assert len(DECODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(DECODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
# Useful to debug weird cases
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# TestBPE().test_bpe_decoding_simple()
|
||||||
|
TestBPE().test_bpe_encoding_simple()
|
||||||
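The comments in the tests above encode a three-step merge chain: `(a, b) -> 256`, `(256, 256) -> 257`, `(257, 257) -> 258`, so `"abababab"` collapses to the single id 258. Below is a hand-trace of that chain; it is illustrative only and does not re-implement `NanoSocratesBPE`.

```python
# Hand-trace of the merges the test vocabulary implies for "abababab".
text = "abababab"
tokens = [ord(c) for c in text]          # [97, 98, 97, 98, 97, 98, 97, 98]
merges = {(97, 98): 256, (256, 256): 257, (257, 257): 258}

def merge_once(seq, table):
    # Greedy left-to-right pass: replace each known adjacent pair with its id.
    out, i = [], 0
    while i < len(seq):
        pair = tuple(seq[i:i + 2])
        if len(pair) == 2 and pair in table:
            out.append(table[pair])
            i += 2
        else:
            out.append(seq[i])
            i += 1
    return out

while True:
    merged = merge_once(tokens, merges)
    if merged == tokens:
        break
    tokens = merged

print(tokens)  # [258]
```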
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from Project_Model.Libs.BPE.Enums import TokenType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")
|
||||||
|
|
||||||
|
class TestTrainBPE:
|
||||||
|
|
||||||
|
def test_bpe_train_encoding_simple(self):
|
||||||
|
|
||||||
|
TRAINER = BPE.NanoSocraTrainerPool(
|
||||||
|
int(32E3),
|
||||||
|
["<SOT>", "<EOT>"]
|
||||||
|
)
|
||||||
|
|
||||||
|
TEXT = "abababab"
|
||||||
|
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt")
|
||||||
|
|
||||||
|
EXPECTED = [258]
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
BPE_ENCODER = TRAINER.trainBPE(
|
||||||
|
TEXT_PATH,
|
||||||
|
CACHE_DIR_PATH
|
||||||
|
)
|
||||||
|
|
||||||
|
ENCODED = BPE_ENCODER.encode(TEXT)
|
||||||
|
|
||||||
|
assert len(ENCODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(ENCODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_bpe_train_encoding_and_decoding(self):
|
||||||
|
|
||||||
|
SPECIAL_LIST = ["<ABS>", "<SOTL>"]
|
||||||
|
TRAINER = BPE.NanoSocraTrainerPool(
|
||||||
|
int(32E3),
|
||||||
|
SPECIAL_LIST
|
||||||
|
)
|
||||||
|
|
||||||
|
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
|
||||||
|
FILE = open(TEXT_PATH)
|
||||||
|
TEXT = FILE.read()
|
||||||
|
FILE.close()
|
||||||
|
|
||||||
|
EXPECTED = TEXT
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
BPE_ENCODER = TRAINER.trainBPE(
|
||||||
|
TEXT_PATH,
|
||||||
|
CACHE_DIR_PATH
|
||||||
|
)
|
||||||
|
VOCABULARY = BPE_ENCODER.vocabulary
|
||||||
|
TOKENANO = BPE.TokeNanoCore(VOCABULARY,SPECIAL_LIST)
|
||||||
|
|
||||||
|
ENCODED = TOKENANO.encode(TEXT)
|
||||||
|
DECODED = TOKENANO.decode(ENCODED)
|
||||||
|
|
||||||
|
assert len(DECODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for decoded, expected in zip(DECODED, EXPECTED):
|
||||||
|
assert decoded == expected
|
||||||
|
|
||||||
|
# Useful to debug weird cases
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# TestTrainBPE().test_bpe_train_encoding_simple()
|
||||||
|
TestTrainBPE().test_bpe_train_encoding_and_decoding()
|
||||||
4
Project_Model/Tests/chunker_files/edge-1.txt
Normal file
4
Project_Model/Tests/chunker_files/edge-1.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
<SOT>Lorem <SEP>ipsu<SEP>m d<SEP>olor s<SEP>it ame<SEP>t,
|
||||||
|
<SEP>conse<SEP>cte<SEP>tur adip<SEP>iscin<SEP>g elit.
|
||||||
|
<SEP>Aenean a<SEP>t dui he<SEP>ndrer<SEP>it an<SEP>te soll<SEP>icitud
|
||||||
|
<SEP>in sce<SEP>lerisque<EOT>
|
||||||
2
Project_Model/Tests/chunker_files/simple.txt
Normal file
2
Project_Model/Tests/chunker_files/simple.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
|
||||||
|
<SEP>Aenean at dui <SEP>hendrerit ante <SEP>sollicitudin <SEP>scelerisque<EOT>
|
||||||
3
Project_Model/Tests/chunker_files/stress.txt
Normal file
3
Project_Model/Tests/chunker_files/stress.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
|
||||||
|
<SEP>Aenean at dui <SEP>hendrerit an te <SEP>sollicitudin <SEP>scelerisque
|
||||||
|
<SEP>dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf<EOT>
|
||||||
89
Project_Model/Tests/chunker_test.py
Normal file
89
Project_Model/Tests/chunker_test.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
import pytest
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
|
||||||
|
SYMBOL_REGEX = re.compile(PATTERN)
|
||||||
|
|
||||||
|
class TestChunker:
|
||||||
|
|
||||||
|
def test_correct_simple(self):
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
|
||||||
|
LEAST_EXPECTED_CHUNKS = 3
|
||||||
|
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
CHUNKS = []
|
||||||
|
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(chunk)
|
||||||
|
CHUNKS.append(
|
||||||
|
chunk
|
||||||
|
)
|
||||||
|
|
||||||
|
NANO_TEXT = "".join(CHUNKS)
|
||||||
|
|
||||||
|
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
|
||||||
|
assert NANO_TEXT == ORIG_TEXT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_correct_edge_1(self):
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt")
|
||||||
|
LEAST_EXPECTED_CHUNKS = 3
|
||||||
|
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
CHUNKS = []
|
||||||
|
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(chunk)
|
||||||
|
CHUNKS.append(
|
||||||
|
chunk
|
||||||
|
)
|
||||||
|
|
||||||
|
NANO_TEXT = "".join(CHUNKS)
|
||||||
|
|
||||||
|
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
|
||||||
|
assert NANO_TEXT == ORIG_TEXT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_throwing(self):
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
with pytest.raises(BPE.DelimiterNotFoundException):
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(chunk)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
|
||||||
|
LEAST_EXPECTED_CHUNKS = 3
|
||||||
|
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
CHUNKS = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
|
||||||
|
CHUNKS.append(
|
||||||
|
chunk
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
exit(0)
|
||||||
|
|
||||||
|
NANO_TEXT = "".join(CHUNKS)
|
||||||
|
|
||||||
|
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
|
||||||
|
assert NANO_TEXT == ORIG_TEXT
|
||||||
182
Project_Model/Tests/splitter_test.py
Normal file
182
Project_Model/Tests/splitter_test.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
from Project_Model.Libs.BPE.Enums import TokenType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
|
||||||
|
SYMBOL_REGEX = re.compile(PATTERN)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSplitter:
|
||||||
|
|
||||||
|
def test_split(self):
|
||||||
|
|
||||||
|
TEXT = "<SOT>Lorem <SEP>"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("<SOT>", TokenType.SPECIAL),
|
||||||
|
("Lorem", TokenType.BPE),
|
||||||
|
(" ", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_trailing_text(self):
|
||||||
|
|
||||||
|
TEXT = "ipsu<SEP>m d<SEP>olor"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("ipsu", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("m", TokenType.BPE),
|
||||||
|
(" d", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
# ("olor", TokenType.BPE)
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_multi_token(self):
|
||||||
|
|
||||||
|
TEXT = "ipsu<SEP>m d<SEP><SEP><SEP>dsg<SEP>olor"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("ipsu", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("m", TokenType.BPE),
|
||||||
|
(" d", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("dsg", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_malformed_1(self):
|
||||||
|
|
||||||
|
TEXT = "<SEP>lerisque"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_malformed_2(self):
|
||||||
|
|
||||||
|
TEXT = "lerisque"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = []
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_token_decode_simple(self):
|
||||||
|
# to test the token split into special and bpe
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
|
||||||
|
token_list = [100, 101, 1477]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_tokens(token_list))
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
([100, 101], TokenType.BPE),
|
||||||
|
([1477], TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_token_decode_simple_malformed(self):
|
||||||
|
# to test the token split into special and bpe
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
|
||||||
|
token_list = [100, 101, 1477, 100]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_tokens(token_list))
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
([100, 101], TokenType.BPE),
|
||||||
|
([1477], TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
|
||||||
|
# Useful to debug weird cases
|
||||||
|
if __name__ == "__main__":
|
||||||
|
TestSplitter().test_split_trailing_text()
|
||||||
21
Project_Model/Tests/tokenano_test.py
Normal file
21
Project_Model/Tests/tokenano_test.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
|
||||||
|
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
|
||||||
|
|
||||||
|
class TestTokeNano:
|
||||||
|
|
||||||
|
def test_decode_encode_simple(self):
|
||||||
|
TEXT = "<SOT>abababab<EOT>"
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
# EXPECTED = [258]
|
||||||
|
|
||||||
|
TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])
|
||||||
|
|
||||||
|
ENCODED = TOKE_NANO.encode(TEXT)
|
||||||
|
DECODED = TOKE_NANO.decode(ENCODED)
|
||||||
|
|
||||||
|
assert TEXT == DECODED
|
||||||
0
Project_Model/Tests/trainer_files/cache/.gitkeep
vendored
Normal file
0
Project_Model/Tests/trainer_files/cache/.gitkeep
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>
|
||||||
1
Project_Model/Tests/trainer_files/train_simple.txt
Normal file
1
Project_Model/Tests/trainer_files/train_simple.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<SOT>abababab<EOT>
|
||||||
695
Project_Model/UML/bpe.excalidraw.json
Normal file
695
Project_Model/UML/bpe.excalidraw.json
Normal file
@@ -0,0 +1,695 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "EcT-dGsjmfW571ov8Gg4F",
|
||||||
|
"type": "text",
|
||||||
|
"x": 425.5,
|
||||||
|
"y": 132,
|
||||||
|
"width": 506,
|
||||||
|
"height": 425,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"4rCC2-N1thmII8_dwNhe1"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3V",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 523521109,
|
||||||
|
"version": 883,
|
||||||
|
"versionNonce": 1590682729,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "OA_NKjb3n3NLtUo_tKmPS",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758881654155,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "74i4oK-JpcM4CgAqhz_x_",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 382.5,
|
||||||
|
"y": 104.5,
|
||||||
|
"width": 592.5,
|
||||||
|
"height": 421,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"4rCC2-N1thmII8_dwNhe1"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a4",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 50827893,
|
||||||
|
"version": 319,
|
||||||
|
"versionNonce": 704459557,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758878226277,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "s8I1JoKulE3Vnti9a374p",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1113.5,
|
||||||
|
"y": 127,
|
||||||
|
"width": 517,
|
||||||
|
"height": 325,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"M6w9efVFwOZHkJGgwkyEw"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a5",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2091174261,
|
||||||
|
"version": 480,
|
||||||
|
"versionNonce": 1964948039,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758881941367,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "BY_Why7XDNftdMzPcwjVZ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 1086.5,
|
||||||
|
"y": 105.5,
|
||||||
|
"width": 593.0000000000001,
|
||||||
|
"height": 325.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"M6w9efVFwOZHkJGgwkyEw"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a6",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 153939611,
|
||||||
|
"version": 234,
|
||||||
|
"versionNonce": 2068149129,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "WcDks9DR8UqeZEaxAcRf9",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758881945661,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "JCPDhuTKRx4MN950Q3jL-",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1116.411067193676,
|
||||||
|
"y": 477.3809288774704,
|
||||||
|
"width": 416.74578857421875,
|
||||||
|
"height": 99.70355731225297,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"DbtlKVF_9SjH2-9iMq9zy"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a7",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1326854235,
|
||||||
|
"version": 479,
|
||||||
|
"versionNonce": 595084597,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758902358518,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
|
||||||
|
"fontSize": 19.940711462450594,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "l-O0rMS3SruV22_MPX9Jz",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 1086.5,
|
||||||
|
"y": 451.4580039762846,
|
||||||
|
"width": 593,
|
||||||
|
"height": 208.0419960474308,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"DbtlKVF_9SjH2-9iMq9zy"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a8",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1490898171,
|
||||||
|
"version": 305,
|
||||||
|
"versionNonce": 587306139,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "OA_NKjb3n3NLtUo_tKmPS",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758902358518,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "WcDks9DR8UqeZEaxAcRf9",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 773.5,
|
||||||
|
"y": 167,
|
||||||
|
"width": 297.17936724485867,
|
||||||
|
"height": 30,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"seed": 1681364149,
|
||||||
|
"version": 303,
|
||||||
|
"versionNonce": 1262492265,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758881945661,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
144.5,
|
||||||
|
-1.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
177.5,
|
||||||
|
-30
|
||||||
|
],
|
||||||
|
[
|
||||||
|
297.17936724485867,
|
||||||
|
-29.020420978562214
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "BY_Why7XDNftdMzPcwjVZ",
|
||||||
|
"focus": 0.77319587628866,
|
||||||
|
"gap": 18.25
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "OA_NKjb3n3NLtUo_tKmPS",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 946.0000000000002,
|
||||||
|
"y": 274.95951048200493,
|
||||||
|
"width": 130.016707976343,
|
||||||
|
"height": 209.36808480159067,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aD",
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"seed": 1871768059,
|
||||||
|
"version": 1039,
|
||||||
|
"versionNonce": 213535035,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758902358519,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
54.99999999999977,
|
||||||
|
12.54048951799507
|
||||||
|
],
|
||||||
|
[
|
||||||
|
69.49999999999977,
|
||||||
|
188.54048951799507
|
||||||
|
],
|
||||||
|
[
|
||||||
|
130.016707976343,
|
||||||
|
209.36808480159067
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "EcT-dGsjmfW571ov8Gg4F",
|
||||||
|
"focus": -0.48312180762055096,
|
||||||
|
"gap": 14.500000000000114
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "l-O0rMS3SruV22_MPX9Jz",
|
||||||
|
"focus": -0.16742658425737647,
|
||||||
|
"gap": 11.194126334166185
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "snZ__VDsIlri6NTp8M2Gf",
|
||||||
|
"type": "text",
|
||||||
|
"x": -245.25,
|
||||||
|
"y": 103,
|
||||||
|
"width": 330,
|
||||||
|
"height": 125,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aE",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1758461093,
|
||||||
|
"version": 265,
|
||||||
|
"versionNonce": 1069481861,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758879566916,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "PnbmqwEWYkP8oXElKFyTp",
|
||||||
|
"type": "text",
|
||||||
|
"x": -237.75,
|
||||||
|
"y": 544,
|
||||||
|
"width": 561,
|
||||||
|
"height": 125,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 501304683,
|
||||||
|
"version": 241,
|
||||||
|
"versionNonce": 1306401003,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758878748210,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "xR_11IzgXX5O-m6WoRfCL",
|
||||||
|
"type": "text",
|
||||||
|
"x": -233.25,
|
||||||
|
"y": 366.5,
|
||||||
|
"width": 165,
|
||||||
|
"height": 75,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aI",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2025585125,
|
||||||
|
"version": 395,
|
||||||
|
"versionNonce": 1799178985,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758883940168,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "enum TokenType:\n + SPECIAL\n + BPE",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "enum TokenType:\n + SPECIAL\n + BPE",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "lgKSd9qCb94-5e8rd9I3r",
|
||||||
|
"type": "text",
|
||||||
|
"x": -219.75,
|
||||||
|
"y": 764.5,
|
||||||
|
"width": 462,
|
||||||
|
"height": 275,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aJ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1963214021,
|
||||||
|
"version": 464,
|
||||||
|
"versionNonce": 1104453739,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759053302739,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "DwFJoUpVT2YAEe9qPYAXa",
|
||||||
|
"type": "text",
|
||||||
|
"x": 496.75,
|
||||||
|
"y": 666,
|
||||||
|
"width": 440,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1317596203,
|
||||||
|
"version": 152,
|
||||||
|
"versionNonce": 1840679687,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758880107704,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "78gC46xatoO1_cRtaN8EC",
|
||||||
|
"type": "text",
|
||||||
|
"x": 396.375,
|
||||||
|
"y": -107.75,
|
||||||
|
"width": 396,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1187595241,
|
||||||
|
"version": 130,
|
||||||
|
"versionNonce": 1273030504,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759070012771,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "3j50Ds74uU7oXoJ9kMOYJ",
|
||||||
|
"type": "text",
|
||||||
|
"x": 457.375,
|
||||||
|
"y": 903.75,
|
||||||
|
"width": 949.7594604492188,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aN",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1994335529,
|
||||||
|
"version": 198,
|
||||||
|
"versionNonce": 1492696519,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758882694747,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "yg-TvQvz4MwJZ0y8K7Ix0",
|
||||||
|
"type": "text",
|
||||||
|
"x": 435.375,
|
||||||
|
"y": 1026.25,
|
||||||
|
"width": 352,
|
||||||
|
"height": 250,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aP",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1877486407,
|
||||||
|
"version": 344,
|
||||||
|
"versionNonce": 25830153,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758883468886,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "2UXjWdE_jMcsCE2oQgTXn",
|
||||||
|
"type": "text",
|
||||||
|
"x": -334.75,
|
||||||
|
"y": 1112.5,
|
||||||
|
"width": 165,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aQ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 700532363,
|
||||||
|
"version": 76,
|
||||||
|
"versionNonce": 1671597672,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759070020002,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class TokeNano:",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class TokeNano:",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
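The UML above describes an `Encoder` abstract base class that both `NanoSocratesBPE` and `NanoSocratesSpecial` implement. A minimal sketch of that interface, following the diagram rather than the actual library source:

```python
from abc import ABC, abstractmethod

# Interface as drawn in the UML; signatures are taken from the diagram,
# not from the project's real code.
class Encoder(ABC):

    @abstractmethod
    def encode(self, corpus: str) -> list[int]:
        """Turn text into a list of token ids."""

    @abstractmethod
    def decode(self, encoded: list[int]) -> str:
        """Turn a list of token ids back into text."""
```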
21
README.md
21
README.md
@@ -12,11 +12,30 @@ Create and activate you Conda enviroment with:
|
|||||||
|
|
||||||
conda env create -f environment.yaml
|
conda env create -f environment.yaml
|
||||||
conda activate deep_learning
|
conda activate deep_learning
|
||||||
|
|
||||||
Now install dependencies on pip:
|
Now install dependencies on pip:
|
||||||
|
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
Add the following to .vscode/settings.json
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
// For linux
|
||||||
|
"terminal.integrated.env.linux": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
// For OSX
|
||||||
|
"terminal.integrated.env.osx": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
// For Windows
|
||||||
|
"terminal.integrated.env.windows": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
Sometimes, when uploading a really large batch of data, git can abort the upload due to a timeout.
|
Sometimes, when uploading a really large batch of data, git can abort the upload due to a timeout.
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
|
|
||||||
class Debug_csv():
|
|
||||||
def __init__(self, output_path:str):
|
|
||||||
|
|
||||||
|
|
||||||
self.output = open(output_path, "w")
|
|
||||||
# then the first row as header
|
|
||||||
header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
|
||||||
self.output.write(",".join(header) + "\n")
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.output.close()
|
|
||||||
|
|
||||||
def write(self, RDF: pd.DataFrame):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
|
||||||
"""
|
|
||||||
|
|
||||||
RDF.to_csv(self.output, index=False, header=False)
|
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
# do not worry about circular dependencies, this class will never call something else
|
# do not worry about circular dependencies, this class will never call something else
|
||||||
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
from Scripts.DataCleaning.filter import PipelineApplier
|
||||||
|
|
||||||
class RDF_mask_task_dataset():
|
class RDF_mask_task_dataset():
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ class PipelineApplier():
|
|||||||
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
|
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
|
||||||
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
|
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
|
||||||
|
|
||||||
# def filter_movie_by_rel_uri_frequence()
|
|
||||||
|
|
||||||
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
|
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
|
||||||
"""
|
"""
|
||||||
@@ -74,6 +73,10 @@ class PipelineApplier():
|
|||||||
return RDF
|
return RDF
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
|
||||||
|
end = min(len(self.MOVIE_FILTER), ending_offset)
|
||||||
|
self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()
|
||||||
|
|
||||||
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
||||||
# dataset has SubjectURI RelationshipURI ObjectURI
|
# dataset has SubjectURI RelationshipURI ObjectURI
|
||||||
# want to drop the '' in them
|
# want to drop the '' in them
|
||||||
@@ -183,9 +186,3 @@ class PipelineApplier():
|
|||||||
# as input two dataframe, one with 2 column
|
# as input two dataframe, one with 2 column
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
|
|
||||||
.str.replace(r"\r?\n+", ", ", regex=True) # newlines -> ", "
|
|
||||||
.str.replace(r"\*", "", regex=True)) # delete all asterisks
|
|
||||||
|
|
||||||
return RDF
|
|
||||||
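The removed `regex_on_objects` helper above normalises the `ObjectURI` column: runs of newlines become `", "` and asterisks are stripped. A tiny pandas check of those two replacements:

```python
import pandas as pd

# Same two substitutions as regex_on_objects above, on a toy frame.
RDF = pd.DataFrame({"ObjectURI": ["line one\nline two", "*starred*"]})
RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                    .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
                    .str.replace(r"\*", "", regex=True))        # delete all asterisks

print(RDF["ObjectURI"].tolist())  # ['line one, line two', 'starred']
```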
@@ -1,29 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
|
|
||||||
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
|
|
||||||
# 1) Read and shuffle rows with a fixed seed for reproducibility
|
|
||||||
df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)
|
|
||||||
|
|
||||||
# 2) Turn the three inputs into proportions relative to their sum
|
|
||||||
total = train + val + test # eheh you got it there :p
|
|
||||||
n = len(df)
|
|
||||||
n_train = int(n * train / total) # floor to keep indices integral
|
|
||||||
n_val = int(n * val / total)
|
|
||||||
# 3) Give the remainder to test to ensure every row is assigned
|
|
||||||
# (this naturally absorbs any rounding loss)
|
|
||||||
train_df = df.iloc[:n_train].reset_index(drop=True)
|
|
||||||
val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
|
|
||||||
test_df = df.iloc[n_train + n_val:].reset_index(drop=True)
|
|
||||||
|
|
||||||
return train_df, val_df, test_df
|
|
||||||
|
|
||||||
# usage:
|
|
||||||
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
|
|
||||||
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
|
|
||||||
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
|
|
||||||
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"
|
|
||||||
train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)
|
|
||||||
|
|
||||||
train_df.to_csv(TRAIN)
|
|
||||||
val_df.to_csv(EVALUATION)
|
|
||||||
test_df.to_csv(TEST)
|
|
||||||
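The removed `split_csv_by_percent` above floors the train and validation sizes and gives the remainder to test, so every row is assigned. A quick worked check of that arithmetic:

```python
# Worked check of the proportional split logic above (floor + remainder to test).
n, train, val, test = 10, 70, 15, 15
total = train + val + test
n_train = int(n * train / total)   # 7
n_val = int(n * val / total)       # 1
n_test = n - n_train - n_val       # 2 (absorbs the rounding loss)
print(n_train, n_val, n_test)
```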
@@ -1,381 +0,0 @@
|
|||||||
# This file deletes in the pipeline the unwanted relationship by different rules
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# SQL-FIRST VERSION
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# In the original (pandas) version this module:
|
|
||||||
# - stored frequency filters in DataFrames,
|
|
||||||
# - filtered/cleaned DataFrames in-memory,
|
|
||||||
# - added special tokens via string ops,
|
|
||||||
# - rebuilt one row per movie using groupby/aggregation.
|
|
||||||
#
|
|
||||||
# In this rewrite:
|
|
||||||
# - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
|
|
||||||
# - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
|
|
||||||
# composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
|
|
||||||
# - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
|
|
||||||
#
|
|
||||||
# Notes:
|
|
||||||
# - We keep the same CLASS and METHOD NAMES to preserve call sites.
|
|
||||||
# - Method comments/docstrings from your original file are carried over and updated
|
|
||||||
# to reflect Select-based behavior and return types.
|
|
||||||
# - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
|
|
||||||
# - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
|
|
||||||
# swap with an equivalent string-agg function.
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from sqlalchemy import select, func, literal
|
|
||||||
from sqlalchemy.sql import Select
|
|
||||||
|
|
||||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
|
||||||
|
|
||||||
|
|
||||||
class PipelineApplier():
|
|
||||||
"""
|
|
||||||
SQL-first pipeline applier.
|
|
||||||
|
|
||||||
In the pandas version, frequency filters were stored as DataFrames (self.MOVIE_FILTER / self.REL_FILTER)
|
|
||||||
and every method worked with/returned pandas.DataFrame. In this SQLAlchemy rewrite:
|
|
||||||
|
|
||||||
- self.MOVIE_FILTER and self.REL_FILTER become *subselects* (Select objects) that yield a single
|
|
||||||
column each (MovieID or RelationshipURI). These subselects can be applied via `WHERE IN (subquery)`.
|
|
||||||
|
|
||||||
- Every method that previously returned a DataFrame now returns a *Select* that represents the same
|
|
||||||
logical transformation, but pushed into the database engine.
|
|
||||||
|
|
||||||
- Comments and docstrings are updated to reflect SQL semantics while preserving your original intent.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
# In the pandas version these were DataFrames storing allowed keys.
|
|
||||||
# Here they are Select objects (single-column subselects) or None.
|
|
||||||
# Expected column names:
|
|
||||||
# - self.MOVIE_FILTER: "MovieID"
|
|
||||||
# - self.REL_FILTER: "RelationshipURI"
|
|
||||||
self.MOVIE_FILTER: Optional[Select] = None
|
|
||||||
self.REL_FILTER: Optional[Select] = None
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Relationship deletion
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
|
|
||||||
"""
|
|
||||||
Return a Select where rows having the given relationship URI are removed.
|
|
||||||
|
|
||||||
Original signature (pandas):
|
|
||||||
def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame
|
|
||||||
|
|
||||||
Updated behavior:
|
|
||||||
- RDF is a Select with columns: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
|
|
||||||
- We apply a WHERE clause: RelationshipURI != <uri>
|
|
||||||
- Returns a Select you can continue composing.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
RDF (Select): a selectable representing the RDF joined view
|
|
||||||
uri (str): RelationshipURI to exclude
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Select: filtered selectable (no execution yet)
|
|
||||||
"""
|
|
||||||
sc = RDF.selected_columns
|
|
||||||
return RDF.where(sc.RelationshipURI != literal(uri))
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Frequency filter: MOVIE
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
|
|
||||||
"""
|
|
||||||
You MUST call this before filtering by movie frequency [filter_by_frequency_movie_id()],
|
|
||||||
since this method creates such filter.
|
|
||||||
|
|
||||||
Original behavior:
|
|
||||||
- Input MOVIE_COUNT as DataFrame ["MovieID","Count"]
|
|
||||||
- Keep rows where Count in [min_treshold, max_treshold)
|
|
||||||
- Store the filtered keys in self.MOVIE_FILTER
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- MOVIE_COUNT is a Select that yields ["MovieID","Count"].
|
|
||||||
- We build and store a *subselect* of allowed MovieID (single column) to be used by WHERE IN.
|
|
||||||
- No query is executed here; we only create a new Select.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
MOVIE_COUNT (Select): yields columns MovieID, Count
|
|
||||||
min_treshold (int):
|
|
||||||
max_treshold (int):
|
|
||||||
"""
|
|
||||||
sc = MOVIE_COUNT.selected_columns
|
|
||||||
filtered = MOVIE_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
|
|
||||||
# Keep only the key column so it can be used in an IN (subquery)
|
|
||||||
self.MOVIE_FILTER = select(filtered.selected_columns.MovieID)
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Frequency filter: RELATIONSHIP
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
|
|
||||||
"""
|
|
||||||
Original behavior:
|
|
||||||
- Input REL_COUNT as DataFrame ["RelationshipURI","Count"]
|
|
||||||
- Keep rows where Count in [min_treshold, max_treshold)
|
|
||||||
- Store the filtered keys in self.REL_FILTER
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- REL_COUNT is a Select that yields ["RelationshipURI","Count"].
|
|
||||||
- We build and store a *subselect* of allowed RelationshipURI (single column) to be used by WHERE IN.
|
|
||||||
- No query is executed here; we only create a new Select.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
REL_COUNT (Select): yields columns RelationshipURI, Count
|
|
||||||
min_treshold (int):
|
|
||||||
max_treshold (int):
|
|
||||||
"""
|
|
||||||
sc = REL_COUNT.selected_columns
|
|
||||||
filtered = REL_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
|
|
||||||
self.REL_FILTER = select(filtered.selected_columns.RelationshipURI)
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Apply frequency filters
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
|
|
||||||
"""
|
|
||||||
Original behavior (pandas):
|
|
||||||
RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- If self.MOVIE_FILTER is present, apply: WHERE MovieID IN ( <subselect> )
|
|
||||||
- Otherwise, return RDF unchanged.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
RDF (Select): current dataset
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Select: filtered dataset (or unchanged if no filter exists)
|
|
||||||
"""
|
|
||||||
if self.MOVIE_FILTER is None:
|
|
||||||
return RDF
|
|
||||||
sc = RDF.selected_columns
|
|
||||||
return RDF.where(sc.MovieID.in_(self.MOVIE_FILTER))
|
|
||||||
|
|
||||||
def filter_by_frequency_relationship(self, RDF: Select) -> Select:
|
|
||||||
"""
|
|
||||||
Original behavior (pandas):
|
|
||||||
RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- If self.REL_FILTER is present, apply: WHERE RelationshipURI IN ( <subselect> )
|
|
||||||
- Otherwise, return RDF unchanged.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
RDF (Select): current dataset
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Select: filtered dataset (or unchanged if no filter exists)
|
|
||||||
"""
|
|
||||||
if self.REL_FILTER is None:
|
|
||||||
return RDF
|
|
||||||
sc = RDF.selected_columns
|
|
||||||
return RDF.where(sc.RelationshipURI.in_(self.REL_FILTER))
|
|
||||||
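A hedged sketch of how the Select-returning methods above compose: each call just wraps another WHERE clause around the "dataview", and SQL only runs at `execute`. It assumes the `PipelineApplier` defined above is importable; the table and column names below are illustrative, not the project's actual schema.

```python
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

metadata = MetaData()
rdf_table = Table(
    "rdf", metadata,
    Column("MovieID", Integer),
    Column("SubjectURI", String),
    Column("RelationshipURI", String),
    Column("ObjectURI", String),
    Column("Abstract", String),
)

applier = PipelineApplier()
view = select(rdf_table)                                   # base "dataview", nothing executed
view = applier.delete_relationship_by_str(view, "http://example.org/ignored")
view = applier.filter_by_frequency_relationship(view)      # no-op until a REL filter is generated

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)
with engine.connect() as conn:
    rows = conn.execute(view).all()                        # the composed SQL runs only here
```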
|
|
||||||
    # -------------------------------------------------------------------------
    # Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
    # -------------------------------------------------------------------------
    def rdf_add_special_token(self, RDF: Select) -> Select:
        """
        Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI,
        OBJ to ObjectURI, REL to RelationshipURI. Check
        Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.

        It only adds the special token of the three elements of the RDF; no other special token.

        Original behavior (pandas):
            - String concatenation with columns in a DataFrame.
            - Returned a new DataFrame.

        Updated behavior (SQL):
            - Build projected columns using SQL string concatenation.
            - Return a new Select with the same output column names:
              ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"].

        Args:
            RDF (Select): current dataset

        Returns:
            Select: projected dataset with tokenized SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        subj_tok = literal(SpecialToken.SUBJECT.value) + sc.SubjectURI
        rel_tok = literal(SpecialToken.RELATIONSHIP.value) + sc.RelationshipURI
        obj_tok = literal(SpecialToken.OBJECT.value) + sc.ObjectURI

        return RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            subj_tok.label("SubjectURI"),
            rel_tok.label("RelationshipURI"),
            obj_tok.label("ObjectURI"),
            sc.Abstract.label("Abstract"),
        )
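The literal(token) + column expressions above render as SQL string concatenation; a small sketch of how one such projected column compiles on SQLite (column name illustrative):

from sqlalchemy import String, column, literal, select
from sqlalchemy.dialects import sqlite

subject = column("SubjectURI", String)
tokenized = (literal("<SUBJ>") + subject).label("SubjectURI")

stmt = select(tokenized)
# Compiles to roughly: SELECT '<SUBJ>' || "SubjectURI" AS "SubjectURI"
print(stmt.compile(dialect=sqlite.dialect(), compile_kwargs={"literal_binds": True}))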
    # -------------------------------------------------------------------------
    # NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
    # -------------------------------------------------------------------------
    def drop_na_from_dataset(self, RDF: Select) -> Select:
        """
        Dataset has SubjectURI, RelationshipURI, ObjectURI. We want to drop rows
        where any of these is empty or NULL.

        Original behavior (pandas):
            - Replace '' with NaN and dropna on the three columns.

        Updated behavior (SQL):
            - Apply WHERE clauses checking for NOT NULL and not empty string.

        Args:
            RDF (Select): current dataset

        Returns:
            Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        return RDF.where(
            (sc.SubjectURI.is_not(None)) & (sc.SubjectURI != "") &
            (sc.RelationshipURI.is_not(None)) & (sc.RelationshipURI != "") &
            (sc.ObjectURI.is_not(None)) & (sc.ObjectURI != "")
        )
    # -------------------------------------------------------------------------
    # Rebuild by movie (one row per movie)
    # -------------------------------------------------------------------------
    def rebuild_by_movie(self, RDF: Select) -> Select:
        """
        This method assumes the data is processed movie by movie, because by design
        we want one row per movie at the end.

        Original behavior (pandas):
            - Build per-row "Triple" as SubjectURI + RelationshipURI + ObjectURI,
              wrapped with START_TRIPLE/END_TRIPLE.
            - Group by ["MovieID", "Abstract"] and join ("".join) all Triple strings into one.
            - Prefix the whole list with START_TRIPLE_LIST and Abstract with ABSTRACT.
            - Return DataFrame [["MovieID","Triple","Abstract"]].

        Updated behavior (SQL):
            - Build per-row Triple using SQL string concatenation and constants.
            - Use GROUP_CONCAT (empty separator) to aggregate per-movie.
            - Prefix with START_TRIPLE_LIST and ABSTRACT in SQL.
            - Return a Select with columns: ["MovieID","Triple","Abstract"].

        Args:
            RDF (Select): current dataset with columns
                MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract

        Returns:
            Select: aggregated dataset with one row per movie
        """
        sc = RDF.selected_columns

        # Per-row triple with START/END_TRIPLE tokens
        row_triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")

        # Prefixed abstract
        abstract_tok = (literal(SpecialToken.ABSTRACT.value) + sc.Abstract).label("Abstract")

        # Subquery of per-row triples / abstracts
        row_view = RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            row_triple,
            abstract_tok,
        ).subquery()

        # Concatenate all triples for each movie (SQLite syntax; adjust for other DBs)
        triple_concat = (
            literal(SpecialToken.START_TRIPLE_LIST.value) +
            func.group_concat(row_view.c.Triple, literal(""))
        ).label("Triple")

        return (
            select(
                row_view.c.MovieID.label("MovieID"),
                triple_concat,
                row_view.c.Abstract.label("Abstract"),
            )
            .group_by(row_view.c.MovieID, row_view.c.Abstract)
        )
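A minimal sketch of the GROUP_CONCAT aggregation this relies on, using an in-memory SQLite table with invented values; note that SQLite does not guarantee concatenation order without extra work:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE rows (MovieID INT, Triple TEXT, Abstract TEXT)")
con.executemany(
    "INSERT INTO rows VALUES (?, ?, ?)",
    [
        (1, "<SOT>s1 p1 o1<EOT>", "<ABS>abstract one"),
        (1, "<SOT>s2 p2 o2<EOT>", "<ABS>abstract one"),
    ],
)

# Empty-string separator mirrors func.group_concat(row_view.c.Triple, literal(""))
row = con.execute(
    "SELECT MovieID, GROUP_CONCAT(Triple, ''), Abstract "
    "FROM rows GROUP BY MovieID, Abstract"
).fetchone()
print(row)  # typically (1, '<SOT>s1 p1 o1<EOT><SOT>s2 p2 o2<EOT>', '<ABS>abstract one')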
    # -------------------------------------------------------------------------
    # Build triple(s) projection
    # -------------------------------------------------------------------------
    @staticmethod
    def build_triple(RDF: Select) -> Select:
        """
        Obtains joined RDF triple in one element, together with START and END special tokens.

        Original behavior (pandas):
            - Returned a Series/DataFrame column "Triple" built from three string columns.

        Updated behavior (SQL):
            - Returns a Select with a single column "Triple" built in SQL.

        Args:
            RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: a projection containing one column named "Triple"
        """
        sc = RDF.selected_columns
        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)
    @staticmethod
    def build_incomplete_triple(RDF: Select) -> Select:
        """
        Method helper used for the third task: "Predicting a masked component within an RDF triple".
        Obtains joined RDF triple in one element, together with START and END special tokens.
        The MISSING element will be replaced by the special token <MASK>.

        Original behavior (pandas):
            - Created a Series "Triple" using fallback values for missing columns.

        Updated behavior (SQL):
            - Uses COALESCE to replace NULLs with <MASK> directly in SQL.
            - Returns a Select with a single column "Triple".

        Args:
            RDF (Select): 2 of the following columns present ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: projection with column "Triple"
        """
        sc = RDF.selected_columns
        mask = literal(SpecialToken.MASK.value)

        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (func.coalesce(sc.SubjectURI, mask) +
             func.coalesce(sc.RelationshipURI, mask) +
             func.coalesce(sc.ObjectURI, mask)) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)
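The COALESCE-based masking can be exercised on its own; a sketch that fakes the "2 of 3 columns present" case by selecting a NULL object (column names illustrative):

from sqlalchemy import String, cast, column, func, literal, null, select
from sqlalchemy.dialects import sqlite

incomplete = select(
    column("SubjectURI", String).label("SubjectURI"),
    column("RelationshipURI", String).label("RelationshipURI"),
    cast(null(), String).label("ObjectURI"),   # the missing component
)

sc = incomplete.selected_columns
masked = (
    func.coalesce(sc.SubjectURI, literal("<MASK>")) +
    func.coalesce(sc.RelationshipURI, literal("<MASK>")) +
    func.coalesce(sc.ObjectURI, literal("<MASK>"))
).label("Triple")

# NULL ObjectURI becomes '<MASK>' inside the concatenated triple.
print(incomplete.with_only_columns(masked).compile(dialect=sqlite.dialect()))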
    @staticmethod
    def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
        """
        Currently not used.

        Original intention:
            Given two DataFrames (one incomplete RDF and another with just the missing component),
            apply special tokens accordingly.

        Updated note:
            This stub remains for API parity. If needed in the future, it can be implemented
            as a Select-building helper that merges/COALESCEs columns from different selects.
        """
        return None
@@ -1,148 +0,0 @@
# This file deletes in the pipeline the unwanted relationship by different rules
import pandas as pd
import sqlite3  # kept for compatibility
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier:
    def __init__(self):
        # Fast internal caches for O(1) membership checks
        self._MOVIE_FILTER_SET = set()
        self._REL_FILTER_SET = set()

    # ------------------------------
    # Filters
    # ------------------------------
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        # Vectorized boolean mask
        return RDF.loc[RDF["RelationshipURI"] != uri]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
        since this method creates such filter.
        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
        """
        sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
        self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
        self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin is the fastest path
        return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]

    # ------------------------------
    # Cleaning & preprocessing
    # ------------------------------
    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Adds RDF special token to SubjectURI / RelationshipURI / ObjectURI.
        Returns a new DataFrame (no inplace modification of the caller's object).
        """
        subj = np.char.add(SpecialToken.SUBJECT.value, RDF["SubjectURI"].to_numpy(dtype=object))
        rel = np.char.add(SpecialToken.RELATIONSHIP.value, RDF["RelationshipURI"].to_numpy(dtype=object))
        obj = np.char.add(SpecialToken.OBJECT.value, RDF["ObjectURI"].to_numpy(dtype=object))
        return RDF.assign(SubjectURI=subj, RelationshipURI=rel, ObjectURI=obj)

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Replace '' with NaN only on key columns, then drop rows missing any of them.
        """
        cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
        rdf = RDF.copy()
        for c in cols:
            m = rdf[c] == ""
            if m.any():
                rdf.loc[m, c] = np.nan
        return rdf.dropna(subset=cols)

    # ------------------------------
    # Building triples
    # ------------------------------
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains joined RDF triple in one element, together with START and END special token.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value

        subj = RDF["SubjectURI"].to_numpy(dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object)

        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Accepts any subset of ["SubjectURI","RelationshipURI","ObjectURI"] (typically 2 of 3).
        Missing components are replaced by <MASK>.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value
        maskv = SpecialToken.MASK.value
        n = len(RDF.index)

        subj = RDF["SubjectURI"].to_numpy(dtype=object) if "SubjectURI" in RDF else np.full(n, maskv, dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object) if "RelationshipURI" in RDF else np.full(n, maskv, dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object) if "ObjectURI" in RDF else np.full(n, maskv, dtype=object)

        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    def rebuild_by_movie(self, RDF: pd.DataFrame):
        """
        Collapse triples + abstract into a single row per movie.
        Returns: ["MovieID","Triple","Abstract"]
        """
        # Build triples once (vectorized); method also sets RDF["Triple"]
        triples = self.build_triple(RDF)

        # Minimal frame for grouping (avoid carrying extra columns)
        tmp = pd.DataFrame({
            "MovieID": RDF["MovieID"].to_numpy(),
            "Abstract": RDF["Abstract"].to_numpy(),
            "Triple": triples.to_numpy(),
        })

        # Factorize high-cardinality keys to fast integer codes, group on codes,
        # then map back to labels; sum concatenates strings for object dtype.
        mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
        abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)

        tmp["_mid"] = mid_codes
        tmp["_abs"] = abs_codes

        grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()

        grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
        grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])

        # Final tokens
        grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
        grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"]

        return grouped[["MovieID", "Triple", "Abstract"]]
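As a quick illustration of the factorize-and-sum trick in rebuild_by_movie above (summing an object column concatenates strings), a tiny example with made-up values:

import pandas as pd

tmp = pd.DataFrame({
    "MovieID": [1, 1, 2],
    "Abstract": ["<ABS>a1", "<ABS>a1", "<ABS>a2"],
    "Triple": ["<SOT>x<EOT>", "<SOT>y<EOT>", "<SOT>z<EOT>"],
})

# Integer codes are cheaper to group on than long strings.
mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)
tmp["_mid"], tmp["_abs"] = mid_codes, abs_codes

grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()
grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])

print(grouped[["MovieID", "Triple", "Abstract"]])
# MovieID 1 -> "<SOT>x<EOT><SOT>y<EOT>", MovieID 2 -> "<SOT>z<EOT>"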
@@ -1,23 +1,28 @@
 import re
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
-from Scripts.DataCleaning.legacy.filter import PipelineApplier
+from Scripts.DataCleaning.filter import PipelineApplier
 # tasks dataset builder
 from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
 from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
 from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
 from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
-from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv

 import pandas as pd

 class Pipeline():
-    def __init__(self):
+    def __init__(self,
+                 mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv",
+                 bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt",
+                 text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv",
+                 completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv",
+                 ):
         self.sql_endpoint = SqlEndpoint()
         # classes to manage taskes' datasets
-        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
+        self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
-        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
+        self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
-        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
+        self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
-        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
+        self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path)

         # prepare the filter
         # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset
@@ -25,16 +30,13 @@ class Pipeline():
         MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
         REL_COUNT = self.sql_endpoint.get_relationship_count()
         self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
-        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069
+        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
         # prepare the filter on the relationshipURI you want to delete:
         relationship_uri_banned_list = [
             "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
             "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
             "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
-            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
-            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
-            "dbp-dbo:soundRecording"
-        ]
+            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
         self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

@@ -96,8 +98,6 @@ class Pipeline():
            # other filter
            #
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
-           # regex on ObjectURI
-           RDF = self.filter_applier.regex_on_objects(RDF)
            if RDF.empty:
                continue
            RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
@@ -119,13 +119,9 @@ class Pipeline():
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

-    def generate_csv_debug_file(self, debug_path:str):
-        debug_csv = Debug_csv(debug_path)
-
-        for RDF in self._get_cleaned_movie_rows():
-            debug_csv.write(RDF)
-
-        debug_csv.close()
+    def reduce_movie_list(self, starting_offset:int , ending_offset:int):
+        self.filter_applier.reduce_movie_list(starting_offset,ending_offset)

 # there are a lot of settings to manage
@@ -134,12 +130,11 @@ class Pipeline():
 # in the use_toy_dataset , to change the toy dataset
 # in _get_cleaned_movie_rows: to change how the pipeline behave

-pipeline = Pipeline()
+#pipeline = Pipeline()

-pipeline.use_toy_dataset()
+# pipeline.use_toy_dataset()
 # pipeline.execute_task_bpe_corpus()
 # pipeline.execute_task_rdf_mask()
 # pipeline.execute_tasks_rdf_text()
 # pipeline.execute_task_rdf_completation()
 # pipeline.execute_all_task()
-pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
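With the constructor now parameterized, callers can point the task outputs anywhere; a short usage sketch (the run_01 scratch directory is made up for the example, the execute_* calls mirror the commented-out examples above):

from Scripts.DataCleaning.pipeline import Pipeline

# Write the task datasets to a scratch directory instead of the defaults.
pipeline = Pipeline(
    mask_task_dataset_path="./Assets/Dataset/Tmp/run_01/rdf_mask.csv",
    bpe_corpus_path="./Assets/Dataset/Tmp/run_01/corpus.txt",
    text_to_rdf_task_dataset_path="./Assets/Dataset/Tmp/run_01/rdf_text.csv",
    completation_rdf_task_dataset_path="./Assets/Dataset/Tmp/run_01/rdf_completation.csv",
)

pipeline.use_toy_dataset()      # restrict to the 10 hand-picked movies
pipeline.execute_all_task()     # or execute_task_bpe_corpus(), execute_tasks_rdf_text(), ...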
@@ -1,86 +0,0 @@
# This file deletes in the pipeline the unwanted relationship by different rules
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier():

    def __init__(self):
        pass

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Adds RDF special token to each element of the tuple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
        It only adds the special token of the three elements of the RDF, no other special token.
        Args:
            RDF (pd.DataFrame):
        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # if the filter that ran before sliced the RDF and created a view, the problem is resolved here
        # for more context: SettingWithCopyWarning
        RDF = RDF.copy()
        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF


    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # to execute this method you have to have iterated by movie_id,
        # because by design we want one row for each movie at the end
        # MovieID and abstract can be given as input for a more generic method
        # first let's combine each row, creating column Triple as the join of the RDF
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # combine rows into one
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add special token for: start of triple, end of triple and start of abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + SpecialToken.END_OF_SENTENCE.value
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return RDF[["MovieID","Triple","Abstract"]]


    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains joined RDF triple in one element, together with START and END special token
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.DataFrame: RDF["Triple"] (just this column)
        """
        # let's combine each row, creating column Triple as the join of the RDF
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]


    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks

        return RDF
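A tiny illustration of what regex_on_objects does to object literals (the values are invented for the example):

import pandas as pd

RDF = pd.DataFrame({"ObjectURI": ["line one\nline two", "*starred* value"]})

RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                    .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
                    .str.replace(r"\*", "", regex=True))        # drop asterisks

print(RDF["ObjectURI"].tolist())
# ['line one, line two', 'starred value']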
@@ -1,103 +0,0 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class MovieFilter:

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all movie_id
        movie_query = "SELECT MovieID FROM Movies"
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)


    def frequency_filter(self, min_treshold:int, max_treshold:int):
        movie_list_placeholder = ",".join(["?"] * len(self.MOVIE_FILTER))

        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
        """
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.MOVIE_FILTER["MovieID"].to_list()))


    def get_movie_id(self):
        return self.MOVIE_FILTER


    def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = '{parsed_rel_uri}' THEN 1 ELSE 0 END)
            BETWEEN {min_treshold} AND {max_treshold};
        """

        params = tuple(movie_ids)  # + (parsed_rel_uri, min_treshold, max_treshold)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)


    def filter_by_director(self):
        director_list = ['dbp-dbo:director','dbp-dbp:director']

        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        JOIN ParsedRelationships USING (RelationshipID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
            AND ParsedRelationships.RelationshipURI IN {tuple(director_list)};
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)


    def filter_by_english_movies(self):
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        relationship = ["dbp-dbp:language"]
        objects_list = ["English", "dbp-dbr:English_language"]

        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
            AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
            AND ParsedObjects.ObjectURI in {tuple(objects_list)};
        """

        other_query = f"""
        SELECT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
            AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
        GROUP BY RDFs.MovieID
        HAVING
            SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
            AND
            SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)



# movie_filter = MovieFilter()
# movie_filter.frequency_filter(5,10)
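The ",".join(["?"] * len(ids)) trick above builds a placeholder list so the IDs travel as bound parameters instead of being interpolated into the SQL; a minimal standalone sketch with sqlite3 (the table and values are invented for the example):

import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE RDFs (MovieID INT)")
con.executemany("INSERT INTO RDFs VALUES (?)", [(1,), (1,), (2,), (3,)])

movie_ids = [1, 2]
placeholders = ",".join(["?"] * len(movie_ids))   # "?,?"

query = f"""
SELECT MovieID, COUNT(*) AS Count
FROM RDFs
WHERE MovieID IN ({placeholders})
GROUP BY MovieID;
"""

# Values are passed separately, so the driver handles the escaping.
counts = pd.read_sql_query(query, con, params=tuple(movie_ids))
print(counts)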
@@ -1,155 +0,0 @@
from movie_filter import MovieFilter
from relationship_filter import RelationshipFilter
from rdf_filter import RdfFilter
from cleaner import PipelineApplier

from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv

import pandas as pd

RELATIONSHIP_FILTER_LIST = [
    "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
    "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
    "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
    "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
    "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
    "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
    "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
    "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
    "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
    "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
    "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
    "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
    "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
    "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
    "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
    "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
    "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
    "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
    "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
    "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
    "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
    "dbp-dbp:website"
]

RELATIONSHIP_WHITE_LIST = [
    "dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
]
"""
SELECT DISTINCT field3
FROM debug
"""

class Pipeline():

    def __init__(self) -> None:
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()

        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        self._movie_filter.frequency_filter(50,3000)
        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)

    def other_filter(self):
        self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
        self._movie_filter.filter_by_director()
        self._movie_filter.filter_by_english_movies()
        self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important films have the budget relationship
        self._movie_filter.relation_filter("dbp-dbp:released",1,100) # to cut to 2000 :(

    def _get_cleaned_movie_rows(self):
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()
        # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)

        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
            RDF = self._pipeline.drop_na_from_dataset(RDF)
            RDF = self._pipeline.regex_on_objects(RDF)
            RDF = self._pipeline.rdf_add_special_token(RDF)

            if RDF.empty:
                continue
            yield RDF


    def execute_task_bpe_corpus(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            RDF = RDF[["Triple","Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()


    def execute_tasks_rdf_text(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()


    def execute_task_rdf_completation(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self._pipeline.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        self._end_file_handler()


    def _end_file_handler(self):
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()


    def execute_all_task(self):
        for RDF in self._get_cleaned_movie_rows():
            completation_RDF = RDF.copy()
            completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
            self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])

            RDF = self._pipeline.rebuild_by_movie(RDF)

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])

        self._end_file_handler()


    def use_toy_dataset(self):
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})

    def generate_csv_debug_file(self, debug_path:str):
        debug_csv = Debug_csv(debug_path)

        for RDF in self._get_cleaned_movie_rows():
            debug_csv.write(RDF)

        debug_csv.close()


pipe = Pipeline()
#pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
@@ -1,32 +0,0 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class RdfFilter:

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()


    # def delete_hyperum_when_movie(self):
    #     purl:linguistics/gold/hypernym
    #     is almost always "dbp-dbr:Movie" or "dbp-dbr:Film"
    #     banned triple

    def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
        relationship_placeholder = ",".join(["?"] * len(REL_ID))

        param = tuple(REL_ID["RelationshipID"].to_list())

        QUERY = f"""
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?) AND RelationshipID IN ({relationship_placeholder});
        """

        for movie_id in MOVIE_ID["MovieID"].to_list():
            params = (movie_id,) + param
            yield self.sql_endpoint.get_dataframe_from_query(QUERY, params=params)
@@ -1,54 +0,0 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class RelationshipFilter:

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all relationship_id
        relationship_query = "SELECT RelationshipID FROM Relationships"
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)


    def frequency_filter(self, min_treshold:int, max_treshold:int):
        movie_list_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))

        filter_query = f"""
        SELECT RelationshipID
        FROM RDFs
        WHERE RelationshipID IN ({movie_list_placeholder})
        GROUP BY RelationshipID
        HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
        """
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()))


    def get_relationship_id(self):
        return self.RELATIONSHIP_FILTER

    def get_relationship_id_from_white_list(self, relationship_list: list[str]):
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(relationship_list))
        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
            AND RelationshipURI IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list)
        return self.sql_endpoint.get_dataframe_from_query(filter_query, params)



    def delete_relationship_uri_by_list(self, filter_list: list[str]):
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(filter_list))

        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
            AND RelationshipURI NOT IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(filter_list)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
Scripts/Experiments/change_me/use_bpe_pipeline.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import Project_Model.Libs.BPE as BPE
from pathlib import Path
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

VOCABULARY_path = "Assets/Model/toy_10/toy_dictionary.json"
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))

SPECIAL_TOKEN_LIST = [token.value for token in SpecialToken]

# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>dbp-dbp:title<OBJ>dbp-dbr:The_Dark_Knight<EOT>"
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
# INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan, from a screenplay co-written with his brother Jonathan. Based on the DC Comics superhero Batman, it is the sequel to Batman Begins (2005), and the second installment in The Dark Knight trilogy. The plot follows the vigilante Batman, police lieutenant James Gordon, and district attorney Harvey Dent, who form an alliance to dismantle organized crime in Gotham City. Their efforts are derailed by the Joker, an anarchistic mastermind who seeks to test how far Batman will go to save the city from chaos. The ensemble cast includes Christian Bale, Michael Caine, Heath Ledger, Gary Oldman, Aaron Eckhart, Maggie Gyllenhaal, and Morgan Freeman.Warner Bros. Pictures prioritized a sequel following the successful reinvention of the Batman film series with Batman Begins. Christopher and Batman Begins co-writer David S. Goyer developed the story elements, making Dent the central protagonist caught up in the battle between Batman and the Joker. In writing the screenplay, the Nolans were influenced by 1980s Batman comics and crime drama films, and sought to continue Batman Begins' heightened sense of realism. From April to November 2007, filming took place with a $185 million budget in Chicago and Hong Kong, and on sets in England. The Dark Knight was the first major motion picture to be filmed with high-resolution IMAX cameras. Christopher avoided using computer-generated imagery unless necessary, insisting on practical stunts such as flipping an 18-wheel truck and blowing up a factory.The Dark Knight was marketed with an innovative interactive viral campaign that initially focused on countering criticism of Ledger's casting by those who believed he was a poor choice to portray the Joker. Ledger died from an accidental prescription drug overdose in January 2008, leading to widespread interest from the press and public regarding his performance. When it was released in July, The Dark Knight received acclaim for its mature tone and themes, visual style, and performances—particularly that of Ledger, who received many posthumous awards including Academy, BAFTA, and Golden Globe awards for Best Supporting Actor, making The Dark Knight the first comic-book film to receive major industry awards. It broke several box-office records and became the highest-grossing 2008 film, the fourth-highest-grossing film to that time, and the highest-grossing superhero film of the time.Since its release, The Dark Knight has been assessed as one of the greatest superhero films ever, one of the best movies of the 2000s, and one of the best films ever made. It is considered the \"blueprint\" for many modern superhero films, particularly for its rejection of a typical comic-book movie style in favor of a crime film that features comic-book characters. Many filmmakers sought to repeat its success by emulating its gritty, realistic tone to varying degrees of success. The Dark Knight has been analyzed for its themes of terrorism and the limitations of morality and ethics. The United States Library of Congress selected it for preservation in the National Film Registry in 2020. A sequel, The Dark Knight Rises, concluded The Dark Knight trilogy in 2012.<SOTL>"
INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>"
# INPUT = "<ABS> Nolan,<SOTL>"
# 32: " "
TOKENANO = BPE.Classes.TokeNanoCore(VOCABULARY, SPECIAL_TOKEN_LIST)
print(f"input: {INPUT} \ninput length: {len(INPUT)}")
encoded = TOKENANO.encode(INPUT)
print(f"encode: {encoded} \nencode length: {len(encoded)}")
decoded = TOKENANO.decode(encoded)
print(f"decode: {decoded} \ndecode length: {len(decoded)}")
@@ -1,6 +1,7 @@
 from enum import Enum
+
-class SpecialToken(str, Enum):
+class SpecialToken(Enum):
     # (Enum, str) -> throws an error
     START_TRIPLE_LIST = "<SOTL>"
     START_TRIPLE = "<SOT>"
@@ -9,7 +10,6 @@ class SpecialToken(str, Enum):
     RELATIONSHIP = "<PRED>"
     OBJECT = "<OBJ>"
     ABSTRACT = "<ABS>"
-    END_OF_SENTENCE = "<EOS>"
     CORPUS_END = "<END>"

     ## Tasks' Token
@@ -18,5 +18,4 @@ class SpecialToken(str, Enum):
     CONTINUE_RDF = "<CONTINUERDF>"
     MASK = "<MASK>"

-    #BPE Training:
+    # BPE Training:
@@ -133,11 +133,6 @@ class SqlEndpoint():
         GROUP BY RelationshipURI;
         """
         return pd.read_sql_query(QUERY, self.sql_engine)

-    def get_dataframe_from_query(self, query: str, params=None):
-        if params is None:
-            return pd.read_sql_query(query, self.sql_engine)
-        return pd.read_sql_query(query, self.sql_engine, params=params)
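Since get_dataframe_from_query is removed here, call sites that still need ad-hoc parameterized reads can go through pandas directly; a minimal sketch with a DBAPI connection (the database path is illustrative, the table and the movie IDs come from the examples elsewhere in this changeset):

import sqlite3
import pandas as pd

con = sqlite3.connect("Assets/Dataset/dataset.db")   # path is a placeholder

query = "SELECT MovieID FROM RDFs WHERE MovieID IN (?, ?)"
# "?" placeholders plus a separate params tuple, same style as the removed helper
frame = pd.read_sql_query(query, con, params=(117248, 147074))
print(frame.head())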
Scripts/Training/bpe_trainer.py (new file, 101 lines)
@@ -0,0 +1,101 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

DEFAULT_CHUNK_SIZE = int(18e4)
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        cache_dir: str,
        output_file: str,
        resume_at: int,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        chunk_size: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.cache_dir = cache_dir
        self.output_file = output_file
        self.resume_at = resume_at
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.chunk_size = chunk_size
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.cache_dir,
        parsed_args.output_file,
        parsed_args.resume_at,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.chunk_size,
        parsed_args.debug_after,
    ) # type ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTrainer(
        args.max_vocabulary,
        TOKEN_LIST,
        args.chunk_size,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    CACHE_DIR = Path(args.cache_dir)
    VOCABULARY_PATH = Path(args.output_file)

    print(f"Training BPE")
    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH,
        CACHE_DIR,
        resume_from_iter=args.resume_at
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")
    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
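A quick way to drive this trainer without the command line is to hand an argv-style list straight to get_args; the paths below are placeholders except for the corpus and toy dictionary locations, which appear elsewhere in this changeset:

from Scripts.Training.bpe_trainer import get_args, train

# Equivalent to:
#   python -m Scripts.Training.bpe_trainer -i corpus.txt -c cache/ -o vocab.json --max-voc 32000
ARGS = get_args([
    "--input-file", "Assets/Dataset/Tmp/corpus.txt",            # produced by the cleaning pipeline
    "--cache-dir", "Assets/Dataset/Tmp/cache",                   # placeholder cache directory
    "--output-file", "Assets/Model/toy_10/toy_dictionary.json",
    "--max-vocabulary", "32000",
])
train(ARGS)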
Scripts/Training/bpe_trainer_pool.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        output_file: str,
        cache_file: str,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.output_file = output_file
        self.cache_file = cache_file
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.cache_file,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.debug_after,
    ) # type ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTrainerPool(
        args.max_vocabulary,
        TOKEN_LIST,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    VOCABULARY_PATH = Path(args.output_file)
    CACHE_PATH = Path(args.cache_file)

    start_bpe = BPE.NanoSocratesBPE()
    if CACHE_PATH.is_file():
        voc = BPE.load_nanos_vocabulary(CACHE_PATH)
        start_bpe = BPE.NanoSocratesBPE(voc)

    print(f"Training BPE")
    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH,
        CACHE_PATH,
        start_bpe
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")
    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
Scripts/Training/bpe_trainer_ram.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        output_file: str,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.output_file = output_file
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.debug_after,
    ) # type ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTraineRam(
        args.max_vocabulary,
        TOKEN_LIST,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    VOCABULARY_PATH = Path(args.output_file)

    print(f"Training BPE")
    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")
    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
12
Scripts/Training/dictionary_adjuster.py
Normal file
12
Scripts/Training/dictionary_adjuster.py
Normal file
@@ -0,0 +1,12 @@
# Trim the "mad"-trained dictionary down to a fixed-size vocabulary.
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
from pathlib import Path

DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"


big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
# Keep only the first 31744 entries.
big_dict = dict(list(big_dict.items())[:31744])

save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
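Since Python dicts preserve insertion order, slicing `items()` keeps the first 31744 entries exactly as they were serialized. A minimal, self-contained sketch of the same trimming step on a synthetic dict (illustrative only; the real data comes from the cached vocabulary JSON):

```python
# Synthetic stand-in for the loaded vocabulary; keys and values are made up.
full = {f"token_{i}": i for i in range(40000)}

trimmed = dict(list(full.items())[:31744])

assert len(trimmed) == 31744
assert list(trimmed) == list(full)[:31744]  # insertion order is preserved
```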
48
Scripts/Training/mad_traininng.py
Normal file
48
Scripts/Training/mad_traininng.py
Normal file
@@ -0,0 +1,48 @@
# Each round, generate a corpus larger than the previous one (without re-using the old data),
# then keep training the BPE on the same cached vocabulary.

from Scripts.DataCleaning.pipeline import Pipeline
from Scripts.Training.bpe_trainer_pool import train, get_args
from pathlib import Path
import os, shutil

CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"


def mad_corpus_generator(corpus_size: int, corpus_offset: int):
    print("New Corpus")
    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
    print("Pipeline Created")
    corpus_ending_offset = corpus_size + corpus_offset
    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
    print("Starting building corpus")
    pipe.execute_task_bpe_corpus()
    print("Corpus created")


def mad_bpe_trainer():
    argv = [
        "--input-file", CORPUS_PATH,
        "--output-file", VOCABULARY_PATH,
        "--cache-file", CACHE_PATH,
    ]
    args = get_args(argv)
    train(args)


def mad_hatter():
    # Corpus sizes (number of movies) for each training round.
    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in film_list:

        # mad_corpus_generator(corpus_size, starting_offset)
        # starting_offset = starting_offset + corpus_size

        mad_bpe_trainer()
        # Copy the freshly trained vocabulary into the cache for the next round.
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))


mad_hatter()
22
docs/BPE.md
Normal file
22
docs/BPE.md
Normal file
@@ -0,0 +1,22 @@
# BPE

## Research Material

- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
- [Implementing a byte pair encoding (BPE) tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
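For readers new to the topic, a toy, repository-independent illustration of one BPE training round (count adjacent symbol pairs over a tiny word-frequency table, then merge the most frequent pair); the word table and symbol names are made up and it makes no assumptions about the NanoSocrates classes used elsewhere in this change:

```python
# Toy BPE round: count adjacent pairs, then merge the most frequent one.
from collections import Counter

def most_frequent_pair(words: dict[tuple[str, ...], int]) -> tuple[str, str]:
    # Count every adjacent symbol pair, weighted by word frequency.
    pairs = Counter()
    for symbols, freq in words.items():
        for a, b in zip(symbols, symbols[1:]):
            pairs[(a, b)] += freq
    return max(pairs, key=pairs.get)

def merge_pair(words: dict[tuple[str, ...], int], pair: tuple[str, str]):
    # Rewrite every word, fusing each occurrence of `pair` into one symbol.
    merged = {}
    for symbols, freq in words.items():
        out, i = [], 0
        while i < len(symbols):
            if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
                out.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

# Tiny made-up corpus: word (as a symbol tuple) -> frequency.
words = {("l", "o", "w"): 5, ("l", "o", "v", "e"): 3, ("n", "e", "w"): 2}
pair = most_frequent_pair(words)   # ("l", "o"), seen 8 times
words = merge_pair(words, pair)    # "lo" is now a single vocabulary symbol
print(pair, words)
```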