From c5c0c61f797773a96f1a3fe582e8998c5d5254cd Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 13:26:58 +0200
Subject: [PATCH] Fix bugs and clarify semantics

---
 .../Libs/BPE/Classes/NanoSocratesBPE.py      | 56 ++++-------
 .../Libs/BPE/Classes/NanoSocratesSpecial.py  | 65 ++++++-------
 .../Libs/BPE/Classes/NanoSocratesSplitter.py | 24 ++++-
 .../Libs/BPE/Classes/TokeNanoCore.py         | 97 +++++++------------
 Project_Model/Libs/BPE/Enums/SpecialToken.py | 21 ++++
 5 files changed, 134 insertions(+), 129 deletions(-)
 create mode 100644 Project_Model/Libs/BPE/Enums/SpecialToken.py

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
index baa5efd..d517f04 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -2,20 +2,18 @@ from collections import deque
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException, DuplicateWordException
+
 # ABOUT THE DICTIONARY:
 # the string is converted into utf-char bytes, that is: each char is rappresented with a set of bytes from 1 to 4.
 # each bytes get casted into an integer; such that, if an integer has its value lower then 256,
 # then it is rappresenting an utf-char-byte, otherwise it is a token-ID.
 class NanoSocratesBatchMemoryBPE:
-    """ Memory to batch training. Keeps token couple frequencies, and merge_treshold
-    """
+    """Memory for batch training: keeps token-pair frequencies and merge_treshold."""

     def __init__(
-        self,
-        frequencies: dict[tuple[int, int], int],
-        merge_treshold: int
+        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
     ) -> None:
-
+
         self.frequencies = frequencies
         self.merge_treshold = merge_treshold

@@ -39,7 +37,6 @@ class NanoSocratesBPE(Encoder):
             self.__vocabulary[key] = value
             self.__reverse_vocabulary[value] = key

-
     @property
     def vocabulary_size(self):
         return len(self.__vocabulary) + 256
@@ -62,7 +59,7 @@ class NanoSocratesBPE(Encoder):
         self,
         chunk_data: list[int],
         memory: NanoSocratesBatchMemoryBPE,
-        last_batch: bool
+        last_batch: bool,
     ):

         ENCODED_CHUNK = self.encode_intermediate(chunk_data)
@@ -70,7 +67,7 @@ class NanoSocratesBPE(Encoder):

         # update frequency of each couple of element
         for i in range(0, DATA_LEN_BEFORE_LAST):
-            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1])
+            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])

             frequency = memory.frequencies.get(CANDIDATE_COUPLE)

@@ -82,7 +79,6 @@ class NanoSocratesBPE(Encoder):
             frequency += 1
             memory.frequencies[CANDIDATE_COUPLE] = frequency

-
         if not last_batch:
             return (self, memory, ENCODED_CHUNK)

@@ -100,9 +96,6 @@ class NanoSocratesBPE(Encoder):

         return (self, memory, ENCODED_CHUNK)

-
-
-
     def encode(self, piece: str) -> list[int]:
         """Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate()
         Args:
@@ -114,12 +107,12 @@ class NanoSocratesBPE(Encoder):
         converted_piece = list(piece.encode("utf-8"))
         return self.encode_intermediate(converted_piece)

     def encode_intermediate(self, piece: list[int]) -> list[int]:
-        """ Encode a piece (as list of integer) till its maximum
+        """Encode a piece (as a list of integers) until no more merges apply
         Args:
             piece (list[int]): piece to encode
         Returns:
-            list[int]: piece encoded
-        """
+            list[int]: piece encoded
+        """

         current_piece = piece
         new_piece = self.__round_encode(current_piece)
@@ -130,9 +123,8 @@ class NanoSocratesBPE(Encoder):

         return current_piece

-
     def __round_encode(self, piece: list[int]):
         """ A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n
         1) "ABAB" -> "XX"
         2) "XX" -> "Y"
         Args:
             piece (list[int]): piece to encode

         Returns:
             list[int]: the encoded piece
         """

         if len(piece) == 1:
             return piece

         PIECE_LENGTH = len(piece) - 1
-        NEW_PIECE : list[int]= []
+        NEW_PIECE: list[int] = []

         index = 0
         while index < PIECE_LENGTH:

-            CANDIDATE_WORD = (piece[index], piece[index + 1]) # take a tuple of consecutive element [int]
+            CANDIDATE_WORD = (
+                piece[index],
+                piece[index + 1],
+            )  # take a pair of consecutive elements [int]
             CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)

             # if no token to substitute the tuple, append the first element
             if CANDIDATE_TOKEN is None:
-                NEW_PIECE.append(piece[index]) 
+                NEW_PIECE.append(piece[index])
                 index += 1

                 # if the latter element of the tuple is the last element of the piece, append it
                 if index == PIECE_LENGTH:
-                    NEW_PIECE.append(piece[index]) 
+                    NEW_PIECE.append(piece[index])

                 continue

@@ -169,13 +164,10 @@ class NanoSocratesBPE(Encoder):
             NEW_PIECE.append(CANDIDATE_TOKEN)
             index += 2

-
         return NEW_PIECE

-
     # TODO: Remake decode to take a list of token IDs
     def decode(self, token_ids: list[int]) -> str:
-
         # deque: double ended queue
         token_stack: deque[int] = deque(token_ids)

@@ -185,19 +177,13 @@ class NanoSocratesBPE(Encoder):
             TOKEN_ID = token_stack.popleft()

             if TOKEN_ID < 256:
-                UTF_8_STRING_ARR.append(
-                    TOKEN_ID
-                )
+                UTF_8_STRING_ARR.append(TOKEN_ID)
                 continue

             left_token, right_token = self.__token_decode(TOKEN_ID)

-            token_stack.appendleft(
-                right_token
-            )
-            token_stack.appendleft(
-                left_token
-            )
+            token_stack.appendleft(right_token)
+            token_stack.appendleft(left_token)

         return UTF_8_STRING_ARR.decode("utf-8")

@@ -211,7 +197,7 @@ class NanoSocratesBPE(Encoder):
         return CANDIDATE_DECODED

     def __learn_word(self, words: tuple[int, int]):
-        """ learn a new couple of object in the vocabulary
+        """Learn a new token pair in the vocabulary

         Args:
             words (tuple[int, int]): the Pair of element to substitute with a new tokenID
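Note on the byte/token-ID convention used by NanoSocratesBPE: IDs below 256 are raw UTF-8 bytes, and IDs from 256 upward are merge tokens. The following minimal sketch (standalone, with hypothetical vocabulary entries, not the project's real classes) shows how one left-to-right merge pass works and why repeated rounds are needed; it appends the trailing element explicitly so a merge ending exactly at the last pair cannot drop the final byte:

    # Hypothetical vocabulary: (104, 105) = b"hi" -> 256, (256, 256) -> 257.
    piece = list("hihi".encode("utf-8"))  # [104, 105, 104, 105]
    vocabulary = {(104, 105): 256, (256, 256): 257}

    def round_encode(piece: list[int], vocabulary: dict[tuple[int, int], int]) -> list[int]:
        # One pass: replace each known consecutive pair with its token ID.
        out: list[int] = []
        i = 0
        while i < len(piece) - 1:
            token = vocabulary.get((piece[i], piece[i + 1]))
            if token is None:
                out.append(piece[i])
                i += 1
            else:
                out.append(token)
                i += 2
        if i == len(piece) - 1:  # unconsumed last element
            out.append(piece[i])
        return out

    once = round_encode(piece, vocabulary)   # [256, 256]
    twice = round_encode(once, vocabulary)   # [257]

Decoding reverses this by expanding every ID >= 256 back into its pair until only byte values remain, which is exactly what the deque-based decode above does.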
diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
index 8fe81bb..61d4741 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -1,47 +1,46 @@
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException

+
 class NanoSocratesSpecial(Encoder):

     def __init__(
-        self,
-        vocabulary_index: int ,
-        vocabulary: dict[str, int] | None = None
-    ) -> None:
-
+        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
+    ) -> None:
+
         super().__init__()

-        if vocabulary is None:
-            self.__vocabulary: dict[str, int] = {}
-        else:
-            self.__vocabulary: dict[str, int] = vocabulary
-
+        self.__bpe_offset = bpe_vocabulary_size
+        self.__vocabulary: dict[str, int] = {}
         self.__reverse_vocabulary: dict[int, str] = {}

-        if vocabulary_index is None:
-            self.__vocabulary_index = 0
-        else:
-            self.__vocabulary_index = vocabulary_index
+        if len(special_tokens) == 0:
+            return

-        # self.__build_reverse_vocabulary()
+        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):
+            CANDIDATE_ID = self.__bpe_offset + index + 1
+            self.__vocabulary[TOKEN] = CANDIDATE_ID
+            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

+    @property
+    def __next_id(self):
+        BPE_OFFSET = self.__bpe_offset
+        VOC_LENGTH = len(self.__vocabulary)
+        return BPE_OFFSET + VOC_LENGTH + 1

-    def build_reverse_vocabulary(self):
-        self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()}
+    @property
+    def vocabulary(self) -> dict[str, int]:
+        return self.__vocabulary

-    # @property
-    # def vocabulary_size(self):
-    #     return self.__current_index
+    @property
+    def reverse_vocabulary(self) -> dict[int, str]:
+        return self.__reverse_vocabulary

-    def set_vocabulary_index(self, vocabulary_index: int):
-        self.__vocabulary_index = vocabulary_index
-
-    def add_special_word_to_vocabulary(self, word:str):
-        self.__vocabulary_index = self.__vocabulary_index + 1
-        CURRENT_INDEX = self.__vocabulary_index
-        self.__vocabulary[word] = CURRENT_INDEX
-        self.__reverse_vocabulary[CURRENT_INDEX] = word
+    def add_special_word_to_vocabulary(self, word: str):
+        CANDIDATE_INDEX = self.__next_id
+        self.__vocabulary[word] = CANDIDATE_INDEX
+        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

     def encode(self, word: str) -> list[int]:
         ID = self.__vocabulary.get(word)
@@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder):

         return [ID]

-    def decode(self, token_id: int) -> str:
+    def decode(self, token_id: list[int]) -> str:

-        ID = token_id
+        if len(token_id) != 1:
+            raise OutOfDictionaryException()
+
+        ID = token_id[0]
         WORD = self.__reverse_vocabulary.get(ID)

         if WORD is None:
             raise OutOfDictionaryException()

         return WORD
-
-    def get_reverse_vocabulary(self)-> dict[int, str]:
-        return self.__reverse_vocabulary
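Note on the ID layout implied by the reworked NanoSocratesSpecial: special tokens are appended after the BPE range, starting at bpe_vocabulary_size + 1. A small sketch of the allocation arithmetic (the size and token strings are hypothetical, not the project's real values):

    BPE_VOCABULARY_SIZE = 1473  # hypothetical: 256 byte IDs + 1217 merge tokens
    special_tokens = ["<SUBJ>", "<PRED>", "<OBJ>"]  # hypothetical strings

    ids = {
        token: BPE_VOCABULARY_SIZE + index + 1  # same formula as CANDIDATE_ID above
        for index, token in enumerate(special_tokens)
    }
    # {'<SUBJ>': 1474, '<PRED>': 1475, '<OBJ>': 1476}

Because __next_id is derived from the current vocabulary length, add_special_word_to_vocabulary keeps allocating contiguous IDs after the ones assigned in the constructor.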
diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index 6e0abc2..02a8ccf 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -31,7 +31,8 @@ class NanoSocratesSplitter:
                 bpe_end = special_token_start
                 BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
                 if BPE_TOKEN_TEXT != "":
-                    yield (BPE_TOKEN_TEXT, TokenType.BPE)
+                    for WORD in self.__split_words(BPE_TOKEN_TEXT):
+                        yield (WORD, TokenType.BPE)

                 # FIND SPECIAL TOKEN
                 SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
@@ -60,6 +61,27 @@ class NanoSocratesSplitter:
         #     eof = len(corpus)
         #     yield(eof,eof)

+    def __split_words(self, bpe_piece: str) -> Generator[str]:
+
+        END_OF_STRING = len(bpe_piece)
+        bound_start = 0
+        bound_end = END_OF_STRING + 1
+        for i in range(0, END_OF_STRING):
+
+            CANDIDATE_CHAR = bpe_piece[i]
+
+            if CANDIDATE_CHAR != " ":
+                continue
+
+            bound_end = i
+
+            yield bpe_piece[bound_start:bound_end]
+
+            bound_start = bound_end
+            bound_end = END_OF_STRING + 1
+
+        yield bpe_piece[bound_start:bound_end]
+
     def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

         not_special_token_list: list[int] = []
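Note on the new __split_words generator: it splits a BPE span on spaces, but the boundary bookkeeping keeps each space attached to the word that follows it, so only the first word comes out bare. A behavior-equivalent simplified sketch (assuming the same space-only delimiter):

    def split_words(text: str):
        # Mirrors __split_words: yield up to each space, then restart *at* the
        # space, so every word after the first keeps its leading space.
        start = 0
        for i, char in enumerate(text):
            if char != " ":
                continue
            yield text[start:i]
            start = i
        yield text[start:]

    print(list(split_words("the cat sat")))  # ['the', ' cat', ' sat']

Keeping the leading space on later words preserves word boundaries inside the BPE vocabulary, in the same spirit as GPT-style " word" pieces.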
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index c719219..f726a95 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -1,79 +1,56 @@
 from pathlib import Path

-from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
-from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
-from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
+from ..Classes import NanoSocratesSplitter
+from ..Classes import NanoSocratesBPE
+from ..Classes import NanoSocratesSpecial
+
+from ..Utils import special_regex_maker
+from ..Enums import TokenType
+

-from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
-from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
-from Project_Model.Libs.BPE.Enums import TokenType
-from Project_Model.Libs.BPE.Utils.json_utils import load_json

 class TokeNanoCore:

-    def __init__(self,
-                 bpe_vocabulary: dict[tuple[int, int], int]
-                 # special_vocabulary: dict[str, int]
-                 ):
-        self._bpe = NanoSocratesBPE(bpe_vocabulary)
-
-        # special_vocabulary = [token.value for token in SpecialToken]
-        special_token_list = [token.value for token in SpecialToken]
-        self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list),self._bpe.vocabulary_size)
+    def __init__(
+        self,
+        bpe_vocabulary: dict[tuple[int, int], int],
+        special_token_list: list[str],
+        # special_vocabulary: dict[str, int]
+    ):

-        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None) # technically its not a bpe but more something like an "autoencoder"
-        self.prepare_special_token_vocabulary()
-
-
-    def encode(self, corpus : str) -> list[int]:
-        output : list[int] = []
-        for piece, token_type in self._splitter.split_text(corpus):
+        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
+
+        SPECIAL_REGEX = special_regex_maker(special_token_list)
+        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
+
+        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
+        self.__special_encoder = NanoSocratesSpecial(
+            BPE_VOCABULARY_SIZE, special_token_list
+        )
+
+    def encode(self, corpus: str) -> list[int]:
+        output: list[int] = []
+        for piece, token_type in self.__splitter.split_text(corpus):

             if token_type == TokenType.SPECIAL:
-                output.extend(self._special_bpe.encode(piece))
+                output.extend(self.__special_encoder.encode(piece))

             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self._bpe.encode(piece))
+                output.extend(self.__bpe_encoder.encode(piece))

         return output

-
-    def decode(self, corpus : list[int])-> str:
-        output_str = ''
-        for token, token_type in self._splitter.split_tokens(corpus):
+    def decode(self, corpus: list[int]) -> str:
+        output_str = ""
+        for token, token_type in self.__splitter.split_tokens(corpus):

             # token is an integer if special, a list of integer otherwise
             if token_type == TokenType.SPECIAL:
-                output_str += self._special_bpe.decode(token) # it accept an integer
+                output_str += self.__special_encoder.decode(
+                    token
+                )  # it accepts a single-element list

             # slow but clear
             if token_type == TokenType.BPE:
-                output_str += self._bpe.decode(token) # it accept a list of integer
+                output_str += self.__bpe_encoder.decode(
+                    token
+                )  # it accepts a list of integers

         return output_str
-
-
-
-    def prepare_special_token_vocabulary(self):
-        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
-
-        for special_token in [token.value for token in SpecialToken]:
-            self._special_bpe.add_special_word_to_vocabulary(special_token)
-
-        self._special_bpe.build_reverse_vocabulary()
-
-
-if __name__ == "__main__":
-    dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json"
-    dictionary = load_json(Path(dictionary_path))
-
-    tokenano = TokeNanoCore(dictionary)
-
-    corpus = "dbp-dbr:How_It_Should_Have_Ended"
-    print(corpus)
-
-    encoded_list = tokenano.encode(corpus)
-    print(encoded_list)
-
-    decoded_string = tokenano.decode(encoded_list)
-    print(decoded_string)
-
-# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
\ No newline at end of file
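With the constructor now taking the special-token list explicitly, wiring TokeNanoCore up looks roughly like this (a sketch: the dictionary path comes from the removed __main__ block, and it assumes load_json returns the dict[tuple[int, int], int] shape NanoSocratesBPE expects):

    from pathlib import Path

    from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
    from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
    from Project_Model.Libs.BPE.Utils.json_utils import load_json

    dictionary = load_json(Path("Assets/Dataset/Tmp/toy_dictionary.json"))
    special_token_list = [token.value for token in SpecialToken]

    tokenano = TokeNanoCore(dictionary, special_token_list)

    encoded = tokenano.encode("dbp-dbr:How_It_Should_Have_Ended")
    decoded = tokenano.decode(encoded)
    assert decoded == "dbp-dbr:How_It_Should_Have_Ended"

This keeps the tokenizer core free of the Scripts.Libs.CleaningPipeline dependency: the caller decides which SpecialToken enum (or any other list of strings) to inject.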
diff --git a/Project_Model/Libs/BPE/Enums/SpecialToken.py b/Project_Model/Libs/BPE/Enums/SpecialToken.py
new file mode 100644
index 0000000..3f25a2d
--- /dev/null
+++ b/Project_Model/Libs/BPE/Enums/SpecialToken.py
@@ -0,0 +1,21 @@
+from enum import Enum
+
+
+class SpecialToken(Enum):
+    # (Enum, str) -> throws an error
+    START_TRIPLE_LIST = ""
+    START_TRIPLE = ""
+    END_TRIPLE = ""
+    SUBJECT = ""
+    RELATIONSHIP = ""
+    OBJECT = ""
+    ABSTRACT = ""
+    CORPUS_END = ""
+
+    ## Tasks' Token
+    RDF_TO_TEXT = ""
+    TEXT_TO_RDF = ""
+    CONTINUE_RDF = ""
+    MASK = ""
+
+    # BPE Training:
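The comment "(Enum, str) -> throws an error" refers to Python's enum mixin rules: for a string-valued enum the data type must come before Enum in the bases, otherwise class creation raises a TypeError. A short illustration (the "<MASK>" literal is a placeholder, not the project's real token string):

    from enum import Enum

    class WithMixin(str, Enum):  # data type first, Enum last: valid
        MASK = "<MASK>"          # placeholder value

    # class Broken(Enum, str):   # TypeError at class creation time
    #     MASK = "<MASK>"

    print(WithMixin.MASK == "<MASK>")                # True: members compare as str
    tokens = [member.value for member in WithMixin]  # how TokeNanoCore consumes the enum

With the plain Enum used above, callers must read .value explicitly (as TokeNanoCore does); a str mixin would let the members be used as strings directly.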