From c5c0c61f797773a96f1a3fe582e8998c5d5254cd Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 13:26:58 +0200
Subject: [PATCH] Fix bugs and clarify semantics

---
 .../Libs/BPE/Classes/NanoSocratesBPE.py      | 56 ++++-------
 .../Libs/BPE/Classes/NanoSocratesSpecial.py  | 65 ++++++-------
 .../Libs/BPE/Classes/NanoSocratesSplitter.py | 24 ++++-
 .../Libs/BPE/Classes/TokeNanoCore.py         | 97 +++++++------------
 Project_Model/Libs/BPE/Enums/SpecialToken.py | 21 ++++
 5 files changed, 134 insertions(+), 129 deletions(-)
 create mode 100644 Project_Model/Libs/BPE/Enums/SpecialToken.py

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
index baa5efd..d517f04 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -2,20 +2,18 @@ from collections import deque
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException, DuplicateWordException
+
 # ABOUT THE DICTIONARY:
 # the string is converted into utf-char bytes, that is: each char is rappresented with a set of bytes from 1 to 4.
 # each bytes get casted into an integer; such that, if an integer has its value lower then 256,
 # then it is rappresenting an utf-char-byte, otherwise it is a token-ID.
 class NanoSocratesBatchMemoryBPE:
-    """ Memory to batch training. Keeps token couple frequencies, and merge_treshold
-    """
+    """Memory for batch training: keeps token-pair frequencies and merge_treshold."""

     def __init__(
-        self,
-        frequencies: dict[tuple[int, int], int],
-        merge_treshold: int
+        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
     ) -> None:
-
+
         self.frequencies = frequencies
         self.merge_treshold = merge_treshold

@@ -39,7 +37,6 @@ class NanoSocratesBPE(Encoder):
             self.__vocabulary[key] = value
             self.__reverse_vocabulary[value] = key

-
     @property
     def vocabulary_size(self):
         return len(self.__vocabulary) + 256
@@ -62,7 +59,7 @@ class NanoSocratesBPE(Encoder):
         self,
         chunk_data: list[int],
         memory: NanoSocratesBatchMemoryBPE,
-        last_batch: bool
+        last_batch: bool,
     ):

         ENCODED_CHUNK = self.encode_intermediate(chunk_data)
@@ -70,7 +67,7 @@ class NanoSocratesBPE(Encoder):

         # update frequency of each couple of element
         for i in range(0, DATA_LEN_BEFORE_LAST):
-            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1])
+            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])

             frequency = memory.frequencies.get(CANDIDATE_COUPLE)

@@ -82,7 +79,6 @@ class NanoSocratesBPE(Encoder):
             frequency += 1
             memory.frequencies[CANDIDATE_COUPLE] = frequency

-
         if not last_batch:
             return (self, memory, ENCODED_CHUNK)

@@ -100,9 +96,6 @@ class NanoSocratesBPE(Encoder):

         return (self, memory, ENCODED_CHUNK)

-
-
-
     def encode(self, piece: str) -> list[int]:
         """Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate()
         Args:
@@ -114,12 +107,12 @@ class NanoSocratesBPE(Encoder):
         converted_piece = list(piece.encode("utf-8"))
         return self.encode_intermediate(converted_piece)

     def encode_intermediate(self, piece: list[int]) -> list[int]:
-        """ Encode a piece (as list of integer) till its maximum
+        """Encode a piece (as a list of integers) until no more merges apply
         Args:
             piece (list[int]): piece to encode
         Returns:
-            list[int]: piece encoded
-        """
+            list[int]: piece encoded
+        """

         current_piece = piece
         new_piece = self.__round_encode(current_piece)
@@ -130,9 +123,8 @@ class NanoSocratesBPE(Encoder):

         return current_piece

-
     def __round_encode(self, piece: list[int]):
         """ A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n
         1) "ABAB" -> "XX"
         2) "XX" -> "Y"
         Args:
             piece (list[int]): piece to encode

         Returns:
             list[int]: the encoded piece
         """

         if len(piece) == 1:
             return piece

         PIECE_LENGTH = len(piece) - 1
-        NEW_PIECE : list[int]= []
+        NEW_PIECE: list[int] = []

         index = 0
         while index < PIECE_LENGTH:

-            CANDIDATE_WORD = (piece[index], piece[index + 1]) # take a tuple of consecutive element [int]
+            CANDIDATE_WORD = (
+                piece[index],
+                piece[index + 1],
+            )  # take a pair of consecutive elements [int]
             CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)

             # if no token to substitute the tuple, append the first element
             if CANDIDATE_TOKEN is None:
-                NEW_PIECE.append(piece[index]) 
+                NEW_PIECE.append(piece[index])
                 index += 1

                 # if the latter element of the tuple is the last element of the piece, append it
                 if index == PIECE_LENGTH:
-                    NEW_PIECE.append(piece[index]) 
+                    NEW_PIECE.append(piece[index])

                 continue

@@ -169,13 +164,10 @@ class NanoSocratesBPE(Encoder):
             NEW_PIECE.append(CANDIDATE_TOKEN)
             index += 2

-
         return NEW_PIECE

-
     # TODO: Remake decode to take a list of token IDs
     def decode(self, token_ids: list[int]) -> str:
-
         # deque: double ended queue
         token_stack: deque[int] = deque(token_ids)

@@ -185,19 +177,13 @@ class NanoSocratesBPE(Encoder):
             TOKEN_ID = token_stack.popleft()

             if TOKEN_ID < 256:
-                UTF_8_STRING_ARR.append(
-                    TOKEN_ID
-                )
+                UTF_8_STRING_ARR.append(TOKEN_ID)
                 continue

             left_token, right_token = self.__token_decode(TOKEN_ID)

-            token_stack.appendleft(
-                right_token
-            )
-            token_stack.appendleft(
-                left_token
-            )
+            token_stack.appendleft(right_token)
+            token_stack.appendleft(left_token)

         return UTF_8_STRING_ARR.decode("utf-8")

@@ -211,7 +197,7 @@ class NanoSocratesBPE(Encoder):
         return CANDIDATE_DECODED

     def __learn_word(self, words: tuple[int, int]):
-        """ learn a new couple of object in the vocabulary
+        """Learn a new token pair in the vocabulary

         Args:
             words (tuple[int, int]): the Pair of element to substitute with a new tokenID
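Note on the byte/token-ID convention used by NanoSocratesBPE: IDs below 256 are raw UTF-8 bytes, and IDs from 256 upward are merge tokens. The following minimal sketch (standalone, with hypothetical vocabulary entries, not the project's real classes) shows how one left-to-right merge pass works and why repeated rounds are needed; it appends the trailing element explicitly so a merge ending exactly at the last pair cannot drop the final byte:

    # Hypothetical vocabulary: (104, 105) = b"hi" -> 256, (256, 256) -> 257.
    piece = list("hihi".encode("utf-8"))  # [104, 105, 104, 105]
    vocabulary = {(104, 105): 256, (256, 256): 257}

    def round_encode(piece: list[int], vocabulary: dict[tuple[int, int], int]) -> list[int]:
        # One pass: replace each known consecutive pair with its token ID.
        out: list[int] = []
        i = 0
        while i < len(piece) - 1:
            token = vocabulary.get((piece[i], piece[i + 1]))
            if token is None:
                out.append(piece[i])
                i += 1
            else:
                out.append(token)
                i += 2
        if i == len(piece) - 1:  # unconsumed last element
            out.append(piece[i])
        return out

    once = round_encode(piece, vocabulary)   # [256, 256]
    twice = round_encode(once, vocabulary)   # [257]

Decoding reverses this by expanding every ID >= 256 back into its pair until only byte values remain, which is exactly what the deque-based decode above does.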
diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
index 8fe81bb..61d4741 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -1,47 +1,46 @@
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException

+
 class NanoSocratesSpecial(Encoder):

     def __init__(
-        self,
-        vocabulary_index: int ,
-        vocabulary: dict[str, int] | None = None
-    ) -> None:
-
+        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
+    ) -> None:
+
         super().__init__()

-        if vocabulary is None:
-            self.__vocabulary: dict[str, int] = {}
-        else:
-            self.__vocabulary: dict[str, int] = vocabulary
-
+        self.__bpe_offset = bpe_vocabulary_size
+        self.__vocabulary: dict[str, int] = {}
         self.__reverse_vocabulary: dict[int, str] = {}

-        if vocabulary_index is None:
-            self.__vocabulary_index = 0
-        else:
-            self.__vocabulary_index = vocabulary_index
+        if len(special_tokens) == 0:
+            return

-        # self.__build_reverse_vocabulary()
+        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):
+            CANDIDATE_ID = self.__bpe_offset + index + 1
+            self.__vocabulary[TOKEN] = CANDIDATE_ID
+            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

+    @property
+    def __next_id(self):
+        BPE_OFFSET = self.__bpe_offset
+        VOC_LENGTH = len(self.__vocabulary)
+        return BPE_OFFSET + VOC_LENGTH + 1

-    def build_reverse_vocabulary(self):
-        self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()}
+    @property
+    def vocabulary(self) -> dict[str, int]:
+        return self.__vocabulary

-    # @property
-    # def vocabulary_size(self):
-    #     return self.__current_index
+    @property
+    def reverse_vocabulary(self) -> dict[int, str]:
+        return self.__reverse_vocabulary

-    def set_vocabulary_index(self, vocabulary_index: int):
-        self.__vocabulary_index = vocabulary_index
-
-    def add_special_word_to_vocabulary(self, word:str):
-        self.__vocabulary_index = self.__vocabulary_index + 1
-        CURRENT_INDEX = self.__vocabulary_index
-        self.__vocabulary[word] = CURRENT_INDEX
-        self.__reverse_vocabulary[CURRENT_INDEX] = word
+    def add_special_word_to_vocabulary(self, word: str):
+        CANDIDATE_INDEX = self.__next_id
+        self.__vocabulary[word] = CANDIDATE_INDEX
+        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

     def encode(self, word: str) -> list[int]:
         ID = self.__vocabulary.get(word)
@@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder):

         return [ID]

-    def decode(self, token_id: int) -> str:
+    def decode(self, token_id: list[int]) -> str:

-        ID = token_id
+        if len(token_id) != 1:
+            raise OutOfDictionaryException()
+
+        ID = token_id[0]
         WORD = self.__reverse_vocabulary.get(ID)

         if WORD is None:
             raise OutOfDictionaryException()

         return WORD
-
-    def get_reverse_vocabulary(self)-> dict[int, str]:
-        return self.__reverse_vocabulary
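Note on the ID layout implied by the reworked NanoSocratesSpecial: special tokens are appended after the BPE range, starting at bpe_vocabulary_size + 1. A small sketch of the allocation arithmetic (the size and token strings are hypothetical, not the project's real values):

    BPE_VOCABULARY_SIZE = 1473  # hypothetical: 256 byte IDs + 1217 merge tokens
    special_tokens = ["<SUBJ>", "<PRED>", "<OBJ>"]  # hypothetical strings

    ids = {
        token: BPE_VOCABULARY_SIZE + index + 1  # same formula as CANDIDATE_ID above
        for index, token in enumerate(special_tokens)
    }
    # {'<SUBJ>': 1474, '<PRED>': 1475, '<OBJ>': 1476}

Because __next_id is derived from the current vocabulary length, add_special_word_to_vocabulary keeps allocating contiguous IDs after the ones assigned in the constructor.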
diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index 6e0abc2..02a8ccf 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -31,7 +31,8 @@ class NanoSocratesSplitter:
                 bpe_end = special_token_start
                 BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
                 if BPE_TOKEN_TEXT != "":
-                    yield (BPE_TOKEN_TEXT, TokenType.BPE)
+                    for WORD in self.__split_words(BPE_TOKEN_TEXT):
+                        yield (WORD, TokenType.BPE)

                 # FIND SPECIAL TOKEN
                 SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
@@ -60,6 +61,27 @@ class NanoSocratesSplitter:
         #     eof = len(corpus)
         #     yield(eof,eof)

+    def __split_words(self, bpe_piece: str) -> Generator[str]:
+
+        END_OF_STRING = len(bpe_piece)
+        bound_start = 0
+        bound_end = END_OF_STRING + 1
+        for i in range(0, END_OF_STRING):
+
+            CANDIDATE_CHAR = bpe_piece[i]
+
+            if CANDIDATE_CHAR != " ":
+                continue
+
+            bound_end = i
+
+            yield bpe_piece[bound_start:bound_end]
+
+            bound_start = bound_end
+            bound_end = END_OF_STRING + 1
+
+        yield bpe_piece[bound_start:bound_end]
+
     def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

         not_special_token_list: list[int] = []
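Note on the new __split_words generator: it splits a BPE span on spaces, but the boundary bookkeeping keeps each space attached to the word that follows it, so only the first word comes out bare. A behavior-equivalent simplified sketch (assuming the same space-only delimiter):

    def split_words(text: str):
        # Mirrors __split_words: yield up to each space, then restart *at* the
        # space, so every word after the first keeps its leading space.
        start = 0
        for i, char in enumerate(text):
            if char != " ":
                continue
            yield text[start:i]
            start = i
        yield text[start:]

    print(list(split_words("the cat sat")))  # ['the', ' cat', ' sat']

Keeping the leading space on later words preserves word boundaries inside the BPE vocabulary, in the same spirit as GPT-style " word" pieces.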
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index c719219..f726a95 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -1,79 +1,56 @@
 from pathlib import Path

-from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
-from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
-from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
+from ..Classes import NanoSocratesSplitter
+from ..Classes import NanoSocratesBPE
+from ..Classes import NanoSocratesSpecial
+
+from ..Utils import special_regex_maker
+from ..Enums import TokenType
+

-from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
-from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
-from Project_Model.Libs.BPE.Enums import TokenType
-from Project_Model.Libs.BPE.Utils.json_utils import load_json

 class TokeNanoCore:

-    def __init__(self,
-                 bpe_vocabulary: dict[tuple[int, int], int]
-                 # special_vocabulary: dict[str, int]
-                 ):
-        self._bpe = NanoSocratesBPE(bpe_vocabulary)
-
-        # special_vocabulary = [token.value for token in SpecialToken]
-        special_token_list = [token.value for token in SpecialToken]
-        self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list),self._bpe.vocabulary_size)
+    def __init__(
+        self,
+        bpe_vocabulary: dict[tuple[int, int], int],
+        special_token_list: list[str],
+        # special_vocabulary: dict[str, int]
+    ):

-        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None) # technically its not a bpe but more something like an "autoencoder"
-        self.prepare_special_token_vocabulary()
-
-
-    def encode(self, corpus : str) -> list[int]:
-        output : list[int] = []
-        for piece, token_type in self._splitter.split_text(corpus):
+        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
+
+        SPECIAL_REGEX = special_regex_maker(special_token_list)
+        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
+
+        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
+        self.__special_encoder = NanoSocratesSpecial(
+            BPE_VOCABULARY_SIZE, special_token_list
+        )
+
+    def encode(self, corpus: str) -> list[int]:
+        output: list[int] = []
+        for piece, token_type in self.__splitter.split_text(corpus):

             if token_type == TokenType.SPECIAL:
-                output.extend(self._special_bpe.encode(piece))
+                output.extend(self.__special_encoder.encode(piece))

             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self._bpe.encode(piece))
+                output.extend(self.__bpe_encoder.encode(piece))

         return output

-
-    def decode(self, corpus : list[int])-> str:
-        output_str = ''
-        for token, token_type in self._splitter.split_tokens(corpus):
+    def decode(self, corpus: list[int]) -> str:
+        output_str = ""
+        for token, token_type in self.__splitter.split_tokens(corpus):

             # token is an integer if special, a list of integer otherwise
             if token_type == TokenType.SPECIAL:
-                output_str += self._special_bpe.decode(token) # it accept an integer
+                output_str += self.__special_encoder.decode(
+                    token
+                )  # it accepts a single-element list

             # slow but clear
             if token_type == TokenType.BPE:
-                output_str += self._bpe.decode(token) # it accept a list of integer
+                output_str += self.__bpe_encoder.decode(
+                    token
+                )  # it accepts a list of integers

         return output_str
-
-
-
-    def prepare_special_token_vocabulary(self):
-        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
-
-        for special_token in [token.value for token in SpecialToken]:
-            self._special_bpe.add_special_word_to_vocabulary(special_token)
-
-        self._special_bpe.build_reverse_vocabulary()
-
-
-if __name__ == "__main__":
-    dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json"
-    dictionary = load_json(Path(dictionary_path))
-
-    tokenano = TokeNanoCore(dictionary)
-
-    corpus = "dbp-dbr:How_It_Should_Have_Ended"
-    print(corpus)
-
-    encoded_list = tokenano.encode(corpus)
-    print(encoded_list)
-
-    decoded_string = tokenano.decode(encoded_list)
-    print(decoded_string)
-
-# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
\ No newline at end of file
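With the constructor now taking the special-token list explicitly, wiring TokeNanoCore up looks roughly like this (a sketch: the dictionary path comes from the removed __main__ block, and it assumes load_json returns the dict[tuple[int, int], int] shape NanoSocratesBPE expects):

    from pathlib import Path

    from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
    from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
    from Project_Model.Libs.BPE.Utils.json_utils import load_json

    dictionary = load_json(Path("Assets/Dataset/Tmp/toy_dictionary.json"))
    special_token_list = [token.value for token in SpecialToken]

    tokenano = TokeNanoCore(dictionary, special_token_list)

    encoded = tokenano.encode("dbp-dbr:How_It_Should_Have_Ended")
    decoded = tokenano.decode(encoded)
    assert decoded == "dbp-dbr:How_It_Should_Have_Ended"

This keeps the tokenizer core free of the Scripts.Libs.CleaningPipeline dependency: the caller decides which SpecialToken enum (or any other list of strings) to inject.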
diff --git a/Project_Model/Libs/BPE/Enums/SpecialToken.py b/Project_Model/Libs/BPE/Enums/SpecialToken.py
new file mode 100644
index 0000000..3f25a2d
--- /dev/null
+++ b/Project_Model/Libs/BPE/Enums/SpecialToken.py
@@ -0,0 +1,21 @@
+from enum import Enum
+
+
+class SpecialToken(Enum):
+    # (Enum, str) -> throws an error
+    START_TRIPLE_LIST = ""
+    START_TRIPLE = ""
+    END_TRIPLE = ""
+    SUBJECT = ""
+    RELATIONSHIP = ""
+    OBJECT = ""
+    ABSTRACT = ""
+    CORPUS_END = ""
+
+    ## Tasks' Token
+    RDF_TO_TEXT = ""
+    TEXT_TO_RDF = ""
+    CONTINUE_RDF = ""
+    MASK = ""
+
+    # BPE Training:
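The comment "(Enum, str) -> throws an error" refers to Python's enum mixin rules: for a string-valued enum the data type must come before Enum in the bases, otherwise class creation raises a TypeError. A short illustration (the "<MASK>" literal is a placeholder, not the project's real token string):

    from enum import Enum

    class WithMixin(str, Enum):  # data type first, Enum last: valid
        MASK = "<MASK>"          # placeholder value

    # class Broken(Enum, str):   # TypeError at class creation time
    #     MASK = "<MASK>"

    print(WithMixin.MASK == "<MASK>")                # True: members compare as str
    tokens = [member.value for member in WithMixin]  # how TokeNanoCore consumes the enum

With the plain Enum used above, callers must read .value explicitly (as TokeNanoCore does); a str mixin would let the members be used as strings directly.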