diff --git a/Project_Model/Libs/BPE/Classes/TokeNano.py b/Project_Model/Libs/BPE/Classes/TokeNano.py
index e69de29..1088f7d 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNano.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNano.py
@@ -0,0 +1,8 @@
+
+from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
+
+class TokeNano:
+
+    def __init__(self):
+
+        pass
\ No newline at end of file
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index e69de29..c719219 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+
+from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
+from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
+from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
+
+from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Project_Model.Libs.BPE.Enums import TokenType
+from Project_Model.Libs.BPE.Utils.json_utils import load_json
+class TokeNanoCore:
+    def __init__(self,
+                 bpe_vocabulary: dict[tuple[int, int], int]
+                 # special_vocabulary: dict[str, int]
+                 ):
+        self._bpe = NanoSocratesBPE(bpe_vocabulary)
+
+        # special_vocabulary = [token.value for token in SpecialToken]
+        special_token_list = [token.value for token in SpecialToken]
+        self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list), self._bpe.vocabulary_size)
+
+        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None)  # technically it's not a BPE, but more like an "autoencoder"
+        self.prepare_special_token_vocabulary()
+
+
+    def encode(self, corpus: str) -> list[int]:
+        output: list[int] = []
+        for piece, token_type in self._splitter.split_text(corpus):
+
+            if token_type == TokenType.SPECIAL:
+                output.extend(self._special_bpe.encode(piece))
+
+            # slow but clear
+            if token_type == TokenType.BPE:
+                output.extend(self._bpe.encode(piece))
+
+        return output
+
+
+
+    def decode(self, corpus: list[int]) -> str:
+        output_str = ''
+        for token, token_type in self._splitter.split_tokens(corpus):
+            # token is an integer if special, a list of integers otherwise
+            if token_type == TokenType.SPECIAL:
+                output_str += self._special_bpe.decode(token)  # it accepts an integer
+
+            # slow but clear
+            if token_type == TokenType.BPE:
+                output_str += self._bpe.decode(token)  # it accepts a list of integers
+        return output_str
+
+
+
+    def prepare_special_token_vocabulary(self):
+        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
+
+        for special_token in [token.value for token in SpecialToken]:
+            self._special_bpe.add_special_word_to_vocabulary(special_token)
+
+        self._special_bpe.build_reverse_vocabulary()
+
+
+if __name__ == "__main__":
+    dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json"
+    dictionary = load_json(Path(dictionary_path))
+
+    tokenano = TokeNanoCore(dictionary)
+
+    corpus = "dbp-dbr:How_It_Should_Have_Ended"
+    print(corpus)
+
+    encoded_list = tokenano.encode(corpus)
+    print(encoded_list)
+
+    decoded_string = tokenano.decode(encoded_list)
+    print(decoded_string)
+
+# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
\ No newline at end of file
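
Reviewer note (not part of the diff): since TokeNanoCore routes SPECIAL pieces and BPE pieces through two different coders, the property worth checking first is that encode() and decode() are exact inverses. Below is a minimal round-trip sketch, assuming the same toy_dictionary.json path used by the __main__ demo and that load_json returns the dict[tuple[int, int], int] merge table the constructor expects.

# Round-trip smoke test for TokeNanoCore (a sketch; the dictionary path and
# vocabulary shape are assumptions carried over from the __main__ demo above).
from pathlib import Path

from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
from Project_Model.Libs.BPE.Utils.json_utils import load_json


def assert_round_trip(tokenizer: TokeNanoCore, text: str) -> None:
    # encode() should yield plain ints; decode() should reproduce the input.
    tokens = tokenizer.encode(text)
    assert all(isinstance(token, int) for token in tokens)
    assert tokenizer.decode(tokens) == text, f"round trip failed for {text!r}"


if __name__ == "__main__":
    vocabulary = load_json(Path("Assets/Dataset/Tmp/toy_dictionary.json"))
    tokenizer = TokeNanoCore(vocabulary)

    # One input that contains special tokens, one that exercises only the BPE path.
    assert_round_trip(tokenizer, "dbp-dbr:How_It_Should_Have_Ended")
    assert_round_trip(tokenizer, "plain text with no special tokens")
    print("round trip OK")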