# 2025-10-03 01:04:06 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
|
|
|
|
|
from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
|
|
|
|
|
from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
|
|
|
|
|
|
|
|
|
|
from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
|
|
|
|
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
|
|
|
|
from Project_Model.Libs.BPE.Enums import TokenType
|
|
|
|
|
from Project_Model.Libs.BPE.Utils.json_utils import load_json
|
|
|
|
|
class TokeNanoCore:
    """Tokenizer facade combining a byte-pair encoder with a special-token codec.

    Splits raw text into special-token pieces and plain-text pieces, routing
    each piece to the matching encoder/decoder so that both share a single,
    non-overlapping token-id space (special ids start after the last BPE id).
    """

    def __init__(self,
                 bpe_vocabulary: dict[tuple[int, int], int]
                 ):
        """Build the tokenizer from a trained BPE merge table.

        Args:
            bpe_vocabulary: mapping of (left_id, right_id) pairs to the merged
                token id, as produced by BPE training.
        """
        self._bpe = NanoSocratesBPE(bpe_vocabulary)

        # The splitter recognizes special tokens via a single combined regex.
        special_token_list = [token.value for token in SpecialToken]
        self._splitter = NanoSocratesSplitter(
            special_regex_maker(special_token_list),
            self._bpe.vocabulary_size,
        )

        # Technically not a BPE: it maps whole special words to single ids,
        # more like a lookup-table "autoencoder".
        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None)
        self.prepare_special_token_vocabulary()

    def encode(self, corpus: str) -> list[int]:
        """Encode *corpus* into a flat list of token ids.

        Pieces whose type is neither SPECIAL nor BPE are silently skipped,
        mirroring the splitter's contract.
        """
        output: list[int] = []
        for piece, token_type in self._splitter.split_text(corpus):
            # slow but clear
            if token_type == TokenType.SPECIAL:
                output.extend(self._special_bpe.encode(piece))
            elif token_type == TokenType.BPE:
                output.extend(self._bpe.encode(piece))
        return output

    def decode(self, corpus: list[int]) -> str:
        """Decode a list of token ids back into text."""
        # Collect parts and join once: avoids quadratic str concatenation.
        parts: list[str] = []
        for token, token_type in self._splitter.split_tokens(corpus):
            # token is an integer if special, a list of integers otherwise
            if token_type == TokenType.SPECIAL:
                parts.append(self._special_bpe.decode(token))  # accepts an integer
            elif token_type == TokenType.BPE:
                parts.append(self._bpe.decode(token))  # accepts a list of integers
        return "".join(parts)

    def prepare_special_token_vocabulary(self):
        """Register every SpecialToken above the BPE id range and build the
        reverse lookup used by decoding."""
        # Special ids start right after the last BPE id so the spaces don't clash.
        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)

        for special_token in [token.value for token in SpecialToken]:
            self._special_bpe.add_special_word_to_vocabulary(special_token)

        self._special_bpe.build_reverse_vocabulary()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: load a toy merge table and round-trip one RDF-style string.
    vocab_path = Path("Assets/Dataset/Tmp/toy_dictionary.json")
    merge_table = load_json(vocab_path)

    tokenizer = TokeNanoCore(merge_table)

    sample = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
    print(sample)

    token_ids = tokenizer.encode(sample)
    print(token_ids)

    round_tripped = tokenizer.decode(token_ids)
    print(round_tripped)

    # Expected ids for the sample above:
    # [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
|