# 2025-10-03 01:04:06 +02:00
|
|
|
from pathlib import Path
|
|
|
|
|
|
|
|
|
|
from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
|
|
|
|
|
from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
|
|
|
|
|
from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
|
|
|
|
|
|
|
|
|
|
from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
|
|
|
|
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
|
|
|
|
from Project_Model.Libs.BPE.Enums import TokenType
|
|
|
|
|
from Project_Model.Libs.BPE.Utils.json_utils import load_json
|
|
|
|
|
class TokeNanoCore:
    """Tokenizer facade combining a byte-pair encoder with a special-token codec.

    Splits raw text into special-token pieces and plain-text pieces, routing
    each piece to the matching encoder/decoder so that both share a single,
    non-overlapping token-id space (special ids start after the last BPE id).
    """

    def __init__(self,
                 bpe_vocabulary: dict[tuple[int, int], int]
                 ):
        """Build the tokenizer from a trained BPE merge table.

        Args:
            bpe_vocabulary: mapping of (left_id, right_id) pairs to the merged
                token id, as produced by BPE training.
        """
        self._bpe = NanoSocratesBPE(bpe_vocabulary)

        # The splitter recognizes special tokens via a single combined regex.
        special_token_list = [token.value for token in SpecialToken]
        self._splitter = NanoSocratesSplitter(
            special_regex_maker(special_token_list),
            self._bpe.vocabulary_size,
        )

        # Technically not a BPE: it maps whole special words to single ids,
        # more like a lookup-table "autoencoder".
        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None)
        self.prepare_special_token_vocabulary()

    def encode(self, corpus: str) -> list[int]:
        """Encode *corpus* into a flat list of token ids.

        Pieces whose type is neither SPECIAL nor BPE are silently skipped,
        mirroring the splitter's contract.
        """
        output: list[int] = []
        for piece, token_type in self._splitter.split_text(corpus):
            # slow but clear
            if token_type == TokenType.SPECIAL:
                output.extend(self._special_bpe.encode(piece))
            elif token_type == TokenType.BPE:
                output.extend(self._bpe.encode(piece))
        return output

    def decode(self, corpus: list[int]) -> str:
        """Decode a list of token ids back into text."""
        # Collect parts and join once: avoids quadratic str concatenation.
        parts: list[str] = []
        for token, token_type in self._splitter.split_tokens(corpus):
            # token is an integer if special, a list of integers otherwise
            if token_type == TokenType.SPECIAL:
                parts.append(self._special_bpe.decode(token))  # accepts an integer
            elif token_type == TokenType.BPE:
                parts.append(self._bpe.decode(token))  # accepts a list of integers
        return "".join(parts)

    def prepare_special_token_vocabulary(self):
        """Register every SpecialToken above the BPE id range and build the
        reverse lookup used by decoding."""
        # Special ids start right after the last BPE id so the spaces don't clash.
        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)

        for special_token in [token.value for token in SpecialToken]:
            self._special_bpe.add_special_word_to_vocabulary(special_token)

        self._special_bpe.build_reverse_vocabulary()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: load a toy merge table and round-trip one RDF-style string.
    vocab_path = Path("Assets/Dataset/Tmp/toy_dictionary.json")
    merge_table = load_json(vocab_path)

    tokenizer = TokeNanoCore(merge_table)

    sample = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
    print(sample)

    token_ids = tokenizer.encode(sample)
    print(token_ids)

    round_tripped = tokenizer.decode(token_ids)
    print(round_tripped)

    # Expected ids for the sample above:
    # [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
|