Implemented TokeNano for BPE encoding/decoding

This commit is contained in:
GassiGiuseppe 2025-10-03 01:04:06 +02:00
parent 8121c75a09
commit 070dc1b744
2 changed files with 87 additions and 0 deletions

View File

@ -0,0 +1,8 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TokeNano:
    """Public facade for the NanoSocrates BPE tokenizer (wiring pending)."""

    def __init__(self) -> None:
        # Placeholder: connection to TokeNanoCore is not implemented yet.
        pass

View File

@ -0,0 +1,79 @@
from pathlib import Path
from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.BPE.Enums import TokenType
from Project_Model.Libs.BPE.Utils.json_utils import load_json
class TokeNanoCore:
    """Orchestrate BPE and special-token encoding/decoding.

    Text is first split into special-token spans and ordinary spans;
    each span is then delegated to the matching encoder/decoder
    (``NanoSocratesSpecial`` or ``NanoSocratesBPE``).
    """

    def __init__(self,
                 bpe_vocabulary: dict[tuple[int, int], int]
                 ):
        """Build the tokenizer from a BPE merge table.

        Args:
            bpe_vocabulary: mapping of merge pairs to merged token ids,
                consumed by ``NanoSocratesBPE``.
        """
        self._bpe = NanoSocratesBPE(bpe_vocabulary)

        # Splitter separates special tokens from plain text; special-token
        # ids are assigned starting right after the BPE vocabulary range.
        special_token_list = [token.value for token in SpecialToken]
        self._splitter = NanoSocratesSplitter(
            special_regex_maker(special_token_list),
            self._bpe.vocabulary_size,
        )

        # Technically not a BPE: a one-to-one lookup for special tokens,
        # more like an "autoencoder" over the special vocabulary.
        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None)
        self.prepare_special_token_vocabulary()

    def encode(self, corpus: str) -> list[int]:
        """Encode *corpus* into a flat list of token ids."""
        output: list[int] = []
        for piece, token_type in self._splitter.split_text(corpus):
            # Branches are mutually exclusive, so elif avoids a second test.
            if token_type == TokenType.SPECIAL:
                output.extend(self._special_bpe.encode(piece))
            elif token_type == TokenType.BPE:
                output.extend(self._bpe.encode(piece))
        return output

    def decode(self, corpus: list[int]) -> str:
        """Decode a list of token ids back into a string."""
        # Collect pieces and join once at the end: repeated str += in a
        # loop is quadratic in the worst case.
        pieces: list[str] = []
        for token, token_type in self._splitter.split_tokens(corpus):
            # token is an int if special, a list of ints otherwise
            if token_type == TokenType.SPECIAL:
                pieces.append(self._special_bpe.decode(token))  # accepts an int
            elif token_type == TokenType.BPE:
                pieces.append(self._bpe.decode(token))  # accepts a list of ints
        return "".join(pieces)

    def prepare_special_token_vocabulary(self) -> None:
        """Register every SpecialToken value after the BPE id range."""
        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
        for special_token in [token.value for token in SpecialToken]:
            self._special_bpe.add_special_word_to_vocabulary(special_token)
        self._special_bpe.build_reverse_vocabulary()
if __name__ == "__main__":
    # Smoke test: round-trip a sample corpus through the tokenizer.
    vocab_file = Path("Assets/Dataset/Tmp/toy_dictionary.json")
    merge_table = load_json(vocab_file)
    core = TokeNanoCore(merge_table)

    sample = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
    print(sample)

    token_ids = core.encode(sample)
    print(token_ids)

    round_tripped = core.decode(token_ids)
    print(round_tripped)
    # Expected ids for the sample corpus with the toy dictionary:
    # [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]