Fix bugs and semantics

2025-10-03 13:26:58 +02:00
parent 6b9cb7cd35
commit c5c0c61f79
5 changed files with 134 additions and 129 deletions
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -1,47 +1,46 @@
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException

+
 class NanoSocratesSpecial(Encoder):

    def __init__(
-        self,
-        vocabulary_index: int ,
-        vocabulary: dict[str, int] | None = None
-        ) -> None:
-        
+        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
+    ) -> None:
+
        super().__init__()

-        if vocabulary is None:
-            self.__vocabulary: dict[str, int] = {}
-        else:
-            self.__vocabulary:  dict[str, int] = vocabulary
-        
+        self.__bpe_offset = bpe_vocabulary_size
+        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

-        if vocabulary_index is None:
-            self.__vocabulary_index = 0
-        else:
-            self.__vocabulary_index = vocabulary_index
+        if len(special_tokens) == 0:
+            return

-        # self.__build_reverse_vocabulary()
+        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):

+            CANDIDATE_ID = self.__bpe_offset + index + 1
+            self.__vocabulary[TOKEN] = CANDIDATE_ID
+            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

+    @property
+    def __next_id(self):
+        BPE_OFFSET = self.__bpe_offset
+        VOC_LENGTH = len(self.__vocabulary)
+        return BPE_OFFSET + VOC_LENGTH + 1

-    def build_reverse_vocabulary(self):
-        self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()}
+    @property
+    def vocabulary(self) -> dict[str, int]:
+        return self.__vocabulary

-    # @property
-    # def vocabulary_size(self):
-    #     return self.__current_index
+    @property
+    def reverse_vocabulary(self) -> dict[int, str]:
+        return self.__reverse_vocabulary

-    def set_vocabulary_index(self, vocabulary_index: int):
-        self.__vocabulary_index = vocabulary_index
-
-    def add_special_word_to_vocabulary(self, word:str):
-        self.__vocabulary_index = self.__vocabulary_index + 1
-        CURRENT_INDEX = self.__vocabulary_index
-        self.__vocabulary[word] = CURRENT_INDEX
-        self.__reverse_vocabulary[CURRENT_INDEX] = word
+    def add_special_word_to_vocabulary(self, word: str):
+        CANDIDATE_INDEX = self.__next_id
+        self.__vocabulary[word] = CANDIDATE_INDEX
+        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

    def encode(self, word: str) -> list[int]:
        ID = self.__vocabulary.get(word)
@@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder):

        return [ID]

-    def decode(self, token_id: int) -> str:
+    def decode(self, token_id: list[int]) -> str:

-        ID = token_id
+        if len(token_id) != 1:
+            raise OutOfDictionaryException()
+
+        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD
-
-    def get_reverse_vocabulary(self)-> dict[int, str]:
-        return self.__reverse_vocabulary