NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py

from .Encoder import Encoder
from ..Errors import OutOfDictionaryException


class NanoSocratesSpecial(Encoder):
    """Encoder mapping whole special tokens to single integer IDs.

    IDs are handed out sequentially, the first one being
    ``bpe_vocabulary_size + 1``. Every token owns exactly one ID and every
    ID exactly one token, so ``decode(encode(token)) == token`` holds for
    each registered token.

    NOTE(review): the ID ``bpe_vocabulary_size`` itself is never assigned by
    this class; presumably it is reserved elsewhere -- confirm with the
    BPE encoder.
    """

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
    ) -> None:
        """Create the encoder and register the initial special tokens.

        Args:
            bpe_vocabulary_size: Offset added to every special-token ID.
            special_tokens: Tokens to register, in order. Repeated entries
                are registered only once (first occurrence wins). The
                default list is only read, never mutated.
        """

        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        for token in special_tokens:
            self.__register(token)

    def __register(self, word: str) -> None:
        """Assign the next free ID to ``word`` unless it is already known."""
        # Re-registering a known word would overwrite its entry without
        # growing the vocabulary, so the next ID (derived from the
        # vocabulary length) would be handed out twice. Skipping keeps the
        # token <-> ID mapping one-to-one.
        if word in self.__vocabulary:
            return

        new_id = self.__next_id
        self.__vocabulary[word] = new_id
        self.__reverse_vocabulary[new_id] = word

    @property
    def __next_id(self):
        """ID the next newly registered token will receive."""
        BPE_OFFSET = self.__bpe_offset
        VOC_LENGTH = len(self.__vocabulary)
        return BPE_OFFSET + VOC_LENGTH + 1

    @property
    def vocabulary_size(self) -> int:
        """Number of registered special tokens."""
        return len(self.vocabulary)

    @property
    def vocabulary(self) -> dict[str, int]:
        """Token -> ID mapping (the internal dict itself, not a copy)."""
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        """ID -> token mapping (the internal dict itself, not a copy)."""
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str):
        """Register ``word`` as a special token.

        A word that is already registered keeps its existing ID; the call
        is then a no-op.
        """
        self.__register(word)

    def encode(self, word: str) -> list[int]:
        """Return the single-element ID list for ``word``.

        Raises:
            OutOfDictionaryException: If ``word`` is not a registered token.
        """
        ID = self.__vocabulary.get(word)

        if ID is None:
            raise OutOfDictionaryException()

        return [ID]

    def decode(self, token_id: list[int]) -> str:
        """Return the token for a single-element ID list.

        Raises:
            OutOfDictionaryException: If ``token_id`` does not contain
                exactly one ID, or that ID is not registered.
        """

        # A special token is always exactly one ID long.
        if len(token_id) != 1:
            raise OutOfDictionaryException()

        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD
Added Special Encoder 2025-09-28 18:03:47 +02:00			`from .Encoder import Encoder`
			`from ..Errors import OutOfDictionaryException`

Fix of bugs and semantics 2025-10-03 13:26:58 +02:00
Added Special Encoder 2025-09-28 18:03:47 +02:00			`class NanoSocratesSpecial(Encoder):`

			`def __init__(`
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`self, bpe_vocabulary_size: int, special_tokens: list[str] = []`
			`) -> None:`

Added Special Encoder 2025-09-28 18:03:47 +02:00			`super().__init__()`

Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`self.__bpe_offset = bpe_vocabulary_size`
			`self.__vocabulary: dict[str, int] = {}`
Added Special Encoder 2025-09-28 18:03:47 +02:00			`self.__reverse_vocabulary: dict[int, str] = {}`

Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`if len(special_tokens) == 0:`
			`return`
Added Special Encoder 2025-09-28 18:03:47 +02:00
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):`
Added Special Encoder 2025-09-28 18:03:47 +02:00
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`CANDIDATE_ID = self.__bpe_offset + index + 1`
			`self.__vocabulary[TOKEN] = CANDIDATE_ID`
			`self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN`
Added Special Encoder 2025-09-28 18:03:47 +02:00
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`@property`
			`def __next_id(self):`
			`BPE_OFFSET = self.__bpe_offset`
			`VOC_LENGTH = len(self.__vocabulary)`
			`return BPE_OFFSET + VOC_LENGTH + 1`
Added Special Encoder 2025-09-28 18:03:47 +02:00
Added a way to see vocabulary size 2025-10-04 19:42:29 +02:00			`@property`
			`def vocabulary_size(self) -> int:`
			`return len(self.vocabulary)`

Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`@property`
			`def vocabulary(self) -> dict[str, int]:`
			`return self.__vocabulary`
Added Special Encoder 2025-09-28 18:03:47 +02:00
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`@property`
			`def reverse_vocabulary(self) -> dict[int, str]:`
			`return self.__reverse_vocabulary`
Updated NanoSocratesSpecial to work with TokeNano 2025-10-03 00:59:15 +02:00
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`def add_special_word_to_vocabulary(self, word: str):`
			`CANDIDATE_INDEX = self.__next_id`
			`self.__vocabulary[word] = CANDIDATE_INDEX`
			`self.__reverse_vocabulary[CANDIDATE_INDEX] = word`
Added Special Encoder 2025-09-28 18:03:47 +02:00
			`def encode(self, word: str) -> list[int]:`
			`ID = self.__vocabulary.get(word)`

			`if ID is None:`
			`raise OutOfDictionaryException()`

			`return [ID]`

Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`def decode(self, token_id: list[int]) -> str:`
Added Special Encoder 2025-09-28 18:03:47 +02:00
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`if len(token_id) != 1:`
			`raise OutOfDictionaryException()`

			`ID = token_id[0]`
Added Special Encoder 2025-09-28 18:03:47 +02:00			`WORD = self.__reverse_vocabulary.get(ID)`

			`if WORD is None:`
			`raise OutOfDictionaryException()`

			`return WORD`