NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py

from .Encoder import Encoder
from ..Errors import OutOfDictionaryException

class NanoSocratesSpecial(Encoder):
    """Encoder backed by an explicit word -> token-id lookup table.

    Each known word maps to exactly one integer id. A reverse table
    (id -> word) is kept alongside the forward one so that ``decode`` can
    invert ``encode``. Words or ids that are not in the tables raise
    ``OutOfDictionaryException``.
    """

    def __init__(
        self,
        vocabulary_index: int | None,
        vocabulary: dict[str, int] | None = None
        ) -> None:
        """Create the encoder.

        Args:
            vocabulary_index: Last id considered "in use". The next word
                added through ``add_special_word_to_vocabulary`` receives
                ``vocabulary_index + 1``. ``None`` is treated as ``0``.
            vocabulary: Optional pre-built word -> id mapping. The dict is
                stored by reference (not copied), so words added later are
                also written into the caller's dict.
        """

        super().__init__()

        self.__vocabulary: dict[str, int] = (
            {} if vocabulary is None else vocabulary
        )
        self.__vocabulary_index: int = (
            0 if vocabulary_index is None else vocabulary_index
        )

        # Derive the id -> word table immediately so that decode() works for
        # a vocabulary supplied at construction time, without requiring the
        # caller to remember a separate build_reverse_vocabulary() call.
        self.__reverse_vocabulary: dict[int, str] = {}
        self.build_reverse_vocabulary()

    def build_reverse_vocabulary(self):
        """Rebuild the id -> word table from the current word -> id table."""
        self.__reverse_vocabulary = {
            token_id: word for word, token_id in self.__vocabulary.items()
        }

    def set_vocabulary_index(self, vocabulary_index: int):
        """Overwrite the last-used id; the next added word gets this + 1."""
        self.__vocabulary_index = vocabulary_index

    def add_special_word_to_vocabulary(self, word: str):
        """Register ``word`` under the next free id in both lookup tables.

        The internal index is incremented first, so the word is stored under
        ``previous_index + 1``.
        """
        self.__vocabulary_index += 1
        new_id = self.__vocabulary_index
        self.__vocabulary[word] = new_id
        self.__reverse_vocabulary[new_id] = word

    def encode(self, word: str) -> list[int]:
        """Return the id of ``word`` wrapped in a single-element list.

        Raises:
            OutOfDictionaryException: If ``word`` is not in the vocabulary.
        """
        token_id = self.__vocabulary.get(word)

        if token_id is None:
            raise OutOfDictionaryException()

        return [token_id]

    def decode(self, token_id: int) -> str:
        """Return the word registered under ``token_id``.

        Raises:
            OutOfDictionaryException: If ``token_id`` is not in the reverse
                vocabulary.
        """
        word = self.__reverse_vocabulary.get(token_id)

        if word is None:
            raise OutOfDictionaryException()

        return word

    def get_reverse_vocabulary(self) -> dict[int, str]:
        """Return the internal id -> word table (the live dict, not a copy)."""
        return self.__reverse_vocabulary