from typing import Optional

from .Encoder import Encoder
from ..Errors import OutOfDictionaryException


class NanoSocratesSpecial(Encoder):
    """Encoder mapping whole special-token strings to single integer IDs.

    IDs are handed out sequentially starting at ``bpe_vocabulary_size + 1``.
    Two dictionaries are kept in sync: word -> ID and ID -> word.

    NOTE(review): the offset presumably keeps these IDs clear of the ID range
    used by a separate BPE encoder -- confirm against the caller.
    """

    def __init__(
        self,
        bpe_vocabulary_size: int,
        special_tokens: Optional[list[str]] = None,
    ) -> None:
        """Create the encoder and register the initial special tokens.

        Args:
            bpe_vocabulary_size: Offset added to every ID assigned here; the
                first registered token receives ``bpe_vocabulary_size + 1``.
            special_tokens: Tokens to register, in order. ``None`` (the
                default) or an empty list registers nothing. Repeated tokens
                are registered once, keeping the ID of the first occurrence.
        """
        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        # ``None`` replaces the former mutable ``[]`` default; both mean
        # "no initial tokens".
        for token in special_tokens or ():
            self.__register(token)

    def __register(self, word: str) -> None:
        """Assign the next free ID to *word* unless it already has one."""
        # The next ID is derived from the vocabulary length, so re-assigning
        # an ID to a known word would leave the length unchanged and make the
        # following new word collide with it. Keeping the first ID avoids
        # that and keeps both maps a strict one-to-one mapping.
        if word in self.__vocabulary:
            return

        new_id = self.__next_id
        self.__vocabulary[word] = new_id
        self.__reverse_vocabulary[new_id] = word

    @property
    def __next_id(self) -> int:
        """ID the next new word will receive: offset + current size + 1."""
        return self.__bpe_offset + len(self.__vocabulary) + 1

    @property
    def vocabulary_size(self) -> int:
        """Number of registered special tokens (the offset is not counted)."""
        return len(self.vocabulary)

    @property
    def vocabulary(self) -> dict[str, int]:
        """The word -> ID mapping (the internal dict itself, not a copy)."""
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        """The ID -> word mapping (the internal dict itself, not a copy)."""
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str) -> None:
        """Register *word* as a special token with the next free ID.

        Adding a word that is already registered is a no-op: the word keeps
        its existing ID.
        """
        self.__register(word)

    def encode(self, word: str) -> list[int]:
        """Return the ID of *word* wrapped in a one-element list.

        Raises:
            OutOfDictionaryException: If *word* is not a registered token.
        """
        token = self.__vocabulary.get(word)
        if token is None:
            raise OutOfDictionaryException()

        return [token]

    def decode(self, token_id: list[int]) -> str:
        """Return the word for the single ID contained in *token_id*.

        Raises:
            OutOfDictionaryException: If *token_id* does not contain exactly
                one element, or if that ID is not registered.
        """
        # A special token is always exactly one ID, so any other length
        # cannot be decoded by this encoder.
        if len(token_id) != 1:
            raise OutOfDictionaryException()

        word = self.__reverse_vocabulary.get(token_id[0])
        if word is None:
            raise OutOfDictionaryException()

        return word