NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
2025-10-03 00:59:15 +02:00

66 lines
1.7 KiB
Python

from .Encoder import Encoder
from ..Errors import OutOfDictionaryException
class NanoSocratesSpecial(Encoder):
    """Encoder that maps whole words to single integer IDs and back.

    The forward mapping (word -> ID) and the reverse mapping (ID -> word)
    are stored separately. New words receive IDs from a running counter,
    ``vocabulary_index``, which is incremented *before* each assignment.
    """

    def __init__(
        self,
        vocabulary_index: int | None,
        vocabulary: dict[str, int] | None = None,
    ) -> None:
        """Create the encoder.

        Args:
            vocabulary_index: Last ID already in use; the next word added
                gets ``vocabulary_index + 1``. ``None`` is treated as 0.
            vocabulary: Optional existing word -> ID mapping. It is stored
                by reference (not copied), so words added later are also
                written into the dict the caller passed in.
        """
        super().__init__()

        self.__vocabulary: dict[str, int] = (
            {} if vocabulary is None else vocabulary
        )

        # Build the reverse map right away so decode() works for a
        # pre-populated vocabulary without requiring an explicit call to
        # build_reverse_vocabulary().
        self.__reverse_vocabulary: dict[int, str] = {
            token_id: word for word, token_id in self.__vocabulary.items()
        }

        self.__vocabulary_index: int = (
            0 if vocabulary_index is None else vocabulary_index
        )

    def build_reverse_vocabulary(self):
        """Rebuild the ID -> word mapping from the current vocabulary."""
        self.__reverse_vocabulary = {
            token_id: word for word, token_id in self.__vocabulary.items()
        }

    def set_vocabulary_index(self, vocabulary_index: int):
        """Overwrite the running ID counter used for newly added words."""
        self.__vocabulary_index = vocabulary_index

    def add_special_word_to_vocabulary(self, word: str):
        """Register ``word`` under the next free ID.

        The counter is advanced first, so the word receives
        ``previous_index + 1``. Both mappings are updated. If ``word`` is
        already present it is re-bound to the new ID; its previous ID is
        left in the reverse mapping and still decodes to the same word.
        """
        self.__vocabulary_index += 1
        new_id = self.__vocabulary_index
        self.__vocabulary[word] = new_id
        self.__reverse_vocabulary[new_id] = word

    def encode(self, word: str) -> list[int]:
        """Return the ID of ``word`` wrapped in a one-element list.

        Raises:
            OutOfDictionaryException: If ``word`` is not in the vocabulary.
        """
        token_id = self.__vocabulary.get(word)
        if token_id is None:
            raise OutOfDictionaryException()
        return [token_id]

    def decode(self, token_id: int) -> str:
        """Return the word registered under ``token_id``.

        Raises:
            OutOfDictionaryException: If ``token_id`` is not in the reverse
                vocabulary.
        """
        word = self.__reverse_vocabulary.get(token_id)
        if word is None:
            raise OutOfDictionaryException()
        return word

    def get_reverse_vocabulary(self) -> dict[int, str]:
        """Return the internal ID -> word mapping (the live dict, not a copy)."""
        return self.__reverse_vocabulary