55 lines
1.3 KiB
Python
55 lines
1.3 KiB
Python
from .Encoder import Encoder
|
|
from ..Errors import OutOfDictionaryException
|
|
|
|
class NanoSocratesSpecial(Encoder):
    """Encoder that maps whole special words to single integer ids.

    Ids are handed out sequentially, starting at 0, in the order words are
    registered. A reverse table is maintained alongside the forward one so
    that an id can be decoded back to its word.
    """

    def __init__(
        self,
        initial_vocabulary: list[str] | None = None
    ) -> None:
        """Create the encoder, optionally pre-registering some words.

        Args:
            initial_vocabulary: Words to register, in order. ``None``
                leaves the vocabulary empty.
        """
        super().__init__()

        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}
        self.__current_index = 0

        if initial_vocabulary is None:
            return

        # Go through the single registration path so the constructor and
        # add_special_word can never drift apart.
        for word in initial_vocabulary:
            self.add_special_word(word)

    @property
    def vocabulary_size(self) -> int:
        """Number of ids assigned so far (also the next id to be assigned)."""
        return self.__current_index

    def add_special_word(self, word: str) -> None:
        """Register ``word`` under the next free id.

        Registering a word that is already known is a no-op. Assigning it a
        second id would leave the old id in the reverse table, making the
        forward and reverse tables disagree and inflating the vocabulary size.

        Args:
            word: The special word to register.
        """
        if word in self.__vocabulary:
            return

        new_id = self.__current_index
        self.__vocabulary[word] = new_id
        self.__reverse_vocabulary[new_id] = word
        self.__current_index = new_id + 1

    def encode(self, word: str) -> list[int]:
        """Return the id of ``word`` wrapped in a one-element list.

        Args:
            word: A previously registered special word.

        Returns:
            A list containing exactly the id assigned to ``word``.

        Raises:
            OutOfDictionaryException: If ``word`` was never registered.
        """
        token_id = self.__vocabulary.get(word)

        if token_id is None:
            raise OutOfDictionaryException()

        return [token_id]

    def decode(self, token_id: int) -> str:
        """Return the word registered under ``token_id``.

        Args:
            token_id: An id previously assigned by this encoder.

        Returns:
            The word associated with ``token_id``.

        Raises:
            OutOfDictionaryException: If no word is registered under
                ``token_id``.
        """
        word = self.__reverse_vocabulary.get(token_id)

        if word is None:
            raise OutOfDictionaryException()

        return word
|
|
|