Fix bugs and semantics
This commit is contained in:
@@ -1,47 +1,46 @@
|
||||
from .Encoder import Encoder
|
||||
from ..Errors import OutOfDictionaryException
|
||||
|
||||
|
||||
class NanoSocratesSpecial(Encoder):
|
||||
|
||||
def __init__(
    self, bpe_vocabulary_size: int, special_tokens: list[str] | None = None
) -> None:
    """Create the special-token vocabulary, offset by ``bpe_vocabulary_size``.

    Special tokens receive consecutive ids; the first registered token gets
    ``bpe_vocabulary_size + 1``.

    Args:
        bpe_vocabulary_size: Offset added to every special-token id.
        special_tokens: Tokens to register, in order. ``None`` or an empty
            list leaves the vocabulary empty. A token that appears more
            than once is registered only once, at its first position.
    """
    super().__init__()

    self.__bpe_offset = bpe_vocabulary_size
    self.__vocabulary: dict[str, int] = {}
    self.__reverse_vocabulary: dict[int, str] = {}

    # ``None`` replaces the former ``[]`` default, which was a single list
    # object shared by every call.
    for token in special_tokens or ():
        if token in self.__vocabulary:
            # Skipping duplicates keeps the ids dense. The id handed to the
            # next added word is offset + len(vocabulary) + 1, so a gap
            # left by a duplicate would later produce an id collision.
            continue
        candidate_id = self.__bpe_offset + len(self.__vocabulary) + 1
        self.__vocabulary[token] = candidate_id
        self.__reverse_vocabulary[candidate_id] = token
|
||||
|
||||
@property
def __next_id(self) -> int:
    """Return the id that the next newly added special word will receive.

    Computed as the BPE offset plus the number of words currently in the
    vocabulary plus one.
    """
    return self.__bpe_offset + len(self.__vocabulary) + 1
|
||||
|
||||
def build_reverse_vocabulary(self) -> None:
    """Rebuild the id -> word mapping from the current word -> id mapping."""
    self.__reverse_vocabulary = {
        token_id: word for word, token_id in self.__vocabulary.items()
    }
|
||||
@property
def vocabulary(self) -> dict[str, int]:
    """Return the word -> id mapping (the live dict, not a copy)."""
    return self.__vocabulary
|
||||
|
||||
# @property
|
||||
# def vocabulary_size(self):
|
||||
# return self.__current_index
|
||||
@property
def reverse_vocabulary(self) -> dict[int, str]:
    """Return the id -> word mapping (the live dict, not a copy)."""
    return self.__reverse_vocabulary
|
||||
|
||||
def set_vocabulary_index(self, vocabulary_index: int) -> None:
    """Store ``vocabulary_index`` as the instance's vocabulary index."""
    self.__vocabulary_index = vocabulary_index
|
||||
|
||||
def add_special_word_to_vocabulary(self, word: str) -> None:
    """Register ``word`` under the next free special-token id.

    Adding a word that is already registered is a no-op. Re-assigning it
    would leave a stale entry in the reverse vocabulary, and because the
    vocabulary length would not grow, the next new word would be given the
    very same id.

    Args:
        word: Special word to add to both the forward and reverse mapping.
    """
    if word in self.__vocabulary:
        return

    candidate_index = self.__next_id
    self.__vocabulary[word] = candidate_index
    self.__reverse_vocabulary[candidate_index] = word
|
||||
|
||||
def encode(self, word: str) -> list[int]:
|
||||
ID = self.__vocabulary.get(word)
|
||||
@@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder):
|
||||
|
||||
return [ID]
|
||||
|
||||
def decode(self, token_id: list[int]) -> str:
    """Translate a one-element id list back into its special word.

    Args:
        token_id: List expected to hold exactly one special-token id.

    Returns:
        The word stored under that id in the reverse vocabulary.

    Raises:
        OutOfDictionaryException: If ``token_id`` does not hold exactly one
            id, or if that id is not present in the reverse vocabulary.
    """
    if len(token_id) != 1:
        raise OutOfDictionaryException()

    word = self.__reverse_vocabulary.get(token_id[0])
    if word is None:
        raise OutOfDictionaryException()

    return word
|
||||
|
||||
def get_reverse_vocabulary(self) -> dict[int, str]:
    """Return the id -> word mapping (the live dict, not a copy)."""
    return self.__reverse_vocabulary
|
||||
|
||||
Reference in New Issue
Block a user