2025-09-28 18:03:47 +02:00
|
|
|
from .Encoder import Encoder
|
|
|
|
|
from ..Errors import OutOfDictionaryException
|
|
|
|
|
|
2025-10-03 13:26:58 +02:00
|
|
|
|
2025-09-28 18:03:47 +02:00
|
|
|
class NanoSocratesSpecial(Encoder):
    """Encoder that maps whole special-token strings to single integer IDs.

    IDs are handed out sequentially in registration order, starting at
    ``bpe_vocabulary_size + 1``. A forward map (token -> ID) and a reverse
    map (ID -> token) are kept in sync so that ``encode`` and ``decode``
    are exact inverses of each other.
    """

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: "list[str] | None" = None
    ) -> None:
        """Create the encoder and register the initial special tokens.

        Args:
            bpe_vocabulary_size: Offset added to every special-token ID; the
                first registered token receives ``bpe_vocabulary_size + 1``.
            special_tokens: Tokens to register, in order. ``None`` (the
                default) or an empty list registers nothing. Repeated tokens
                are registered only once, at their first position.
        """
        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        # `None` instead of a shared mutable `[]` default; `or ()` makes both
        # "not given" and "empty" a no-op loop.
        for token in special_tokens or ():
            self.__register(token)

    def __register(self, word: str) -> None:
        """Assign the next free ID to *word* unless it is already known."""
        # Skipping known words keeps the two maps a bijection. Without this
        # guard the forward map would be overwritten without growing, and
        # `__next_id` (which is derived from its length) would later hand
        # out an ID that is already in use.
        if word in self.__vocabulary:
            return

        new_id = self.__next_id
        self.__vocabulary[word] = new_id
        self.__reverse_vocabulary[new_id] = word

    @property
    def __next_id(self) -> int:
        """ID that the next newly registered token will receive."""
        return self.__bpe_offset + len(self.__vocabulary) + 1

    @property
    def vocabulary_size(self) -> int:
        """Number of special tokens currently registered."""
        return len(self.vocabulary)

    @property
    def vocabulary(self) -> dict[str, int]:
        """Forward map from token string to ID (the live dict, not a copy)."""
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        """Reverse map from ID to token string (the live dict, not a copy)."""
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str) -> None:
        """Register *word* as a special token with the next free ID.

        Adding a word that is already registered is a no-op, so its existing
        ID stays valid and no ID is ever assigned to two different words.
        """
        self.__register(word)

    def encode(self, word: str) -> list[int]:
        """Return the ID of *word* wrapped in a one-element list.

        Raises:
            OutOfDictionaryException: If *word* is not a registered token.
        """
        token_id = self.__vocabulary.get(word)
        if token_id is None:
            raise OutOfDictionaryException()

        return [token_id]

    def decode(self, token_id: list[int]) -> str:
        """Return the token string for a one-element list of IDs.

        Raises:
            OutOfDictionaryException: If *token_id* does not contain exactly
                one ID, or if that ID is not registered.
        """
        # A special token is always exactly one ID; anything else cannot be
        # decoded by this encoder.
        if len(token_id) != 1:
            raise OutOfDictionaryException()

        word = self.__reverse_vocabulary.get(token_id[0])
        if word is None:
            raise OutOfDictionaryException()

        return word