from pathlib import Path

from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType

class TokeNanoCore:
    """Facade tokenizer that combines a BPE encoder with a special-token encoder.

    Text is first split by a splitter into pieces tagged either SPECIAL or
    BPE; each piece is then delegated to the matching encoder. Special-token
    ids are offset past the BPE vocabulary (the splitter and the special
    encoder are both constructed with the BPE vocabulary size).
    """

    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
    ):
        """Build the composed tokenizer.

        Args:
            bpe_vocabulary: merge table mapping a pair of token ids to the
                merged token id, consumed by ``NanoSocratesBPE``.
            special_token_list: literal special-token strings; used both to
                build the splitting regex and the special-token encoder.
        """
        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)

        SPECIAL_REGEX = special_regex_maker(special_token_list)
        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size

        # Both the splitter and the special encoder need the BPE vocabulary
        # size so special-token ids live above the BPE id range.
        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
        self.__special_encoder = NanoSocratesSpecial(
            BPE_VOCABULARY_SIZE, special_token_list
        )

    @property
    def vocabulary_size(self) -> int:
        """Total vocabulary size: BPE tokens plus special tokens."""
        return (
            self.__bpe_encoder.vocabulary_size
            + self.__special_encoder.vocabulary_size
        )

    def encode(self, corpus: str) -> list[int]:
        """Encode *corpus* into a flat list of token ids.

        Each piece produced by the splitter is routed to the encoder that
        matches its ``TokenType`` tag.
        """
        output: list[int] = []
        for piece, token_type in self.__splitter.split_text(corpus):
            if token_type == TokenType.SPECIAL:
                output.extend(self.__special_encoder.encode(piece))
            elif token_type == TokenType.BPE:
                # The two tags are mutually exclusive, so elif is safe.
                output.extend(self.__bpe_encoder.encode(piece))
        return output

    def decode(self, corpus: list[int]) -> str:
        """Decode a list of token ids back into text.

        The splitter groups ids by type: ``token`` is a single int for
        SPECIAL pieces and a list of ints for BPE pieces, matching what the
        respective decoder accepts.
        """
        # Collect fragments and join once — avoids quadratic str +=.
        parts: list[str] = []
        for token, token_type in self.__splitter.split_tokens(corpus):
            if token_type == TokenType.SPECIAL:
                parts.append(self.__special_encoder.decode(token))
            elif token_type == TokenType.BPE:
                parts.append(self.__bpe_encoder.decode(token))
        return "".join(parts)
|