from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType


class TokeNanoCore:
    """Tokenizer core that combines the splitter, the BPE encoder and the
    special-token encoder behind a single encode/decode interface."""

    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
        # special_vocabulary: dict[str, int]
    ):
        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)

        SPECIAL_REGEX = special_regex_maker(special_token_list)
        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size

        # Both the splitter and the special encoder receive the BPE vocabulary
        # size so that special-token ids do not collide with BPE ids.
        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
        self.__special_encoder = NanoSocratesSpecial(
            BPE_VOCABULARY_SIZE, special_token_list
        )

    def encode(self, corpus: str) -> list[int]:
        output: list[int] = []

        for piece, token_type in self.__splitter.split_text(corpus):

            if token_type == TokenType.SPECIAL:
                ENCODED_PIECE = self.__special_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

        return output

    def decode(self, corpus: list[int]) -> str:
        output_str = ""

        for token, token_type in self.__splitter.split_tokens(corpus):
            # token is an integer if special, a list of integers otherwise

            if token_type == TokenType.SPECIAL:
                output_str += self.__special_encoder.decode(token)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                output_str += self.__bpe_encoder.decode(token)
                continue

        return output_str
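

# Minimal usage sketch (not part of the original module): the merge table and
# special tokens below are illustrative assumptions, not values shipped with
# the package, and running this file directly still requires the package
# context implied by the relative imports above.
if __name__ == "__main__":
    # Hypothetical merge table: byte pair (104, 105), i.e. "h" + "i", merges
    # into new token id 256.
    EXAMPLE_BPE_VOCABULARY = {(104, 105): 256}
    # Hypothetical special tokens; their ids are expected to be assigned
    # after the BPE id range.
    EXAMPLE_SPECIAL_TOKENS = ["<PAD>", "<EOS>"]

    tokenizer = TokeNanoCore(EXAMPLE_BPE_VOCABULARY, EXAMPLE_SPECIAL_TOKENS)

    token_ids = tokenizer.encode("hi<EOS>")
    print(token_ids)                    # e.g. the merged id plus the <EOS> id, under these assumptions
    print(tokenizer.decode(token_ids))  # expected to round-trip back to "hi<EOS>"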