import re
from typing import Generator

from ..Enums import TokenType


class NanoSocratesSplitter:

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        # note: the regex must already be compiled by the caller
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType], None, None]:
        """Split a text using the given special-token regex.

        Args:
            corpus (str): the whole corpus string to split

        Yields:
            tuple[str, TokenType]: one piece of the split text at a time, as a
            (string, TokenType) pair. TokenType tells whether the piece is
            destined for the BPE or is a special token [BPE, SPECIAL].
        """
        bpe_start = 0

        for special_token_start, special_token_end in self.__find_boundaries(corpus):
            # FIND BPE: everything between the previous special token
            # and the current one belongs to the BPE
            bpe_end = special_token_start
            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]

            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]

            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # save the new BPE start point;
            # it will be used in the next iteration
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int], None, None]:
        """Find the start and end (exclusive) of each special token.

        Args:
            corpus (str): the string in which the special tokens are searched

        Yields:
            tuple[int, int]: (start, end) indices; note the end is exclusive
        """
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())

        # emit one last, empty boundary at the end of the corpus so that
        # split_text also flushes the BPE text after the final special token
        eof = len(corpus)
        yield (eof, eof)

    def __split_words(self, bpe_piece: str) -> Generator[str, None, None]:
        END_OF_STRING = len(bpe_piece)

        bound_start = 0

        for i in range(END_OF_STRING):
            CANDIDATE_CHAR = bpe_piece[i]

            if CANDIDATE_CHAR != " ":
                continue

            # split before the space, so the space stays attached
            # to the word that follows it
            bound_end = i

            if bound_end > bound_start:  # skip the empty piece a leading space creates
                yield bpe_piece[bound_start:bound_end]

            bound_start = bound_end

        if bound_start < END_OF_STRING:
            yield bpe_piece[bound_start:]

    def split_tokens(
        self, corpus: list[int]
    ) -> Generator[tuple[list[int], TokenType], None, None]:
        not_special_token_list: list[int] = []

        for token in corpus:
            if token > self.__max_bpe_token_id:
                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)

        # flush any trailing BPE tokens after the last special token
        if len(not_special_token_list) > 0:
            yield (not_special_token_list, TokenType.BPE)
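
# --- Usage sketch (illustrative) -------------------------------------------
# A minimal sketch of how the splitter might be driven, assuming special
# tokens are literal strings such as the hypothetical "<|eot|>" below; the
# regex must be compiled by the caller, and TokenType is the Enum imported
# above with BPE and SPECIAL members:
#
#     special = re.compile(r"<\|eot\|>")
#     splitter = NanoSocratesSplitter(special, max_bpe_token_id=255)
#
#     list(splitter.split_text("hello world<|eot|>bye"))
#     # -> [("hello", BPE), (" world", BPE), ("<|eot|>", SPECIAL), ("bye", BPE)]
#
#     list(splitter.split_tokens([72, 101, 300, 50]))
#     # -> [([72, 101], BPE), ([300], SPECIAL), ([50], BPE)]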