import re
from typing import Generator

from ..Enums import TokenType


class NanoSocratesSplitter:
    """Split a corpus into alternating BPE-text and special-token segments.

    Special tokens are located with a pre-compiled regex; everything between
    (and around) the matches is emitted as plain BPE text.
    """

    def __init__(self, special_token_regex: re.Pattern) -> None:
        # Compiled pattern that matches every special token in the corpus.
        self.__special_token_regex = special_token_regex

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType], None, None]:
        """Yield ``(text, TokenType)`` pairs covering *corpus* in order.

        Text between special-token matches is tagged ``TokenType.BPE``;
        each special-token match itself is tagged ``TokenType.SPECIAL``.
        Empty segments are skipped, so adjacent special tokens do not
        produce empty BPE entries.
        """
        bpe_start = 0
        for bound_start, bound_end in self.__find_boundaries(corpus):
            # Plain text sitting between the previous boundary and this match.
            bpe_text = corpus[bpe_start:bound_start]
            if bpe_text != "":
                yield (bpe_text, TokenType.BPE)
            bpe_start = bound_end
            special_text = corpus[bound_start:bound_end]
            if special_text != "":
                yield (special_text, TokenType.SPECIAL)
        # BUG FIX: emit the trailing BPE segment after the last special token
        # (or the entire corpus when it contains no special tokens at all).
        # The original version dropped this tail, yielding nothing for a
        # corpus with no special-token matches.
        tail = corpus[bpe_start:]
        if tail != "":
            yield (tail, TokenType.BPE)

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int], None, None]:
        """Yield ``(start, end)`` index pairs for each special-token match, left to right."""
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())