from collections.abc import Iterator
from pathlib import Path
import re

from ..Errors import DelimiterNotFoundException


class NanoSocratesChunker:
    """Split a text file into pieces that end on a special-token match.

    The file is read incrementally. Each yielded piece starts at the
    beginning of the current buffer and stops at the end of the last
    match of ``special_token_regex`` found in that buffer. Whatever follows
    the last match is kept as a residual and prepended to the next read.
    """

    def __init__(self, max_size: int, special_token_regex: re.Pattern[str]) -> None:
        """Store the chunking parameters.

        Args:
            max_size: Upper bound, in characters, for residual + newly read
                text examined in one step.
            special_token_regex: Compiled pattern whose matches mark the
                positions where a chunk is allowed to end.
        """
        self.__max_size: int = max_size
        self.__special_token_regex: re.Pattern[str] = special_token_regex
        # Text read from the file but not yet yielded (tail after last match).
        self.__residual: str = ""

    # max theoretical size of chars
    # between special tokens:
    # - min: size - len(longest_token)
    # - MAX: size - len(shortest_token)
    def chunk(self, file_path: Path) -> Iterator[str]:
        """Lazily yield chunks of ``file_path`` that end on a special token.

        Args:
            file_path: Path of the UTF-8 text file to read.

        Yields:
            Substrings of the file, each running from the start of the
            current buffer to the end of the last special-token match in it.

        Raises:
            DelimiterNotFoundException: If a buffer of at least ``max_size``
                characters contains no usable special-token match.
        """
        # The context manager closes the file on exhaustion, on error, and
        # when the generator itself is closed before being fully consumed.
        with open(file_path, "r", encoding="utf-8") as file:
            while True:
                # Only read what still fits next to the carried-over residual.
                read_size = self.__max_size - len(self.__residual)
                file_chunk = file.read(read_size)

                if not file_chunk:
                    # End of file reached.
                    # NOTE(review): any text left in the residual (i.e. text
                    # after the last special token) is NOT yielded and stays
                    # stored on the instance, so a later chunk() call will
                    # see it prepended to its first read. Confirm whether a
                    # final flush is wanted here.
                    break

                buffer = self.__append_residuals(file_chunk)
                boundaries = self.__identify_boundaries(buffer)

                if boundaries is None:
                    # A full-size buffer without any delimiter can never be
                    # split, reading more would exceed max_size.
                    if len(buffer) >= self.__max_size:
                        raise DelimiterNotFoundException()

                    # Buffer is still short: keep all of it and read more.
                    self.__set_residual(0, buffer)
                    continue

                start, end = boundaries
                self.__set_residual(end, buffer)
                yield buffer[start:end]

    def __identify_boundaries(self, corpus: str) -> tuple[int, int] | None:
        """Return ``(0, end_of_last_match)`` for ``corpus``.

        Returns ``None`` when the pattern has no match, or when the last
        match ends at index 0.
        """
        last_end = 0
        for match in self.__special_token_regex.finditer(corpus):
            last_end = match.end()

        if last_end == 0:
            return None

        return (0, last_end)

    def __append_residuals(self, corpus: str) -> str:
        """Return the stored residual followed by ``corpus`` and clear the residual."""
        residual = self.__residual
        self.__residual = ""
        return residual + corpus

    def __set_residual(self, index: int, corpus: str) -> None:
        """Store ``corpus[index:]`` as the residual for the next read."""
        self.__residual = corpus[index:]