diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py new file mode 100644 index 0000000..6821151 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py @@ -0,0 +1,66 @@ +from pathlib import Path +import re +from ..Errors import DelimiterNotFoundException + + +class NanoSocratesChunker: + + def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None: + self.__max_size: int = max_size + self.__special_token_regex: re.Pattern = special_token_regex + self.__residual: str = "" + + def chunk(self, file_path: Path): + # read_file + FILE = open(file_path, "r", encoding="utf-8") + exit = False + + while not exit: + REMAINING_SIZE = self.__max_size - len(self.__residual) + READ_SIZE = min(self.__max_size, REMAINING_SIZE) + FILE_CHUNK = FILE.read(READ_SIZE) + + if len(FILE_CHUNK) == 0: + exit = True + continue + + CHUNK = self.__append_residuals(FILE_CHUNK) + + boundaries = self.__identify_boudaries(CHUNK) + + if boundaries is None: + + # boundaries not found in 2 chunks, + if len(CHUNK) > self.__max_size - 1: + raise DelimiterNotFoundException() + + if exit: + yield CHUNK + + self.__set_residual(0, CHUNK) + continue + + start, end = boundaries + self.__set_residual(end, CHUNK) + yield CHUNK[start:end] + + def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None: + + end = 0 + + for match in self.__special_token_regex.finditer(corpus): + # print(match) + end = match.end() + + if end == 0: + return None + + return (0, end) + + def __append_residuals(self, corpus: str) -> str: + RESIDUAL = self.__residual + self.__residual = "" + return RESIDUAL + corpus + + def __set_residual(self, index: int, corpus: str): + self.__residual = corpus[index:]