67 lines
1.8 KiB
Python
67 lines
1.8 KiB
Python
from pathlib import Path
|
|
import re
|
|
from ..Errors import DelimiterNotFoundException
|
|
|
|
|
|
class NanoSocratesChunker:
|
|
|
|
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
|
|
self.__max_size: int = max_size
|
|
self.__special_token_regex: re.Pattern = special_token_regex
|
|
self.__residual: str = ""
|
|
|
|
def chunk(self, file_path: Path):
|
|
# read_file
|
|
FILE = open(file_path, "r", encoding="utf-8")
|
|
exit = False
|
|
|
|
while not exit:
|
|
REMAINING_SIZE = self.__max_size - len(self.__residual)
|
|
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
|
|
FILE_CHUNK = FILE.read(READ_SIZE)
|
|
|
|
if len(FILE_CHUNK) == 0:
|
|
exit = True
|
|
continue
|
|
|
|
CHUNK = self.__append_residuals(FILE_CHUNK)
|
|
|
|
boundaries = self.__identify_boudaries(CHUNK)
|
|
|
|
if boundaries is None:
|
|
|
|
# boundaries not found in 2 chunks,
|
|
if len(CHUNK) > self.__max_size - 1:
|
|
raise DelimiterNotFoundException()
|
|
|
|
if exit:
|
|
yield CHUNK
|
|
|
|
self.__set_residual(0, CHUNK)
|
|
continue
|
|
|
|
start, end = boundaries
|
|
self.__set_residual(end, CHUNK)
|
|
yield CHUNK[start:end]
|
|
|
|
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
|
|
|
|
end = 0
|
|
|
|
for match in self.__special_token_regex.finditer(corpus):
|
|
# print(match)
|
|
end = match.end()
|
|
|
|
if end == 0:
|
|
return None
|
|
|
|
return (0, end)
|
|
|
|
def __append_residuals(self, corpus: str) -> str:
|
|
RESIDUAL = self.__residual
|
|
self.__residual = ""
|
|
return RESIDUAL + corpus
|
|
|
|
def __set_residual(self, index: int, corpus: str):
|
|
self.__residual = corpus[index:]
|