Added Chunker to restrict our domains

This commit is contained in:
Christian Risi 2025-09-26 18:50:23 +02:00
parent 9552d61f8d
commit 8db35732f9

View File

@ -0,0 +1,66 @@
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException
class NanoSocratesChunker:
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
self.__max_size: int = max_size
self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = ""
def chunk(self, file_path: Path):
# read_file
FILE = open(file_path, "r", encoding="utf-8")
exit = False
while not exit:
REMAINING_SIZE = self.__max_size - len(self.__residual)
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
FILE_CHUNK = FILE.read(READ_SIZE)
if len(FILE_CHUNK) == 0:
exit = True
continue
CHUNK = self.__append_residuals(FILE_CHUNK)
boundaries = self.__identify_boudaries(CHUNK)
if boundaries is None:
# boundaries not found in 2 chunks,
if len(CHUNK) > self.__max_size - 1:
raise DelimiterNotFoundException()
if exit:
yield CHUNK
self.__set_residual(0, CHUNK)
continue
start, end = boundaries
self.__set_residual(end, CHUNK)
yield CHUNK[start:end]
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
end = 0
for match in self.__special_token_regex.finditer(corpus):
# print(match)
end = match.end()
if end == 0:
return None
return (0, end)
def __append_residuals(self, corpus: str) -> str:
RESIDUAL = self.__residual
self.__residual = ""
return RESIDUAL + corpus
def __set_residual(self, index: int, corpus: str):
self.__residual = corpus[index:]