NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py

71 lines
2.0 KiB
Python
Raw Normal View History

2025-09-26 18:50:23 +02:00
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException
class NanoSocratesChunker:
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
self.__max_size: int = max_size
self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = ""
2025-09-28 18:02:06 +02:00
# max theorethical size of chars
# between special tokens:
# - min: size - len(longest_token)
# - MAX: size - len(shortest_token)
2025-09-26 18:50:23 +02:00
def chunk(self, file_path: Path):
# read_file
FILE = open(file_path, "r", encoding="utf-8")
exit = False
while not exit:
REMAINING_SIZE = self.__max_size - len(self.__residual)
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
FILE_CHUNK = FILE.read(READ_SIZE)
if len(FILE_CHUNK) == 0:
exit = True
continue
CHUNK = self.__append_residuals(FILE_CHUNK)
boundaries = self.__identify_boudaries(CHUNK)
if boundaries is None:
# boundaries not found in 2 chunks,
if len(CHUNK) > self.__max_size - 1:
raise DelimiterNotFoundException()
if exit:
yield CHUNK
self.__set_residual(0, CHUNK)
continue
start, end = boundaries
self.__set_residual(end, CHUNK)
yield CHUNK[start:end]
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
end = 0
for match in self.__special_token_regex.finditer(corpus):
# print(match)
end = match.end()
if end == 0:
return None
return (0, end)
def __append_residuals(self, corpus: str) -> str:
RESIDUAL = self.__residual
self.__residual = ""
return RESIDUAL + corpus
def __set_residual(self, index: int, corpus: str):
self.__residual = corpus[index:]