Added Chunker to restrict our domains
This commit is contained in:
parent
9552d61f8d
commit
8db35732f9
66
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
Normal file
66
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
Normal file
@ -0,0 +1,66 @@
|
||||
from pathlib import Path
|
||||
import re
|
||||
from ..Errors import DelimiterNotFoundException
|
||||
|
||||
|
||||
class NanoSocratesChunker:
|
||||
|
||||
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
|
||||
self.__max_size: int = max_size
|
||||
self.__special_token_regex: re.Pattern = special_token_regex
|
||||
self.__residual: str = ""
|
||||
|
||||
def chunk(self, file_path: Path):
|
||||
# read_file
|
||||
FILE = open(file_path, "r", encoding="utf-8")
|
||||
exit = False
|
||||
|
||||
while not exit:
|
||||
REMAINING_SIZE = self.__max_size - len(self.__residual)
|
||||
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
|
||||
FILE_CHUNK = FILE.read(READ_SIZE)
|
||||
|
||||
if len(FILE_CHUNK) == 0:
|
||||
exit = True
|
||||
continue
|
||||
|
||||
CHUNK = self.__append_residuals(FILE_CHUNK)
|
||||
|
||||
boundaries = self.__identify_boudaries(CHUNK)
|
||||
|
||||
if boundaries is None:
|
||||
|
||||
# boundaries not found in 2 chunks,
|
||||
if len(CHUNK) > self.__max_size - 1:
|
||||
raise DelimiterNotFoundException()
|
||||
|
||||
if exit:
|
||||
yield CHUNK
|
||||
|
||||
self.__set_residual(0, CHUNK)
|
||||
continue
|
||||
|
||||
start, end = boundaries
|
||||
self.__set_residual(end, CHUNK)
|
||||
yield CHUNK[start:end]
|
||||
|
||||
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
|
||||
|
||||
end = 0
|
||||
|
||||
for match in self.__special_token_regex.finditer(corpus):
|
||||
# print(match)
|
||||
end = match.end()
|
||||
|
||||
if end == 0:
|
||||
return None
|
||||
|
||||
return (0, end)
|
||||
|
||||
def __append_residuals(self, corpus: str) -> str:
|
||||
RESIDUAL = self.__residual
|
||||
self.__residual = ""
|
||||
return RESIDUAL + corpus
|
||||
|
||||
def __set_residual(self, index: int, corpus: str):
|
||||
self.__residual = corpus[index:]
|
||||
Loading…
x
Reference in New Issue
Block a user