Added Chunker to restrict our domains
This commit is contained in:
parent
9552d61f8d
commit
8db35732f9
66
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
Normal file
66
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
Normal file
@ -0,0 +1,66 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
from ..Errors import DelimiterNotFoundException
|
||||||
|
|
||||||
|
|
||||||
|
class NanoSocratesChunker:
|
||||||
|
|
||||||
|
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
|
||||||
|
self.__max_size: int = max_size
|
||||||
|
self.__special_token_regex: re.Pattern = special_token_regex
|
||||||
|
self.__residual: str = ""
|
||||||
|
|
||||||
|
def chunk(self, file_path: Path):
|
||||||
|
# read_file
|
||||||
|
FILE = open(file_path, "r", encoding="utf-8")
|
||||||
|
exit = False
|
||||||
|
|
||||||
|
while not exit:
|
||||||
|
REMAINING_SIZE = self.__max_size - len(self.__residual)
|
||||||
|
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
|
||||||
|
FILE_CHUNK = FILE.read(READ_SIZE)
|
||||||
|
|
||||||
|
if len(FILE_CHUNK) == 0:
|
||||||
|
exit = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
CHUNK = self.__append_residuals(FILE_CHUNK)
|
||||||
|
|
||||||
|
boundaries = self.__identify_boudaries(CHUNK)
|
||||||
|
|
||||||
|
if boundaries is None:
|
||||||
|
|
||||||
|
# boundaries not found in 2 chunks,
|
||||||
|
if len(CHUNK) > self.__max_size - 1:
|
||||||
|
raise DelimiterNotFoundException()
|
||||||
|
|
||||||
|
if exit:
|
||||||
|
yield CHUNK
|
||||||
|
|
||||||
|
self.__set_residual(0, CHUNK)
|
||||||
|
continue
|
||||||
|
|
||||||
|
start, end = boundaries
|
||||||
|
self.__set_residual(end, CHUNK)
|
||||||
|
yield CHUNK[start:end]
|
||||||
|
|
||||||
|
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
|
||||||
|
|
||||||
|
end = 0
|
||||||
|
|
||||||
|
for match in self.__special_token_regex.finditer(corpus):
|
||||||
|
# print(match)
|
||||||
|
end = match.end()
|
||||||
|
|
||||||
|
if end == 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return (0, end)
|
||||||
|
|
||||||
|
def __append_residuals(self, corpus: str) -> str:
|
||||||
|
RESIDUAL = self.__residual
|
||||||
|
self.__residual = ""
|
||||||
|
return RESIDUAL + corpus
|
||||||
|
|
||||||
|
def __set_residual(self, index: int, corpus: str):
|
||||||
|
self.__residual = corpus[index:]
|
||||||
Loading…
x
Reference in New Issue
Block a user