Added Chunker

This commit is contained in:
Christian Risi 2025-09-28 18:02:06 +02:00
parent ed0255e99b
commit b071145f6e

View File

@ -10,6 +10,10 @@ class NanoSocratesChunker:
self.__special_token_regex: re.Pattern = special_token_regex self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = "" self.__residual: str = ""
# max theoretical size of chars
# between special tokens:
# - min: size - len(longest_token)
# - MAX: size - len(shortest_token)
def chunk(self, file_path: Path): def chunk(self, file_path: Path):
# read_file # read_file
FILE = open(file_path, "r", encoding="utf-8") FILE = open(file_path, "r", encoding="utf-8")