From b071145f6eff631bdc651182d7b93cf10f88d784 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:02:06 +0200 Subject: [PATCH] Added Chunker --- Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py index 6821151..a81587c 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py @@ -10,6 +10,10 @@ class NanoSocratesChunker: self.__special_token_regex: re.Pattern = special_token_regex self.__residual: str = "" + # max theoretical size of chars + # between special tokens: + # - min: size - len(longest_token) + # - MAX: size - len(shortest_token) def chunk(self, file_path: Path): # read_file FILE = open(file_path, "r", encoding="utf-8")