diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py index 6821151..a81587c 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py @@ -10,6 +10,10 @@ class NanoSocratesChunker: self.__special_token_regex: re.Pattern = special_token_regex self.__residual: str = "" + # max theoretical size of chars + # between special tokens: + # - min: size - len(longest_token) + # - MAX: size - len(shortest_token) def chunk(self, file_path: Path): # read_file FILE = open(file_path, "r", encoding="utf-8")