"""Round-trip tests for ``BPE.NanoSocratesChunker``.

Each test feeds a fixture file through the chunker and checks that the
yielded chunks, concatenated in order, reproduce the file's text exactly.
Running the module as a script performs the same check on a stress fixture
while printing every chunk.
"""

from pathlib import Path
import re
import sys

import pytest

import Project_Model.Libs.BPE as BPE

PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)


def _collect_chunks(file_path, size, show=print):
    """Run the chunker over *file_path* and return the chunks as a list.

    Args:
        file_path: Path of the file handed to ``NanoSocratesChunker.chunk``.
        size: First positional argument of ``NanoSocratesChunker`` --
            presumably a chunk-size limit; TODO confirm against BPE.
        show: Callable invoked with every chunk as it is produced
            (defaults to ``print``).

    Returns:
        The chunks in the order the chunker yielded them.
    """
    chunker = BPE.NanoSocratesChunker(size, SYMBOL_REGEX)
    chunks = []
    for chunk in chunker.chunk(file_path):
        show(chunk)
        chunks.append(chunk)
    return chunks


def _assert_lossless(original_text, chunks, least_expected_chunks):
    """Assert there are enough chunks and that joining them is lossless."""
    assert len(chunks) >= least_expected_chunks
    assert "".join(chunks) == original_text


class TestChunker:
    """Tests of ``NanoSocratesChunker`` against the fixture files."""

    def test_correct_simple(self):
        """The simple fixture splits into >= 3 chunks that rejoin exactly."""
        file_path = Path("Project_Model/Tests/chunker_files/simple.txt")
        original_text = file_path.read_text(encoding="utf-8")

        chunks = _collect_chunks(file_path, 40)

        _assert_lossless(original_text, chunks, 3)

    def test_correct_edge_1(self):
        """The edge-1 fixture splits into >= 3 chunks that rejoin exactly."""
        file_path = Path("Project_Model/Tests/chunker_files/edge-1.txt")
        original_text = file_path.read_text(encoding="utf-8")

        chunks = _collect_chunks(file_path, 15)

        _assert_lossless(original_text, chunks, 3)

    def test_throwing(self):
        """Chunking the simple fixture with size 5 must raise.

        The expected exception is ``BPE.DelimiterNotFoundException``.
        """
        file_path = Path("Project_Model/Tests/chunker_files/simple.txt")
        chunker = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)

        with pytest.raises(BPE.DelimiterNotFoundException):
            for chunk in chunker.chunk(file_path):
                print(chunk)


def _show_stress_chunk(chunk):
    """Print one chunk, framed, together with its length."""
    print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")


def _main():
    """Chunk the stress fixture, printing each chunk, then verify round-trip."""
    file_path = Path("Project_Model/Tests/chunker_files/stress.txt")
    original_text = file_path.read_text(encoding="utf-8")

    try:
        chunks = _collect_chunks(file_path, 40, show=_show_stress_chunk)
    except Exception as err:
        # Best-effort run: a chunking failure ends the script with the same
        # exit status (0) as before, but the cause is now reported instead of
        # being swallowed. KeyboardInterrupt/SystemExit are no longer caught.
        print(f"Chunking aborted: {err!r}", file=sys.stderr)
        sys.exit(0)

    _assert_lossless(original_text, chunks, 3)


if __name__ == "__main__":
    _main()