"""Tests for ``BPE.NanoSocratesChunker``.

Recovered from the git patch "Added test for chunker" (commit 3e8b5c5), which
creates ``Project_Model/Tests/chunker_test.py``.

All fixture paths are relative to the current working directory, so pytest (or
this file, when executed as a script) has to be started from the repository
root.
"""

from pathlib import Path
import re
import sys

import pytest

import Project_Model.Libs.BPE as BPE

# Special symbols handed to the chunker as a compiled regex.
PATTERN = r"<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)

# Directory holding the text fixtures used below.
CHUNKER_FILES_DIR = Path("Project_Model/Tests/chunker_files")


def _collect_chunks(file_path, chunk_size):
    """Run the chunker over *file_path* and return every chunk it yields.

    Args:
        file_path: Fixture file passed to ``NanoSocratesChunker.chunk``.
        chunk_size: First positional argument of ``NanoSocratesChunker``
            (presumably the chunk size limit -- confirm against the BPE lib).

    Returns:
        The chunks in the order they were produced.

    Raises:
        Whatever ``NanoSocratesChunker.chunk`` raises while being iterated;
        ``test_throwing`` expects ``BPE.DelimiterNotFoundException``.
    """
    chunker = BPE.NanoSocratesChunker(chunk_size, SYMBOL_REGEX)
    chunks = []
    for chunk in chunker.chunk(file_path):
        # Printed so the chunk boundaries show up in pytest's captured
        # output when a test fails.
        print(chunk)
        chunks.append(chunk)
    return chunks


def _assert_round_trip(file_path, chunk_size, least_expected_chunks):
    """Assert that chunking *file_path* is lossless and yields enough chunks.

    Args:
        file_path: Fixture file to chunk.
        chunk_size: Forwarded to ``_collect_chunks``.
        least_expected_chunks: Minimum number of chunks that must be produced.
    """
    original_text = file_path.read_text(encoding="utf-8")
    chunks = _collect_chunks(file_path, chunk_size)

    assert len(chunks) >= least_expected_chunks
    # Concatenating the chunks must give back the file byte-for-byte.
    assert "".join(chunks) == original_text


class TestChunker:
    """Round-trip and failure tests for ``BPE.NanoSocratesChunker``."""

    def test_correct_simple(self):
        """``simple.txt`` with size 40 splits into >= 3 lossless chunks."""
        _assert_round_trip(CHUNKER_FILES_DIR / "simple.txt", 40, 3)

    def test_correct_edge_1(self):
        """``edge-1.txt`` with size 15 splits into >= 3 lossless chunks."""
        _assert_round_trip(CHUNKER_FILES_DIR / "edge-1.txt", 15, 3)

    def test_throwing(self):
        """``simple.txt`` with size 5 must raise DelimiterNotFoundException."""
        with pytest.raises(BPE.DelimiterNotFoundException):
            _collect_chunks(CHUNKER_FILES_DIR / "simple.txt", 5)


def _run_stress():
    """Chunk ``stress.txt`` verbosely and verify the round trip.

    Returns:
        Process exit status: 0 when the round trip holds, 1 when the chunker
        raised ``BPE.DelimiterNotFoundException``.
    """
    file_path = CHUNKER_FILES_DIR / "stress.txt"
    least_expected_chunks = 3
    original_text = file_path.read_text(encoding="utf-8")

    chunker = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
    chunks = []

    try:
        for chunk in chunker.chunk(file_path):
            print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
            chunks.append(chunk)
    except BPE.DelimiterNotFoundException as err:
        # Only the chunker's own failure is handled here, and it is reported
        # with a non-zero status. A bare ``except`` + ``exit(0)`` would hide
        # every error (including Ctrl-C) behind a "success" exit code.
        print(f"Chunking {file_path} failed: {err!r}", file=sys.stderr)
        return 1

    assert len(chunks) >= least_expected_chunks
    assert "".join(chunks) == original_text
    return 0


if __name__ == "__main__":
    sys.exit(_run_stress())