Added test for chunker
This commit is contained in:
parent
8db35732f9
commit
3e8b5c5579
89
Project_Model/Tests/chunker_test.py
Normal file
89
Project_Model/Tests/chunker_test.py
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
import pytest
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
# Matches the literal markers <TOKEN>, <SOT>, <SEP> and <EOT>; group 1 captures
# the marker name. The compiled form is what the tests in this module pass as
# the second argument of BPE.NanoSocratesChunker.
PATTERN = r"<(TOKEN|SOT|SEP|EOT)>"
SYMBOL_REGEX = re.compile(PATTERN)
|
||||||
|
|
||||||
|
class TestChunker:
    """Round-trip and failure checks for ``BPE.NanoSocratesChunker``."""

    @staticmethod
    def _drain(chunker, source):
        """Print each chunk yielded by ``chunker.chunk(source)`` and return them as a list."""
        pieces = []
        for piece in chunker.chunk(source):
            print(piece)
            pieces.append(piece)
        return pieces

    def test_correct_simple(self):
        """Chunks of ``simple.txt`` number at least three and rejoin to the file text."""
        source = Path("Project_Model/Tests/chunker_files/simple.txt")
        minimum_chunks = 3
        expected = source.read_text(encoding="utf-8")

        chunker = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
        pieces = self._drain(chunker, source)
        rebuilt = "".join(pieces)

        assert len(pieces) >= minimum_chunks
        assert rebuilt == expected

    def test_correct_edge_1(self):
        """Same round-trip check on ``edge-1.txt`` with a chunker built with 15."""
        source = Path("Project_Model/Tests/chunker_files/edge-1.txt")
        minimum_chunks = 3
        expected = source.read_text(encoding="utf-8")

        chunker = BPE.NanoSocratesChunker(15, SYMBOL_REGEX)
        pieces = self._drain(chunker, source)
        rebuilt = "".join(pieces)

        assert len(pieces) >= minimum_chunks
        assert rebuilt == expected

    def test_throwing(self):
        """Iterating ``simple.txt`` with a chunker built with 5 raises ``DelimiterNotFoundException``."""
        source = Path("Project_Model/Tests/chunker_files/simple.txt")

        chunker = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)

        # The exception is expected while the generator is being consumed,
        # so the iteration itself sits inside the raises-context.
        with pytest.raises(BPE.DelimiterNotFoundException):
            for piece in chunker.chunk(source):
                print(piece)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual stress run: chunk stress.txt, echo every chunk with its length,
    # then check that the chunks rejoin to the original file text.
    FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
    LEAST_EXPECTED_CHUNKS = 3
    ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")

    CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)

    CHUNKS = []

    try:
        for chunk in CHUNKER.chunk(FILE_PATH):
            print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
            CHUNKS.append(chunk)
    except Exception as err:
        # Report the failure and exit non-zero instead of silently exiting
        # with status 0. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate normally.
        raise SystemExit(f"Chunking {FILE_PATH} failed: {err!r}") from err

    NANO_TEXT = "".join(CHUNKS)

    assert len(CHUNKS) >= LEAST_EXPECTED_CHUNKS
    assert NANO_TEXT == ORIG_TEXT
|
||||||
Loading…
x
Reference in New Issue
Block a user