Added test for chunker
This commit is contained in:
parent
8db35732f9
commit
3e8b5c5579
89
Project_Model/Tests/chunker_test.py
Normal file
89
Project_Model/Tests/chunker_test.py
Normal file
@ -0,0 +1,89 @@
|
||||
from pathlib import Path
|
||||
import re
|
||||
import pytest
|
||||
import Project_Model.Libs.BPE as BPE
|
||||
|
||||
# Special marker tokens the chunker is given as delimiters:
# <TOKEN>, <SOT>, <SEP> and <EOT>. The capture group holds the marker name.
PATTERN = r"<(TOKEN|SOT|SEP|EOT)>"

# Compiled once at import time and shared by every test in this module.
SYMBOL_REGEX = re.compile(PATTERN)
|
||||
|
||||
class TestChunker:
    """Round-trip and failure tests for ``BPE.NanoSocratesChunker``."""

    # Fixture files live in ``chunker_files`` next to this test module.
    # Anchoring on ``__file__`` keeps the tests independent of the directory
    # pytest happens to be launched from.
    FILES_DIR = Path(__file__).resolve().parent / "chunker_files"

    def _assert_round_trip(self, file_name, chunk_size, least_expected_chunks):
        """Chunk a fixture file and check that the chunks rebuild it exactly.

        Args:
            file_name: Name of the fixture inside ``FILES_DIR``.
            chunk_size: First positional argument handed to
                ``BPE.NanoSocratesChunker`` (presumably the chunk size
                limit -- confirm against the BPE library).
            least_expected_chunks: Minimum number of chunks the chunker must
                yield for the fixture.
        """
        file_path = self.FILES_DIR / file_name
        orig_text = file_path.read_text(encoding="utf-8")
        chunker = BPE.NanoSocratesChunker(chunk_size, SYMBOL_REGEX)

        chunks = []
        for chunk in chunker.chunk(file_path):
            # Printed so pytest's captured output shows each chunk when an
            # assertion below fails.
            print(chunk)
            chunks.append(chunk)

        assert len(chunks) >= least_expected_chunks
        # Concatenating the chunks must give back the file byte-for-byte.
        assert "".join(chunks) == orig_text

    def test_correct_simple(self):
        """``simple.txt`` with argument 40 yields >= 3 chunks and round-trips."""
        self._assert_round_trip("simple.txt", 40, 3)

    def test_correct_edge_1(self):
        """``edge-1.txt`` with argument 15 yields >= 3 chunks and round-trips."""
        self._assert_round_trip("edge-1.txt", 15, 3)

    def test_throwing(self):
        """``simple.txt`` with argument 5 must raise ``DelimiterNotFoundException``."""
        file_path = self.FILES_DIR / "simple.txt"
        chunker = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)

        with pytest.raises(BPE.DelimiterNotFoundException):
            # The chunker is consumed through iteration, so the loop has to
            # run for the exception to surface.
            for chunk in chunker.chunk(file_path):
                print(chunk)
|
||||
|
||||
if __name__ == "__main__":
    # Manual stress run: chunk ``stress.txt``, echo every chunk with its
    # length, then check that the chunks rebuild the original file.

    # Resolve the fixture next to this file so the script can be started
    # from any working directory.
    FILE_PATH = Path(__file__).resolve().parent / "chunker_files" / "stress.txt"
    LEAST_EXPECTED_CHUNKS = 3
    ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")

    CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)

    CHUNKS = []

    try:
        for chunk in CHUNKER.chunk(FILE_PATH):
            print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
            CHUNKS.append(chunk)
    except Exception as err:
        # Report the failure and exit non-zero instead of silently exiting
        # with status 0. ``Exception`` (not a bare ``except``) lets
        # KeyboardInterrupt / SystemExit propagate normally.
        raise SystemExit(f"Chunking {FILE_PATH} failed: {err!r}") from err

    NANO_TEXT = "".join(CHUNKS)

    assert len(CHUNKS) >= LEAST_EXPECTED_CHUNKS
    assert NANO_TEXT == ORIG_TEXT
|
||||
Loading…
x
Reference in New Issue
Block a user