# Listing metadata from the code viewer: 90 lines, 2.1 KiB, Python.
import re
import sys
from pathlib import Path

import pytest

import Project_Model.Libs.BPE as BPE

# Regex source for the special marker tokens <TOKEN>, <SOT>, <SEP> and <EOT>.
# The single capture group holds the marker name without the angle brackets.
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"

# Compiled once at import time and passed to every chunker built in this file.
SYMBOL_REGEX = re.compile(PATTERN)


class TestChunker:
    """Round-trip and failure tests for ``BPE.NanoSocratesChunker``.

    The fixture paths are relative, so they are resolved against the current
    working directory: run the tests from the directory that contains
    ``Project_Model``.
    """

    @staticmethod
    def _drain(chunker, file_path: Path) -> list:
        """Iterate ``chunker.chunk(file_path)`` and return every chunk in order.

        Each chunk is printed as it arrives; pytest captures the output and
        shows it only when a test fails, which helps locate a bad split.
        Any exception raised while iterating propagates to the caller.
        """
        collected = []
        for chunk in chunker.chunk(file_path):
            print(chunk)
            collected.append(chunk)
        return collected

    def _assert_round_trip(
        self, size: int, file_path: Path, least_expected_chunks: int
    ) -> None:
        """Check that chunking ``file_path`` is lossless and splits enough.

        Args:
            size: First positional argument of ``NanoSocratesChunker``
                (presumably a chunk size limit -- confirm against the BPE
                module).
            file_path: Text fixture to chunk.
            least_expected_chunks: Minimum number of chunks that must be
                produced.
        """
        original_text = file_path.read_text(encoding="utf-8")
        chunker = BPE.NanoSocratesChunker(size, SYMBOL_REGEX)

        chunks = self._drain(chunker, file_path)

        assert len(chunks) >= least_expected_chunks
        # Concatenating the chunks must rebuild the file exactly.
        assert "".join(chunks) == original_text

    def test_correct_simple(self):
        """``simple.txt`` with size 40 yields >= 3 chunks that rebuild the file."""
        self._assert_round_trip(
            40,
            Path("Project_Model/Tests/chunker_files/simple.txt"),
            3,
        )

    def test_correct_edge_1(self):
        """``edge-1.txt`` with size 15 yields >= 3 chunks that rebuild the file."""
        self._assert_round_trip(
            15,
            Path("Project_Model/Tests/chunker_files/edge-1.txt"),
            3,
        )

    def test_throwing(self):
        """``simple.txt`` with size 5 must raise ``DelimiterNotFoundException``."""
        file_path = Path("Project_Model/Tests/chunker_files/simple.txt")

        # Built outside the ``raises`` block so that only an error raised
        # while chunking can satisfy the expectation.
        chunker = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)

        with pytest.raises(BPE.DelimiterNotFoundException):
            self._drain(chunker, file_path)


if __name__ == "__main__":
    # Manual stress run: chunk ``stress.txt``, print every chunk with its
    # length, then check that the chunks rebuild the original file.
    FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
    LEAST_EXPECTED_CHUNKS = 3
    ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")

    CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)

    CHUNKS = []

    try:
        for chunk in CHUNKER.chunk(FILE_PATH):
            print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
            CHUNKS.append(chunk)
    except Exception as error:
        # ``Exception`` rather than a bare ``except`` so Ctrl-C and
        # ``SystemExit`` are not swallowed. The failure is reported on stderr
        # instead of being discarded; exit status 0 is kept as in the
        # original script.
        print(f"Chunking aborted: {error!r}", file=sys.stderr)
        sys.exit(0)

    NANO_TEXT = "".join(CHUNKS)

    assert len(CHUNKS) >= LEAST_EXPECTED_CHUNKS
    assert NANO_TEXT == ORIG_TEXT