Added test asserting that no illegal tokens are included in the target

This commit is contained in:
Christian Risi 2025-10-06 16:17:12 +02:00
parent e93710af08
commit 1797571bb2

View File

@ -16,7 +16,7 @@ class TestSpannedMasker:
CORPUS_PATH = Path("Project_Model/Tests/spanner_file/mask.txt") CORPUS_PATH = Path("Project_Model/Tests/spanner_file/mask.txt")
TEXT = CORPUS_PATH.read_text("utf-8") TEXT = CORPUS_PATH.read_text("utf-8")
CORRUPTION_PERCENTAGE = 0.15 CORRUPTION_PERCENTAGE = 0.15
TOLERANCE = 0.05 TOLERANCE = 0.15
TOKENIZER = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST) TOKENIZER = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
VOCABULARY_SIZE = TOKENIZER.vocabulary_size VOCABULARY_SIZE = TOKENIZER.vocabulary_size
@ -79,6 +79,9 @@ class TestSpannedMasker:
) )
) )
for token in TARGET[:len(TARGET) - 1]:
assert token not in ILLEGAL_TOKENS
assert ACTUAL_CORRUPTION_PERCENTAGE > CORRUPTION_PERCENTAGE - TOLERANCE assert ACTUAL_CORRUPTION_PERCENTAGE > CORRUPTION_PERCENTAGE - TOLERANCE
assert ACTUAL_CORRUPTION_PERCENTAGE < CORRUPTION_PERCENTAGE + TOLERANCE assert ACTUAL_CORRUPTION_PERCENTAGE < CORRUPTION_PERCENTAGE + TOLERANCE