From d3b1f7da91463482f87130a83503766686e140c2 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Mon, 6 Oct 2025 15:55:40 +0200
Subject: [PATCH] Add test for spanned masking

---
 Project_Model/Tests/spanned_masker_test.py | 74 ++++++++++++++++++++++
 Project_Model/Tests/spanner_file/mask.txt  |  1 +
 2 files changed, 75 insertions(+)
 create mode 100644 Project_Model/Tests/spanned_masker_test.py
 create mode 100644 Project_Model/Tests/spanner_file/mask.txt

diff --git a/Project_Model/Tests/spanned_masker_test.py b/Project_Model/Tests/spanned_masker_test.py
new file mode 100644
index 0000000..e78d04e
--- /dev/null
+++ b/Project_Model/Tests/spanned_masker_test.py
@@ -0,0 +1,74 @@
+from pathlib import Path
+
+import Project_Model.Libs.BPE as BPE
+import Project_Model.Libs.Transformer as Transformer
+
+VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
+VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
+SPECIAL_LIST = BPE.default_special_tokens()
+
+
+class TestSpannedMasker:
+
+    def test_spanned_masking(self):
+
+        CORPUS_PATH = Path("Project_Model/Tests/spanner_file/mask.txt")
+        TEXT = CORPUS_PATH.read_text("utf-8")
+
+        TOKENIZER = BPE.TokeNanoCore(
+            VOCABULARY,
+            SPECIAL_LIST
+        )
+        VOCABULARY_SIZE = TOKENIZER.vocabulary_size
+
+        # Corrupt roughly 40% of the tokens, in spans of ~3 tokens each.
+        MASKER = Transformer.SpannedMasker(0.4, average_span=3)
+
+        TOKENS = TOKENIZER.encode(TEXT)
+
+        LEGAL_TOKENS: set[int] = set(TOKENIZER.encode(
+            ""
+        ))
+
+        SPECIAL_TOKENS: set[int] = set(TOKENIZER.encode(
+            "".join(SPECIAL_LIST)
+        ))
+
+        # Special tokens that must never be masked.
+        ILLEGAL_TOKENS: set[int] = SPECIAL_TOKENS.difference(LEGAL_TOKENS)
+
+        SPECIAL_FORMATTER = TOKENIZER.encode("*")[0]
+        END_FORMATTER = TOKENIZER.encode("")[0]
+
+        OUTPUT, TARGET = MASKER.mask_sequence(TOKENS, VOCABULARY_SIZE, ILLEGAL_TOKENS)
+
+        # Sentinel ids sit above the vocabulary size, so anything at or
+        # below it is a real token.
+        UNCORRUPTED_TOKENS = list(filter(lambda token: token <= VOCABULARY_SIZE, OUTPUT))
+        CORRUPTED_TOKENS = list(filter(lambda token: token <= VOCABULARY_SIZE, TARGET))
+
+        TARGET.append(END_FORMATTER)
+
+        # Swap sentinel ids for a printable marker before decoding.
+        OUTPUT = list(map(lambda token: SPECIAL_FORMATTER if token > VOCABULARY_SIZE else token, OUTPUT))
+        TARGET = list(map(lambda token: SPECIAL_FORMATTER if token > VOCABULARY_SIZE else token, TARGET))
+
+        OUT_TEXT = TOKENIZER.decode(OUTPUT)
+        TAR_TEXT = TOKENIZER.decode(TARGET)
+
+        print(f"Original text:\n\n{TEXT}")
+        print(f"Inputs:\n\n{OUT_TEXT}")
+        print(f"Targets:\n\n{TAR_TEXT}")
+
+        print("\n".join([
+            "======================",
+            f"Original length: {len(TOKENS)}",
+            f"Uncorrupted tokens: {len(UNCORRUPTED_TOKENS)}",
+            f"Corrupted tokens: {len(CORRUPTED_TOKENS)}",
+            f"Corruption percentage: {(len(CORRUPTED_TOKENS) / len(TOKENS)) * 100:.2f}%",
+            "======================"
+        ]))
+
+
+if __name__ == "__main__":
+    TestSpannedMasker().test_spanned_masking()
diff --git a/Project_Model/Tests/spanner_file/mask.txt b/Project_Model/Tests/spanner_file/mask.txt
new file mode 100644
index 0000000..88215b8
--- /dev/null
+++ b/Project_Model/Tests/spanner_file/mask.txt
@@ -0,0 +1 @@
+dbp-dbr:How_It_Should_Have_Endeddbp-dbp:titledbp-dbr:The_Dark_Knightdbp-dbr:The_Dark_Knightdbp-dbp:captionTheatrical release posterdbp-dbr:The_Dark_Knightdbp-dbp:directordbp-dbr:Christopher_Nolandbp-dbr:The_Dark_Knightdbp-dbp:distributorWarner Bros. Picturesdbp-dbr:The_Dark_Knightdbp-dbp:producerCharles Rovendbp-dbr:The_Dark_Knightdbp-dbp:producerChristopher Nolandbp-dbr:The_Dark_Knightdbp-dbp:producerEmma Thomasdbp-dbr:The_Dark_Knightdbp-dbp:starringChristian Bale
\ No newline at end of file
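
Reviewer note: Transformer.SpannedMasker itself is not part of this patch, so
the test relies on an assumed contract for mask_sequence(tokens,
vocabulary_size, illegal_tokens): it returns an (output, target) pair in
which each corrupted span is collapsed into a sentinel id strictly greater
than vocabulary_size, output keeps the surviving tokens plus sentinels, and
target lists each sentinel followed by the tokens it replaced. That is why
the test can separate real tokens from sentinels with a plain
"token <= VOCABULARY_SIZE" check. The sketch below is a minimal T5-style
rendering of that contract, for reference only: the signature mirrors the
call site, while corruption_rate, average_span, and the span-sampling scheme
are assumptions, not the project's actual implementation.

    import random

    def mask_sequence(
        tokens: list[int],
        vocabulary_size: int,
        illegal_tokens: set[int],
        corruption_rate: float = 0.4,  # assumed meaning of the 0.4 above
        average_span: int = 3,         # assumed meaning of average_span=3
    ) -> tuple[list[int], list[int]]:
        budget = int(len(tokens) * corruption_rate)  # hard cap on masked tokens
        sentinel = vocabulary_size + 1               # first id outside the vocabulary
        output: list[int] = []
        target: list[int] = []
        i = 0
        while i < len(tokens):
            # Spans have mean length average_span; starting one with
            # probability corruption_rate / average_span keeps the expected
            # corrupted fraction near corruption_rate.
            span = min(random.randint(1, 2 * average_span - 1),
                       budget, len(tokens) - i)
            chunk = tokens[i:i + span]
            if (span > 0
                    and random.random() < corruption_rate / average_span
                    and not illegal_tokens.intersection(chunk)):
                output.append(sentinel)   # the whole span collapses to one sentinel
                target.append(sentinel)   # target: sentinel, then the masked span
                target.extend(chunk)
                sentinel += 1
                budget -= span
                i += span
            else:
                output.append(tokens[i])  # token survives uncorrupted
                i += 1
        return output, target

Under this contract, mapping every id above vocabulary_size to a printable
marker and then decoding, as the test does, reproduces the corrupted input
and the span targets side by side.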