From 09f7b39512a72de432afd245b5efc9d87ccd6207 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:04:47 +0200
Subject: [PATCH] test files updated

---
 Project_Model/Tests/bpe_test.py      |  3 +-
 Project_Model/Tests/splitter_test.py | 49 ++++++++++++++++++++++++++++
 Project_Model/Tests/tokenano_test.py | 21 ++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 Project_Model/Tests/tokenano_test.py

diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py
index e6c8f31..0acae46 100644
--- a/Project_Model/Tests/bpe_test.py
+++ b/Project_Model/Tests/bpe_test.py
@@ -70,4 +70,5 @@ class TestBPE:
 
 # Useful to debug weird cases
 if __name__ == "__main__":
-    TestBPE().test_bpe_decoding_simple()
+    # TestBPE().test_bpe_decoding_simple()
+    TestBPE().test_bpe_encoding_simple()
diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py
index eda95b6..154e24e 100644
--- a/Project_Model/Tests/splitter_test.py
+++ b/Project_Model/Tests/splitter_test.py
@@ -45,6 +45,7 @@ class TestSplitter:
             ("", TokenType.SPECIAL),
             ("m d", TokenType.BPE),
             ("", TokenType.SPECIAL),
+            #("olor", TokenType.BPE)
         ]
 
         CHUNKS = list(SPLITTER.split_text(TEXT))
@@ -129,3 +130,51 @@ class TestSplitter:
 
            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
+
+    def test_split_token_decode_simple(self):
+        # to test the token split into special and bpe
+        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
+        token_list = [100,101,1477]
+
+        CHUNKS = list(SPLITTER.split_tokens(token_list))
+        EXPECTED_CHUNKS = [
+            ([100,101], TokenType.BPE),
+            (1477, TokenType.SPECIAL),
+        ]
+
+        assert len(CHUNKS) == len(EXPECTED_CHUNKS)
+
+        for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
+            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
+            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
+            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
+
+            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
+            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
+
+    def test_split_token_decode_simple_malformed(self):
+        # to test the token split into special and bpe
+        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
+        token_list = [100,101,1477,100]
+
+        CHUNKS = list(SPLITTER.split_tokens(token_list))
+        EXPECTED_CHUNKS = [
+            ([100,101], TokenType.BPE),
+            (1477, TokenType.SPECIAL),
+        ]
+
+        assert len(CHUNKS) == len(EXPECTED_CHUNKS)
+
+        for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
+            print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
+            RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
+            EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
+
+            assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
+            assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
+
+
+
+# Useful to debug weird cases
+if __name__ == "__main__":
+    TestSplitter().test_split_trailing_text()
\ No newline at end of file
diff --git a/Project_Model/Tests/tokenano_test.py b/Project_Model/Tests/tokenano_test.py
new file mode 100644
index 0000000..2dc7779
--- /dev/null
+++ b/Project_Model/Tests/tokenano_test.py
@@ -0,0 +1,21 @@
+
+from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
+
+class TestTokeNano:
+
+    def test_decode_encode_simple(self):
+        TEXT = "abababab"
+
+        # ab = 256
+        # 256, 256 = 257
+        # 257, 257 = 258
+
+        VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
+        # EXPECTED = [258]
+
+        TOKE_NANO = TokeNanoCore(VOCABULARY)
+
+        ENCODED = TOKE_NANO.encode(TEXT)
+        DECODED = TOKE_NANO.decode(ENCODED)
+
+        assert TEXT == DECODED