From c74689d01d0b8c3c5217cc15a806200c58d6eef0 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 13:27:38 +0200
Subject: [PATCH] Fixed tests to reflect new version of tokenizer

---
 Project_Model/Tests/splitter_test.py | 26 ++++++++++++++------------
 Project_Model/Tests/tokenano_test.py |  2 +-
 2 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py
index 154e24e..2bf9a0f 100644
--- a/Project_Model/Tests/splitter_test.py
+++ b/Project_Model/Tests/splitter_test.py
@@ -18,7 +18,8 @@ class TestSplitter:
 
         EXPECTED_CHUNKS = [
             ("", TokenType.SPECIAL),
-            ("Lorem ", TokenType.BPE),
+            ("Lorem", TokenType.BPE),
+            (" ", TokenType.BPE),
             ("", TokenType.SPECIAL),
         ]
 
@@ -43,9 +44,10 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("ipsu", TokenType.BPE),
             ("", TokenType.SPECIAL),
-            ("m d", TokenType.BPE),
+            ("m", TokenType.BPE),
+            (" d", TokenType.BPE),
             ("", TokenType.SPECIAL),
-            #("olor", TokenType.BPE)
+            # ("olor", TokenType.BPE)
         ]
 
         CHUNKS = list(SPLITTER.split_text(TEXT))
@@ -69,7 +71,8 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("ipsu", TokenType.BPE),
             ("", TokenType.SPECIAL),
-            ("m d", TokenType.BPE),
+            ("m", TokenType.BPE),
+            (" d", TokenType.BPE),
             ("", TokenType.SPECIAL),
             ("", TokenType.SPECIAL),
             ("", TokenType.SPECIAL),
@@ -134,12 +137,12 @@ class TestSplitter:
     def test_split_token_decode_simple(self):
         # to test the token split into special and bpe
         SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
-        token_list = [100,101,1477]
+        token_list = [100, 101, 1477]
         CHUNKS = list(SPLITTER.split_tokens(token_list))
 
         EXPECTED_CHUNKS = [
-            ([100,101], TokenType.BPE),
-            (1477, TokenType.SPECIAL),
+            ([100, 101], TokenType.BPE),
+            ([1477], TokenType.SPECIAL),
         ]
 
         assert len(CHUNKS) == len(EXPECTED_CHUNKS)
@@ -155,12 +158,12 @@ class TestSplitter:
     def test_split_token_decode_simple_malformed(self):
         # to test the token split into special and bpe
        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
-        token_list = [100,101,1477,100]
+        token_list = [100, 101, 1477, 100]
         CHUNKS = list(SPLITTER.split_tokens(token_list))
 
         EXPECTED_CHUNKS = [
-            ([100,101], TokenType.BPE),
-            (1477, TokenType.SPECIAL),
+            ([100, 101], TokenType.BPE),
+            ([1477], TokenType.SPECIAL),
         ]
 
         assert len(CHUNKS) == len(EXPECTED_CHUNKS)
@@ -174,7 +177,6 @@ class TestSplitter:
 
         assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
 
-
 # Useful to debug weird cases
 if __name__ == "__main__":
-    TestSplitter().test_split_trailing_text()
\ No newline at end of file
+    TestSplitter().test_split_trailing_text()
diff --git a/Project_Model/Tests/tokenano_test.py b/Project_Model/Tests/tokenano_test.py
index 2dc7779..c8f0d88 100644
--- a/Project_Model/Tests/tokenano_test.py
+++ b/Project_Model/Tests/tokenano_test.py
@@ -13,7 +13,7 @@ class TestTokeNano:
     VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
     # EXPECTED = [258]
 
-    TOKE_NANO = TokeNanoCore(VOCABULARY)
+    TOKE_NANO = TokeNanoCore(VOCABULARY, ["", ""])
 
     ENCODED = TOKE_NANO.encode(TEXT)
     DECODED = TOKE_NANO.decode(ENCODED)
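
Note on the new split_tokens contract: NanoSocratesSplitter's implementation is not part of this patch, so the sketch below is only a hypothetical reconstruction of what the updated assertions encode. Consecutive ids at or below max_bpe_token_id are grouped into a single BPE chunk; each special id is now yielded as a single-element list instead of a bare int; and, judging by the "malformed" test, a trailing BPE run after a special token is dropped. The name split_tokens_sketch, the <= boundary, and the drop rule are assumptions, not the library's API. The split_text changes point the same way: a word and its trailing whitespace ("Lorem" then " ") are now separate BPE chunks rather than one ("Lorem "), though that path is not sketched here.

    # Hypothetical sketch of the contract the updated tests assert; the real
    # NanoSocratesSplitter.split_tokens is not shown in this patch.
    from enum import Enum, auto
    from typing import Iterator


    class TokenType(Enum):
        BPE = auto()
        SPECIAL = auto()


    def split_tokens_sketch(
        token_list: list[int], max_bpe_token_id: int
    ) -> Iterator[tuple[list[int], TokenType]]:
        bpe_run: list[int] = []
        seen_special = False
        for token in token_list:
            if token <= max_bpe_token_id:  # assumed boundary: ids above it are special
                bpe_run.append(token)
                continue
            if bpe_run:  # flush the pending BPE run before the special token
                yield bpe_run, TokenType.BPE
                bpe_run = []
            yield [token], TokenType.SPECIAL  # specials are now single-element lists
            seen_special = True
        # The "malformed" test expects the trailing 100 after the special 1477
        # to be discarded, so a leftover BPE run is only emitted when no special
        # token was seen (assumption inferred from that test alone).
        if bpe_run and not seen_special:
            yield bpe_run, TokenType.BPE


    # Both updated tests hold under this sketch:
    assert list(split_tokens_sketch([100, 101, 1477], 1473)) == [
        ([100, 101], TokenType.BPE),
        ([1477], TokenType.SPECIAL),
    ]
    assert list(split_tokens_sketch([100, 101, 1477, 100], 1473)) == [
        ([100, 101], TokenType.BPE),
        ([1477], TokenType.SPECIAL),
    ]

Yielding specials as lists keeps the chunk payload type uniform (always list[int]), which is presumably why the expected pair (1477, TokenType.SPECIAL) became ([1477], TokenType.SPECIAL) in both decode tests.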