Fixed tests to reflect new version of tokenizer

Christian Risi 2025-10-03 13:27:38 +02:00
parent 51f491d033
commit c74689d01d
2 changed files with 15 additions and 13 deletions


@@ -18,7 +18,8 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("<SOT>", TokenType.SPECIAL),
-            ("Lorem ", TokenType.BPE),
+            ("Lorem", TokenType.BPE),
+            (" ", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
         ]
@@ -43,9 +44,10 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("ipsu", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
-            ("m d", TokenType.BPE),
+            ("m", TokenType.BPE),
+            (" d", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
-            #("olor", TokenType.BPE)
+            # ("olor", TokenType.BPE)
         ]
         CHUNKS = list(SPLITTER.split_text(TEXT))
@@ -69,7 +71,8 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("ipsu", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
-            ("m d", TokenType.BPE),
+            ("m", TokenType.BPE),
+            (" d", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
             ("<SEP>", TokenType.SPECIAL),
             ("<SEP>", TokenType.SPECIAL),
@@ -134,12 +137,12 @@ class TestSplitter:
     def test_split_token_decode_simple(self):
         # to test the token split into special and bpe
         SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
-        token_list = [100,101,1477]
+        token_list = [100, 101, 1477]
         CHUNKS = list(SPLITTER.split_tokens(token_list))
         EXPECTED_CHUNKS = [
-            ([100,101], TokenType.BPE),
-            (1477, TokenType.SPECIAL),
+            ([100, 101], TokenType.BPE),
+            ([1477], TokenType.SPECIAL),
         ]
         assert len(CHUNKS) == len(EXPECTED_CHUNKS)
@@ -155,12 +158,12 @@ class TestSplitter:
     def test_split_token_decode_simple_malformed(self):
         # to test the token split into special and bpe
         SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
-        token_list = [100,101,1477,100]
+        token_list = [100, 101, 1477, 100]
        CHUNKS = list(SPLITTER.split_tokens(token_list))
         EXPECTED_CHUNKS = [
-            ([100,101], TokenType.BPE),
-            (1477, TokenType.SPECIAL),
+            ([100, 101], TokenType.BPE),
+            ([1477], TokenType.SPECIAL),
         ]
         assert len(CHUNKS) == len(EXPECTED_CHUNKS)
@@ -174,7 +177,6 @@ class TestSplitter:
             assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
 # Useful to debug weird cases
 if __name__ == "__main__":
     TestSplitter().test_split_trailing_text()
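
The updated expectations in this file describe two behaviours of the reworked splitter: in text mode, trailing whitespace is now emitted as its own BPE chunk ("Lorem " becomes "Lorem" followed by " "), and in token mode, split_tokens yields each special id as a one-element list instead of a bare int. Below is a minimal sketch of that token grouping; it is an illustration, not the project's NanoSocratesSplitter, it assumes ids above max_bpe_token_id are special, and it does not reproduce how the real splitter handles a trailing BPE run after a special token (the "malformed" test, where the trailing id is expected to be dropped).

```python
from enum import Enum, auto
from typing import Iterator


class TokenType(Enum):
    # Stand-in for the project's TokenType; only the two kinds the tests use.
    BPE = auto()
    SPECIAL = auto()


def split_tokens(
    token_list: list[int], max_bpe_token_id: int
) -> Iterator[tuple[list[int], TokenType]]:
    """Group consecutive BPE ids into one list; emit each special id as [id]."""
    buffer: list[int] = []
    for token in token_list:
        if token > max_bpe_token_id:  # assumed boundary: ids above the max are special
            if buffer:
                yield buffer, TokenType.BPE
                buffer = []
            yield [token], TokenType.SPECIAL
        else:
            buffer.append(token)
    if buffer:  # the real splitter's "malformed" test drops a trailing run like this
        yield buffer, TokenType.BPE


if __name__ == "__main__":
    assert list(split_tokens([100, 101, 1477], max_bpe_token_id=1473)) == [
        ([100, 101], TokenType.BPE),
        ([1477], TokenType.SPECIAL),
    ]
```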


@@ -13,7 +13,7 @@ class TestTokeNano:
         VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
         # EXPECTED = [258]
-        TOKE_NANO = TokeNanoCore(VOCABULARY)
+        TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])
         ENCODED = TOKE_NANO.encode(TEXT)
         DECODED = TOKE_NANO.decode(ENCODED)
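
For reference, the commented-out `# EXPECTED = [258]` suggests TEXT is a string such as "abababab" that the three merges collapse into a single token. The sketch below is a toy byte-pair encoder/decoder over the same vocabulary, not the repository's TokeNanoCore (it ignores the new special-token argument added in this commit); it only illustrates how applying the merges in ascending id order yields [258] and how decoding unwinds them.

```python
def bpe_encode(text: str, vocabulary: dict[tuple[int, int], int]) -> list[int]:
    """Apply learned merges to the UTF-8 bytes of `text`, lowest new id first."""
    tokens = list(text.encode("utf-8"))
    for pair, new_id in sorted(vocabulary.items(), key=lambda kv: kv[1]):
        merged, i = [], 0
        while i < len(tokens):
            if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
                merged.append(new_id)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged
    return tokens


def bpe_decode(tokens: list[int], vocabulary: dict[tuple[int, int], int]) -> str:
    """Expand merged ids back into their byte pairs, highest id first, then decode."""
    inverse = {new_id: pair for pair, new_id in vocabulary.items()}
    out = list(tokens)
    for new_id in sorted(inverse, reverse=True):
        expanded = []
        for tok in out:
            if tok == new_id:
                expanded.extend(inverse[new_id])
            else:
                expanded.append(tok)
        out = expanded
    return bytes(out).decode("utf-8")


if __name__ == "__main__":
    VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
    # "abababab" is an assumed stand-in for the test's TEXT constant.
    assert bpe_encode("abababab", VOCABULARY) == [258]
    assert bpe_decode([258], VOCABULARY) == "abababab"
```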