Fixed tests to reflect new version of tokenizer
parent 51f491d033
commit c74689d01d

@@ -19,6 +19,7 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("<SOT>", TokenType.SPECIAL),
             ("Lorem", TokenType.BPE),
+            (" ", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
         ]
 
@@ -43,7 +44,8 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("ipsu", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
-            ("m d", TokenType.BPE),
+            ("m", TokenType.BPE),
+            (" d", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
             # ("olor", TokenType.BPE)
         ]
@@ -69,7 +71,8 @@ class TestSplitter:
         EXPECTED_CHUNKS = [
             ("ipsu", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
-            ("m d", TokenType.BPE),
+            ("m", TokenType.BPE),
+            (" d", TokenType.BPE),
             ("<SEP>", TokenType.SPECIAL),
             ("<SEP>", TokenType.SPECIAL),
             ("<SEP>", TokenType.SPECIAL),
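The three hunks above encode the same behavioural change: the splitter now emits whitespace as its own BPE chunk instead of folding it into a neighbouring word ("m d" becomes "m" followed by " d"). A minimal sketch of the shape the tests now expect, assuming the SPLITTER fixture used further down; split_text is a hypothetical method name and the input string is reconstructed from the expected chunks:

# Sketch only: split_text is a hypothetical name; the diff only fixes the
# (chunk, TokenType) pairs the tests expect to get back.
CHUNKS = list(SPLITTER.split_text("<SOT>Lorem <SEP>"))
assert CHUNKS == [
    ("<SOT>", TokenType.SPECIAL),   # special tokens remain their own chunks
    ("Lorem", TokenType.BPE),
    (" ", TokenType.BPE),           # whitespace is now a separate BPE chunk
    ("<SEP>", TokenType.SPECIAL),
]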
@@ -139,7 +142,7 @@ class TestSplitter:
         CHUNKS = list(SPLITTER.split_tokens(token_list))
         EXPECTED_CHUNKS = [
             ([100, 101], TokenType.BPE),
-            (1477, TokenType.SPECIAL),
+            ([1477], TokenType.SPECIAL),
         ]
 
         assert len(CHUNKS) == len(EXPECTED_CHUNKS)
@@ -160,7 +163,7 @@ class TestSplitter:
         CHUNKS = list(SPLITTER.split_tokens(token_list))
         EXPECTED_CHUNKS = [
             ([100, 101], TokenType.BPE),
-            (1477, TokenType.SPECIAL),
+            ([1477], TokenType.SPECIAL),
         ]
 
         assert len(CHUNKS) == len(EXPECTED_CHUNKS)
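Both token-id hunks make the same adjustment: a special-token chunk now carries a single-element list ([1477]) rather than a bare id, so every chunk payload is a list of token ids. A sketch of what that uniformity allows, assuming token_list matches the expected chunks shown above; the flattening consumer is illustrative only:

# Sketch only: token_list is assumed from the expected chunks; the
# flattening below is illustrative, not part of the test suite.
token_list = [100, 101, 1477]                       # 1477 assumed to be a special-token id
CHUNKS = list(SPLITTER.split_tokens(token_list))    # -> [([100, 101], BPE), ([1477], SPECIAL)]
flat = [tok for ids, _token_type in CHUNKS for tok in ids]
assert flat == token_list                           # lists make chunks uniformly flattenable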
@@ -174,7 +177,6 @@ class TestSplitter:
         assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
 
 
-
 # Useful to debug weird cases
 if __name__ == "__main__":
     TestSplitter().test_split_trailing_text()
@@ -13,7 +13,7 @@ class TestTokeNano:
         VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
         # EXPECTED = [258]
 
-        TOKE_NANO = TokeNanoCore(VOCABULARY)
+        TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])
 
         ENCODED = TOKE_NANO.encode(TEXT)
         DECODED = TOKE_NANO.decode(ENCODED)
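The last hunk reflects the new TokeNanoCore constructor, which takes the special tokens as an explicit second argument. A round-trip sketch under that signature, reusing the vocabulary from the hunk; the sample TEXT and the final equality assertion are assumptions, not shown in the diff:

# Sketch only: the two-argument constructor follows the diff; TEXT and the
# round-trip assertion are assumptions for illustration.
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])   # special tokens passed explicitly

TEXT = "<SOT>abababab<EOT>"         # hypothetical sample input
ENCODED = TOKE_NANO.encode(TEXT)
DECODED = TOKE_NANO.decode(ENCODED)
assert DECODED == TEXT              # assumed lossless round trip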