test files updated

This commit is contained in:
GassiGiuseppe 2025-10-03 01:04:47 +02:00
parent 070dc1b744
commit 09f7b39512
3 changed files with 72 additions and 1 deletions

View File

@ -70,4 +70,5 @@ class TestBPE:
# Useful to debug weird cases
if __name__ == "__main__":
TestBPE().test_bpe_decoding_simple()
# TestBPE().test_bpe_decoding_simple()
TestBPE().test_bpe_encoding_simple()

View File

@ -45,6 +45,7 @@ class TestSplitter:
("<SEP>", TokenType.SPECIAL),
("m d", TokenType.BPE),
("<SEP>", TokenType.SPECIAL),
#("olor", TokenType.BPE)
]
CHUNKS = list(SPLITTER.split_text(TEXT))
@ -129,3 +130,51 @@ class TestSplitter:
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_token_decode_simple(self):
# to test the token split into special and bpe
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
token_list = [100,101,1477]
CHUNKS = list(SPLITTER.split_tokens(token_list))
EXPECTED_CHUNKS = [
([100,101], TokenType.BPE),
(1477, TokenType.SPECIAL),
]
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
def test_split_token_decode_simple_malformed(self):
# to test the token split into special and bpe
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
token_list = [100,101,1477,100]
CHUNKS = list(SPLITTER.split_tokens(token_list))
EXPECTED_CHUNKS = [
([100,101], TokenType.BPE),
(1477, TokenType.SPECIAL),
]
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
# Useful to debug weird cases
if __name__ == "__main__":
TestSplitter().test_split_trailing_text()

View File

@ -0,0 +1,21 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TestTokeNano:
def test_decode_encode_simple(self):
TEXT = "<SOT>abababab<EOT>"
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
# EXPECTED = [258]
TOKE_NANO = TokeNanoCore(VOCABULARY)
ENCODED = TOKE_NANO.encode(TEXT)
DECODED = TOKE_NANO.decode(ENCODED)
assert TEXT == DECODED