Merge branch 'dev' into dev.embedder

This commit is contained in:
Christian Risi
2025-10-04 19:04:03 +02:00
5 changed files with 79 additions and 8 deletions

View File

@@ -36,6 +36,42 @@ class TestTrainBPE:
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
def test_bpe_train_encoding_and_decoding(self):
SPECIAL_LIST = ["<ABS>", "<SOTL>"]
TRAINER = BPE.NanoSocraTrainerPool(
int(32E3),
SPECIAL_LIST
)
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
FILE = open(TEXT_PATH)
TEXT = FILE.read()
FILE.close()
EXPECTED = TEXT
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
BPE_ENCODER = TRAINER.trainBPE(
TEXT_PATH,
CACHE_DIR_PATH
)
VOCABULARY = BPE_ENCODER.vocabulary
TOKENANO = BPE.TokeNanoCore(VOCABULARY,SPECIAL_LIST)
ENCODED = TOKENANO.encode(TEXT)
DECODED = TOKENANO.decode(ENCODED)
assert len(DECODED) == len(EXPECTED)
for decoded, expected in zip(DECODED, EXPECTED):
assert decoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
TestTrainBPE().test_bpe_train_encoding_simple()
# TestTrainBPE().test_bpe_train_encoding_simple()
TestTrainBPE().test_bpe_train_encoding_and_decoding()

View File

@@ -0,0 +1 @@
<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>