Corrected test to reflect changes in NanoSocratesBPE

This commit is contained in:
Christian Risi 2025-10-02 09:33:47 +02:00
parent aa765b4555
commit eadba1fb82

View File

@ -29,7 +29,7 @@ class TestBPE:
def test_bpe_decoding_simple(self): def test_bpe_decoding_simple(self):
INPUT = 258 INPUT = [258]
# ab = 256 # ab = 256
# 256, 256 = 257 # 256, 256 = 257
@ -47,6 +47,27 @@ class TestBPE:
for encoded, expected in zip(DECODED, EXPECTED): for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected assert encoded == expected
def test_bpe_decoding_edge_1(self):
    """Decode a nested merge token followed by a plain character token.

    The input is the id 258 followed by ``ord("c")``, which is not one of
    the merged ids in the vocabulary. The test expects 258 to unfold
    through the chained merge rules into ``"abababab"`` and the trailing
    id to come back as ``"c"``.
    """
    # Merge rules, each one built on top of the previous:
    #   (a, b)     -> 256
    #   (256, 256) -> 257
    #   (257, 257) -> 258
    merges = {
        (ord("a"), ord("b")): 256,
        (256, 256): 257,
        (257, 257): 258,
    }
    token_ids = [258, ord("c")]
    wanted = "ababababc"

    result = BPE.NanoSocratesBPE(merges).decode(token_ids)

    # Compare length first, then element by element, so the check does not
    # depend on the concrete sequence type returned by decode().
    assert len(result) == len(wanted)
    assert all(got == want for got, want in zip(result, wanted))
# Useful to debug weird cases # Useful to debug weird cases
if __name__ == "__main__": if __name__ == "__main__":
TestBPE().test_bpe_decoding_simple() TestBPE().test_bpe_decoding_simple()