Fixed bug in UTF-8 conversion

This commit is contained in:
Christian Risi
2025-09-30 23:58:31 +02:00
parent ccacea18d8
commit 89a0a1f4bb
2 changed files with 23 additions and 4 deletions

View File

@@ -90,7 +90,7 @@ class NanoSocratesBPE(Encoder):
def encode(self, piece: str) -> list[int]:
current_piece = list(map(ord, piece))
current_piece = list(piece.encode("utf-8"))
new_piece = self.__round_encode(current_piece)
while len(current_piece) != len(new_piece):
@@ -128,7 +128,7 @@ class NanoSocratesBPE(Encoder):
return NEW_PIECE
# TODO: decode
# TODO: Remake decode to take a list of token IDs
def decode(self, token_id: int) -> str:
token_stack: list[int] = [token_id]