Fixed UTF-8 conversion bug: encode now works on the UTF-8 bytes of the piece instead of its Unicode code points
This commit is contained in:
@@ -90,7 +90,7 @@ class NanoSocratesBPE(Encoder):
|
||||
|
||||
def encode(self, piece: str) -> list[int]:
|
||||
|
||||
current_piece = list(map(ord, piece))
|
||||
current_piece = list(piece.encode("utf-8"))
|
||||
new_piece = self.__round_encode(current_piece)
|
||||
|
||||
while len(current_piece) != len(new_piece):
|
||||
@@ -128,7 +128,7 @@ class NanoSocratesBPE(Encoder):
|
||||
|
||||
return NEW_PIECE
|
||||
|
||||
# TODO: decode
|
||||
# TODO: Remake decode to take a list of token IDs
|
||||
def decode(self, token_id: int) -> str:
|
||||
|
||||
token_stack: list[int] = [token_id]
|
||||
|
||||
Reference in New Issue
Block a user