Added a new method to encode from a list of tokens

This commit is contained in:
Christian Risi 2025-10-02 08:48:13 +02:00
parent 3fe4e45ceb
commit 0975c19e69

View File

@ -52,7 +52,7 @@ class NanoSocratesBPE(Encoder):
last_batch: bool
):
ENCODED_CHUNK = self.__round_encode(chunk_data)
ENCODED_CHUNK = self.encode_intermediate(chunk_data)
DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
for i in range(0, DATA_LEN_BEFORE_LAST):
@ -99,6 +99,17 @@ class NanoSocratesBPE(Encoder):
return current_piece
def encode_intermediate(self, piece: list[int]):
    """Run encoding rounds on a token list until its length stops changing.

    Each iteration hands the current list to ``__round_encode`` and compares
    the length of the result with the length of what was passed in.  As soon
    as a round returns a list of the same length, iteration stops.

    Args:
        piece: list of integer tokens to encode.

    Returns:
        The list that was fed into the final (length-preserving) round.  If
        the very first round already preserves the length, this is the
        ``piece`` object itself rather than a copy.
    """
    settled = piece
    while True:
        candidate = self.__round_encode(settled)
        if len(candidate) == len(settled):
            # An unchanged length is treated as "no further progress"; the
            # input of this round is returned, not the round's output.
            return settled
        settled = candidate
def __round_encode(self, piece: list[int]):
if len(piece) == 1: