added new method to encode from list of tokens
This commit is contained in:
parent
3fe4e45ceb
commit
0975c19e69
@ -52,7 +52,7 @@ class NanoSocratesBPE(Encoder):
|
||||
last_batch: bool
|
||||
):
|
||||
|
||||
ENCODED_CHUNK = self.__round_encode(chunk_data)
|
||||
ENCODED_CHUNK = self.encode_intermediate(chunk_data)
|
||||
DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
|
||||
|
||||
for i in range(0, DATA_LEN_BEFORE_LAST):
|
||||
@ -99,6 +99,17 @@ class NanoSocratesBPE(Encoder):
|
||||
|
||||
return current_piece
|
||||
|
||||
def encode_intermediate(self, piece: list[int]):
    """Run encoding rounds on ``piece`` until a round leaves its length unchanged.

    Every round is delegated to ``self.__round_encode``. As soon as a round
    produces an output whose length equals the length of its input, the
    input of that last round is returned.

    Args:
        piece: Token ids to encode.

    Returns:
        The first list for which one more round yields no change in length
        (``piece`` itself when the very first round already leaves the
        length unchanged).
    """
    stable = piece

    while True:
        attempt = self.__round_encode(stable)

        # Same length before and after the round: treat this as the fixed
        # point and hand back the round's input, not its output.
        if len(attempt) == len(stable):
            return stable

        stable = attempt
|
||||
|
||||
|
||||
def __round_encode(self, piece: list[int]):
|
||||
|
||||
if len(piece) == 1:
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user