WIP training Batching
This commit is contained in:
@@ -6,6 +6,7 @@ from ..Classes import NanoSocratesSpecial
|
||||
|
||||
from ..Utils import special_regex_maker
|
||||
from ..Enums import TokenType
|
||||
from ..Enums import SpecialToken
|
||||
|
||||
|
||||
class TokeNanoCore:
|
||||
@@ -44,6 +45,27 @@ class TokeNanoCore:
|
||||
output.extend(self.__bpe_encoder.encode(piece))
|
||||
|
||||
return output
|
||||
|
||||
|
||||
|
||||
def encode_incomplete_string(self, corpus: str) -> list[int]:
    """
    Encode a string that does not end with a special token.

    The CORPUS_END special token is appended to the text before it is
    split, each piece is encoded with the encoder matching its token
    type, and the last id of the result is dropped before returning.
    NOTE(review): dropping one id presumably removes the appended
    CORPUS_END marker -- this assumes it encodes to a single id; confirm
    against the special encoder.
    """
    terminated = corpus + SpecialToken.CORPUS_END.value
    token_ids: list[int] = []

    for fragment, kind in self.__splitter.split_text(terminated):
        # Pick the encoder for this piece; other token types are ignored.
        if kind == TokenType.SPECIAL:
            encoder = self.__special_encoder
        elif kind == TokenType.BPE:
            encoder = self.__bpe_encoder
        else:
            continue

        token_ids.extend(encoder.encode(fragment))

    # Drop the trailing id that follows from the appended end marker.
    return token_ids[:-1]
|
||||
|
||||
|
||||
|
||||
def decode(self, corpus: list[int]) -> str:
|
||||
output_str = ""
|
||||
|
||||
Reference in New Issue
Block a user