WIP training Batching

This commit is contained in:
GassiGiuseppe
2025-10-07 17:41:53 +02:00
parent 490edcfd53
commit b4ee8362a2
2 changed files with 60 additions and 10 deletions

View File

@@ -6,6 +6,7 @@ from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType
from ..Enums import SpecialToken
class TokeNanoCore:
@@ -44,6 +45,27 @@ class TokeNanoCore:
output.extend(self.__bpe_encoder.encode(piece))
return output
def encode_incomplete_string(self, corpus: str) -> list[int]:
    """
    Encode a string that does not end with a special token.

    The CORPUS_END special token is appended to the text before it is
    split, each piece is encoded with the special or the BPE encoder
    according to its token type, and the last id of the resulting
    sequence is dropped before returning.

    Args:
        corpus: text to encode.

    Returns:
        The encoded ids, without the final one.
    """
    terminated = corpus + SpecialToken.CORPUS_END.value
    token_ids: list[int] = []

    for chunk, kind in self.__splitter.split_text(terminated):
        if kind == TokenType.SPECIAL:
            token_ids += self.__special_encoder.encode(chunk)
        elif kind == TokenType.BPE:
            token_ids += self.__bpe_encoder.encode(chunk)

    # NOTE(review): dropping a single trailing id presumably removes the
    # appended CORPUS_END; this assumes it encodes to exactly one id - confirm.
    return token_ids[:-1]
def decode(self, corpus: list[int]) -> str:
output_str = ""