Fix bugs and correct splitting semantics

This commit is contained in:
Christian Risi
2025-10-03 13:26:58 +02:00
parent 6b9cb7cd35
commit c5c0c61f79
5 changed files with 134 additions and 129 deletions

View File

@@ -31,7 +31,8 @@ class NanoSocratesSplitter:
bpe_end = special_token_start
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
if BPE_TOKEN_TEXT != "":
yield (BPE_TOKEN_TEXT, TokenType.BPE)
for WORD in self.__split_words(BPE_TOKEN_TEXT):
yield (WORD, TokenType.BPE)
# FIND SPECIAL TOKEN
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
@@ -60,6 +61,27 @@ class NanoSocratesSplitter:
# eof = len(corpus)
# yield(eof,eof)
def __split_words(self, bpe_piece: str) -> Generator[str]:
    """Split a run of BPE text into words, cutting before each space.

    Every space begins a new word, so each word after the first keeps
    its leading space (GPT-style pre-tokenisation: ``"ab cd"`` yields
    ``"ab"`` then ``" cd"``).

    Bug fixes vs. the previous version: a *leading* space no longer
    yields a spurious empty string, and an empty ``bpe_piece`` yields
    nothing instead of ``""``.

    Args:
        bpe_piece: non-special-token text extracted from the corpus.

    Yields:
        The non-empty word substrings of ``bpe_piece``, in order.
    """
    bound_start = 0
    for index, character in enumerate(bpe_piece):
        if character != " ":
            continue
        # Cut *before* the space so the space travels with the next word.
        # index == bound_start only for a leading space: skip the empty cut.
        if index > bound_start:
            yield bpe_piece[bound_start:index]
        bound_start = index
    # Tail word (or the whole piece when it contains no space).
    if bound_start < len(bpe_piece):
        yield bpe_piece[bound_start:]
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
not_special_token_list: list[int] = []