Fix bugs and incorrect semantics
This commit is contained in:
@@ -31,7 +31,8 @@ class NanoSocratesSplitter:
|
||||
bpe_end = special_token_start
|
||||
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
|
||||
if BPE_TOKEN_TEXT != "":
|
||||
yield (BPE_TOKEN_TEXT, TokenType.BPE)
|
||||
for WORD in self.__split_words(BPE_TOKEN_TEXT):
|
||||
yield (WORD, TokenType.BPE)
|
||||
|
||||
# FIND SPECIAL TOKEN
|
||||
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
|
||||
@@ -60,6 +61,27 @@ class NanoSocratesSplitter:
|
||||
# eof = len(corpus)
|
||||
# yield(eof,eof)
|
||||
|
||||
def __split_words(self, bpe_piece: str) -> Generator[str]:
    """Split *bpe_piece* at every space character.

    The cut is made *before* each space, so every piece after the
    first keeps its leading space (e.g. "a b" -> "a", " b").
    Always yields at least one piece; an empty input yields "".
    """
    piece_start = 0
    for position, character in enumerate(bpe_piece):
        if character != " ":
            continue
        # Emit everything up to (but excluding) this space.
        yield bpe_piece[piece_start:position]
        # The next piece begins at the space itself.
        piece_start = position
    # Tail piece: from the last cut through the end of the string.
    yield bpe_piece[piece_start:]
|
||||
|
||||
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
|
||||
|
||||
not_special_token_list: list[int] = []
|
||||
|
||||
Reference in New Issue
Block a user