Fixed a bug where a token (int) was yielded instead of a list of int
parent 845d645348
commit e8894504c6
@@ -4,13 +4,10 @@ from typing import Generator
 from ..Enums import TokenType
 
-
 
 class NanoSocratesSplitter:
 
     def __init__(
-        self,
-        special_token_regex: re.Pattern,
-        max_bpe_token_id: int = 255
+        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
     ) -> None:
         # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
@@ -45,7 +42,6 @@ class NanoSocratesSplitter:
         # it will used in the next interaction
         bpe_start = special_token_end
 
-
     def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
         """
         Find each time the start and end (not included) of the special token
@@ -64,7 +60,6 @@ class NanoSocratesSplitter:
         # eof = len(corpus)
         # yield(eof,eof)
 
-
     def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
 
         not_special_token_list: list[int] = []
@@ -75,8 +70,7 @@ class NanoSocratesSplitter:
                 yield (not_special_token_list, TokenType.BPE)
                 not_special_token_list = []
 
-                yield (token, TokenType.SPECIAL)
+                yield ([token], TokenType.SPECIAL)
                 continue
 
             not_special_token_list.append(token)
-
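For context, the sketch below is a minimal, self-contained reconstruction of the fixed split_tokens behaviour, not the project's actual implementation: the simplified TokenType enum, the special-token test (id greater than max_bpe_token_id), and the final flush of a trailing BPE run are assumptions inferred from the diff.

from enum import Enum, auto
from typing import Generator


class TokenType(Enum):
    # simplified stand-in for ..Enums.TokenType
    BPE = auto()
    SPECIAL = auto()


def split_tokens(
    corpus: list[int], max_bpe_token_id: int = 255
) -> Generator[tuple[list[int], TokenType], None, None]:
    """Yield (list[int], TokenType) pairs; after the fix every item is a list, never a bare int."""
    not_special_token_list: list[int] = []

    for token in corpus:
        # assumption: ids above the BPE range denote special tokens
        if token > max_bpe_token_id:
            if not_special_token_list:
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []

            # the fix: wrap the single special token id in a list
            yield ([token], TokenType.SPECIAL)
            continue

        not_special_token_list.append(token)

    if not_special_token_list:
        yield (not_special_token_list, TokenType.BPE)


# Both tuple variants now carry a list[int], so consumers can treat them uniformly:
assert list(split_tokens([65, 66, 300, 67])) == [
    ([65, 66], TokenType.BPE),
    ([300], TokenType.SPECIAL),
    ([67], TokenType.BPE),
]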