Fixed a bug where a token (int) was yielded instead of a list of int

Christian Risi 2025-10-03 11:44:44 +02:00
parent 845d645348
commit e8894504c6


@@ -4,20 +4,17 @@ from typing import Generator
 from ..Enums import TokenType


 class NanoSocratesSplitter:
     def __init__(
-        self,
-        special_token_regex: re.Pattern,
-        max_bpe_token_id: int = 255
+        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
     ) -> None:
         # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
-        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
+        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
-        """ Split a text using a regex given
+        """Split a text using a regex given

         Args:
             corpus (str): all the corpus string to split

         Yields:
@@ -45,7 +42,6 @@ class NanoSocratesSplitter:
             # it will used in the next interaction
             bpe_start = special_token_end

-
     def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
         """
         Find each time the start and end (not included) of the special token
@@ -64,10 +60,9 @@ class NanoSocratesSplitter:
         # eof = len(corpus)
         # yield(eof,eof)

-    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
-        not_special_token_list : list[int]= []
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
+        not_special_token_list: list[int] = []
         for token in corpus:

             if token > self.__max_bpe_token_id:
@@ -75,8 +70,7 @@ class NanoSocratesSplitter:
                 yield (not_special_token_list, TokenType.BPE)
                 not_special_token_list = []

-                yield (token, TokenType.SPECIAL)
+                yield ([token], TokenType.SPECIAL)
                 continue

             not_special_token_list.append(token)
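
For context, here is a minimal, self-contained sketch of the fixed split_tokens behaviour. It is an illustration, not the project's actual module: the TokenType enum, the emptiness guard on the BPE buffer, and the trailing flush after the loop are assumptions added so the snippet runs standalone; only the [token] wrapping on the SPECIAL branch is the change made by this commit.

# Minimal sketch, assuming a two-member TokenType enum and the 255 cutoff
# shown in the diff; the corpus values below are made up for illustration.
from enum import Enum, auto
from typing import Generator


class TokenType(Enum):
    BPE = auto()
    SPECIAL = auto()


def split_tokens(
    corpus: list[int], max_bpe_token_id: int = 255
) -> Generator[tuple[list[int], TokenType], None, None]:
    not_special_token_list: list[int] = []
    for token in corpus:
        if token > max_bpe_token_id:
            if not_special_token_list:  # assumed guard: skip empty BPE runs
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []
            # The fix: wrap the special token in a list so every yielded
            # payload is a list[int], never a bare int.
            yield ([token], TokenType.SPECIAL)
            continue
        not_special_token_list.append(token)
    if not_special_token_list:  # assumed trailing flush of the last BPE run
        yield (not_special_token_list, TokenType.BPE)


for payload, kind in split_tokens([10, 20, 300, 30]):
    # Before the fix, the SPECIAL payload was the bare int 300, so consumers
    # expecting a uniform list[int] (e.g. calling len() or extending another
    # list) broke on special tokens.
    print(kind.name, payload)
# BPE [10, 20]
# SPECIAL [300]
# BPE [30]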