Updated NanoSocratesSplitter to also split tokens in the decode phase

GassiGiuseppe 2025-10-03 01:00:36 +02:00
parent a5b8692a77
commit 8121c75a09


@@ -1,40 +1,82 @@
import re
from collections import deque
from typing import Generator

from ..Enums import TokenType


class NanoSocratesSplitter:

    def __init__(
        self,
        special_token_regex: re.Pattern,
        max_bpe_token_id: int = 255
    ) -> None:
        # note: the regex is expected to arrive already compiled
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding
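
    # Hypothetical construction sketch (not from this commit; the pattern and
    # the byte-level base vocabulary of 256 IDs are illustrative assumptions):
    #   splitter = NanoSocratesSplitter(
    #       re.compile(r"<\|startoftext\|>|<\|endoftext\|>"),
    #       max_bpe_token_id=255,
    #   )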
    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text using the given regex.

        Args:
            corpus (str): the whole corpus string to split

        Yields:
            Generator[tuple[str, TokenType]]: yields one piece of the split
            text at a time: the substring and its TokenType. TokenType states
            whether the substring is for the BPE or a special token
            [BPE, SPECIAL].
        """
        bpe_start = 0
        for special_token_start, special_token_end in self.__find_boundaries(corpus):
            # FIND BPE
            bpe_end = special_token_start
            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
            if BPE_TOKEN_TEXT != "":
                yield (BPE_TOKEN_TEXT, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # save the new BPE start point; it will be used in the next iteration
            bpe_start = special_token_end
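
    # Hypothetical walk-through (assuming the pattern sketched above):
    # split_text("Hello<|endoftext|>World") yields
    #   ("Hello", TokenType.BPE)
    #   ("<|endoftext|>", TokenType.SPECIAL)
    #   ("World", TokenType.BPE)
    # The trailing "World" is only emitted because __find_boundaries below
    # closes with an end-of-corpus boundary.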
    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """Find the start and end (end excluded) of each special token.

        Args:
            corpus (str): the string where the special tokens will be searched

        Yields:
            Generator[tuple[int, int]]: note that the end index is excluded
        """
        for match in self.__special_token_regex.finditer(corpus):
            start = match.start()
            end = match.end()
            yield (start, end)

        # make the last boundary the end of the corpus, so that split_text
        # also emits any BPE text that follows the final special token
        eof = len(corpus)
        yield (eof, eof)
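
    # Hypothetical trace (same assumed input): finditer reports the special
    # token at (5, 18) in "Hello<|endoftext|>World", and the final boundary
    # (23, 23) marks the end of the 23-character corpus.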
    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
        """Split a token-ID sequence: IDs above max_bpe_token_id are yielded
        alone as SPECIAL; contiguous smaller IDs are grouped as one BPE run."""
        not_special_token_list: list[int] = []
        for token in corpus:
            if token > self.__max_bpe_token_id:
                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []
                # wrap the single ID in a list to match the declared yield type
                yield ([token], TokenType.SPECIAL)
                continue
            not_special_token_list.append(token)
        # flush the trailing BPE run, if any
        if len(not_special_token_list) > 0:
            yield (not_special_token_list, TokenType.BPE)
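
For reference, a minimal end-to-end sketch of how the encode path (split_text) and the new decode path (split_tokens) might be exercised. The pattern, the sample token IDs, and the flat import paths are illustrative assumptions, not values taken from this repository:

import re
from nanosocrates.Enums import TokenType                            # hypothetical import path
from nanosocrates.NanoSocratesSplitter import NanoSocratesSplitter  # hypothetical import path

splitter = NanoSocratesSplitter(
    re.compile(r"<\|endoftext\|>"),  # assumed special-token pattern
    max_bpe_token_id=255,            # IDs 0..255 are base BPE tokens
)

# encode phase: split raw text around special tokens before BPE merging
for piece, kind in splitter.split_text("Hello<|endoftext|>World"):
    print(kind, repr(piece))
# TokenType.BPE 'Hello'
# TokenType.SPECIAL '<|endoftext|>'
# TokenType.BPE 'World'

# decode phase (added by this commit): group token IDs before detokenizing,
# assuming 256 is the ID assigned to <|endoftext|>
for run, kind in splitter.split_tokens([72, 101, 108, 256, 87, 111]):
    print(kind, run)
# TokenType.BPE [72, 101, 108]
# TokenType.SPECIAL [256]
# TokenType.BPE [87, 111]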