diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index ccca300..399fa77 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -1,40 +1,82 @@
 import re
 from typing import Generator
 from ..Enums import TokenType
+
 class NanoSocratesSplitter:
 
     def __init__(
         self,
-        special_token_regex: re.Pattern
+        special_token_regex: re.Pattern,
+        max_bpe_token_id: int = 255
     ) -> None:
+        # note: the regex must already be compiled (re.Pattern)
         self.__special_token_regex = special_token_regex
+        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding
 
     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
+        """Split a text using the given special-token regex.
+
+        Args:
+            corpus (str): the full corpus string to split
+
+        Yields:
+            Generator[tuple[str, TokenType]]: each piece of the split text as a
+            (string, TokenType) pair; TokenType marks whether the piece belongs
+            to the BPE stream or is a special token [BPE, SPECIAL]
+        """
         bpe_start = 0
-        bpe_end = len(corpus)
 
-        for bound_start, bound_end in self.__find_boundaries(corpus):
+        for special_token_start, special_token_end in self.__find_boundaries(corpus):
 
-            bpe_end = bound_start
+            # find the BPE chunk before this special token
+            bpe_end = special_token_start
             BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
-
             if BPE_TOKEN_TEXT != "":
                 yield (BPE_TOKEN_TEXT, TokenType.BPE)
 
-            bpe_start = bound_end
-            SPECIAL_TOKEN_TEXT = corpus[bound_start:bound_end]
-
+            # find the special token itself
+            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
             if SPECIAL_TOKEN_TEXT != "":
                 yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
 
-    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+            # save the new BPE start point; it is used in the next iteration
+            bpe_start = special_token_end
+
+    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+        """Find the start and end (exclusive) of each special token.
+
+        Args:
+            corpus (str): the string to search for special tokens
+
+        Yields:
+            Generator[tuple[int, int]]: (start, end) offsets; end is exclusive
+        """
         for match in self.__special_token_regex.finditer(corpus):
             start = match.start()
             end = match.end()
             yield (start, end)
+
+        # make the last boundary be the end of the corpus, so that
+        # split_text flushes the trailing BPE text after the last special token
+        eof = len(corpus)
+        yield (eof, eof)
+
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
+        """Split a token-id sequence into BPE runs and special tokens.
+
+        Ids above max_bpe_token_id are treated as special tokens.
+        """
+        not_special_token_list: list[int] = []
+        for token in corpus:
+            if token > self.__max_bpe_token_id:
+
+                if len(not_special_token_list) > 0:
+                    yield (not_special_token_list, TokenType.BPE)
+                    not_special_token_list = []
+
+                # wrap the id in a list to match the declared yield type
+                yield ([token], TokenType.SPECIAL)
+                continue
+
+            not_special_token_list.append(token)
+
+        # flush any trailing BPE run
+        if len(not_special_token_list) > 0:
+            yield (not_special_token_list, TokenType.BPE)
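
Reviewer note: a minimal usage sketch of the patched API, assuming TokenType exposes BPE and SPECIAL members; the special-token pattern, import paths, and token ids below are illustrative, not part of this patch.

    import re

    from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter  # assumed path
    from Project_Model.Libs.BPE.Enums import TokenType  # assumed path

    # hypothetical special-token pattern; the real one is supplied by the caller
    splitter = NanoSocratesSplitter(re.compile(r"<\|[a-z_]+\|>"), max_bpe_token_id=255)

    # text path: BPE chunks and special tokens alternate, and the
    # trailing chunk after the last special token is now flushed
    print(list(splitter.split_text("Hello<|eot|>World")))
    # [('Hello', TokenType.BPE), ('<|eot|>', TokenType.SPECIAL), ('World', TokenType.BPE)]

    # id path: ids above max_bpe_token_id are yielded as single-element SPECIAL runs
    print(list(splitter.split_tokens([72, 101, 300, 33])))
    # [([72, 101], TokenType.BPE), ([300], TokenType.SPECIAL), ([33], TokenType.BPE)]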