# NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py

import re
from collections import deque
from typing import Generator
from ..Enums import TokenType


class NanoSocratesSplitter:
    """Split raw text or token-id sequences into BPE and special pieces.

    Text is split around the matches of an already-compiled special-token
    regex; token-id sequences are split by comparing every id against the
    largest id reserved for BPE tokens.
    """

    def __init__(
        self,
        special_token_regex: re.Pattern,
        max_bpe_token_id: int = 255
    ) -> None:
        """Store the splitting configuration.

        Args:
            special_token_regex (re.Pattern): pattern matching the special
                tokens. It must already be compiled: it is used as-is.
            max_bpe_token_id (int): largest id that `split_tokens` treats as
                a BPE token; every greater id is considered a special token.
        """
        # attention: the regex is expected to be already compiled
        self.__special_token_regex = special_token_regex
        # used for decoding (see split_tokens)
        self.__max_bpe_token_id: int = max_bpe_token_id

    def split_text(
        self, corpus: str
    ) -> Generator[tuple[str, TokenType], None, None]:
        """Split a text around the special tokens found by the regex.

        For every special-token match, the non-empty text lying between the
        previous match (or the start of the corpus) and this match is yielded
        as a BPE piece, then the matched text itself is yielded as a SPECIAL
        piece. Empty pieces are skipped.

        NOTE(review): text located after the last special-token match is not
        yielded, and nothing at all is yielded when the regex never matches.
        The disabled end-of-corpus boundary in `__find_boundaries` suggests
        this is intentional -- confirm against the callers.

        Args:
            corpus (str): the whole string to split.

        Yields:
            tuple[str, TokenType]: a piece of the text together with its
            TokenType, telling whether the piece is meant for the BPE or is
            a special token [BPE, SPECIAL].
        """
        bpe_start = 0

        for special_start, special_end in self.__find_boundaries(corpus):

            # everything between the previous special token and this one
            bpe_text = corpus[bpe_start:special_start]
            if bpe_text:
                yield (bpe_text, TokenType.BPE)

            # the special token itself
            special_text = corpus[special_start:special_end]
            if special_text:
                yield (special_text, TokenType.SPECIAL)

            # the next BPE piece starts right after this special token
            bpe_start = special_end

    def __find_boundaries(
        self, corpus: str
    ) -> Generator[tuple[int, int], None, None]:
        """Locate every special token inside the corpus.

        Args:
            corpus (str): the string where the special tokens are searched.

        Yields:
            tuple[int, int]: start and end index of each regex match, in
            order of appearance. The end index is not included.
        """
        for match in self.__special_token_regex.finditer(corpus):
            # span() is exactly (match.start(), match.end())
            yield match.span()

        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)

    def split_tokens(
        self, corpus: list[int]
    ) -> Generator[tuple[list[int] | int, TokenType], None, None]:
        """Split a sequence of token ids into BPE runs and special tokens.

        Ids are scanned in order. Consecutive ids not greater than the
        configured maximum BPE id are accumulated; when an id greater than
        that maximum is met, the accumulated run (if not empty) is yielded as
        a BPE piece and the id itself is then yielded as a SPECIAL piece.

        NOTE(review): a run of BPE ids that is not followed by a special
        token (i.e. at the end of the corpus) is never yielded by this
        method -- confirm that callers always end sequences with a special
        token.

        Args:
            corpus (list[int]): the token ids to split.

        Yields:
            tuple[list[int] | int, TokenType]: either a list of BPE ids
            tagged TokenType.BPE, or a single special id (a bare int, not a
            list) tagged TokenType.SPECIAL.
        """
        pending_bpe_tokens: list[int] = []
        for token in corpus:
            if token > self.__max_bpe_token_id:

                # flush the BPE run collected before this special token
                if pending_bpe_tokens:
                    yield (pending_bpe_tokens, TokenType.BPE)
                    # rebind instead of clear(): the yielded list now
                    # belongs to the consumer and must not be mutated
                    pending_bpe_tokens = []

                yield (token, TokenType.SPECIAL)
                continue

            pending_bpe_tokens.append(token)