NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py

import re
from collections import deque
from typing import Generator
from ..Enums import TokenType


class NanoSocratesSplitter:

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        # attention the regex got already compiled
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text using a regex given
        Args:
            corpus (str): all the corpus string to split
        Yields:
            Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
            TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
        """

        bpe_start = 0
        bpe_end = len(corpus)  # this can be deleted!

        for special_token_start, special_token_end in self.__find_boundaries(corpus):

            # FIND BPE
            bpe_end = special_token_start
            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # now save the new bpe start point
            # it will used in the next interaction
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find each time the start and end (not included) of the special token
        Args:
            corpus (str): the string where the special token will be searched
        Yields:
            Generator[tuple[int, int]]: Note the end is not included
        """
        for match in self.__special_token_regex.finditer(corpus):
            start = match.start()
            end = match.end()

            yield (start, end)

        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)

    def __split_words(self, bpe_piece: str) -> Generator[str]:

        END_OF_STRING = len(bpe_piece)
        bound_start = 0
        bound_end = END_OF_STRING + 1
        for i in range(0, END_OF_STRING):

            CANDIDATE_CHAR = bpe_piece[i]

            if CANDIDATE_CHAR != " ":
                continue

            bound_end = i

            yield bpe_piece[bound_start:bound_end]

            bound_start = bound_end
            bound_end = END_OF_STRING + 1

        yield bpe_piece[bound_start:bound_end]

    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

        not_special_token_list: list[int] = []
        for token in corpus:
            if token > self.__max_bpe_token_id:

                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`import re`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`from collections import deque`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`from typing import Generator`
			`from ..Enums import TokenType`


			`class NanoSocratesSplitter:`

			`def __init__(`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`) -> None:`
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`# attention the regex got already compiled`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`self.__special_token_regex = special_token_regex`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
			`def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`"""Split a text using a regex given`
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`Args:`
			`corpus (str): all the corpus string to split`
			`Yields:`
			`Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n`
			`TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]`
			`"""`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
			`bpe_start = 0`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`bpe_end = len(corpus) # this can be deleted!`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`for special_token_start, special_token_end in self.__find_boundaries(corpus):`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`# FIND BPE`
			`bpe_end = special_token_start`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]`
			`if BPE_TOKEN_TEXT != "":`
Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`for WORD in self.__split_words(BPE_TOKEN_TEXT):`
			`yield (WORD, TokenType.BPE)`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`# FIND SPECIAL TOKEN`
			`SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`if SPECIAL_TOKEN_TEXT != "":`
			`yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)`

Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`# now save the new bpe start point`
			`# it will used in the next interaction`
			`bpe_start = special_token_end`

			`def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:`
			`"""`
			`Find each time the start and end (not included) of the special token`
			`Args:`
			`corpus (str): the string where the special token will be searched`
			`Yields:`
			`Generator[tuple[int, int]]: Note the end is not included`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`"""`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00			`for match in self.__special_token_regex.finditer(corpus):`
			`start = match.start()`
			`end = match.end()`

			`yield (start, end)`
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`# make the last boundary be the end of corpus`
			`# eof = len(corpus)`
			`# yield(eof,eof)`

Fix of bugs and semantics 2025-10-03 13:26:58 +02:00			`def __split_words(self, bpe_piece: str) -> Generator[str]:`

			`END_OF_STRING = len(bpe_piece)`
			`bound_start = 0`
			`bound_end = END_OF_STRING + 1`
			`for i in range(0, END_OF_STRING):`

			`CANDIDATE_CHAR = bpe_piece[i]`

			`if CANDIDATE_CHAR != " ":`
			`continue`

			`bound_end = i`

			`yield bpe_piece[bound_start:bound_end]`

			`bound_start = bound_end`
			`bound_end = END_OF_STRING + 1`

			`yield bpe_piece[bound_start:bound_end]`

Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:`
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`not_special_token_list: list[int] = []`
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`for token in corpus:`
			`if token > self.__max_bpe_token_id:`

			`if len(not_special_token_list) > 0:`
			`yield (not_special_token_list, TokenType.BPE)`
			`not_special_token_list = []`

Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`yield ([token], TokenType.SPECIAL)`
Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00			`continue`
Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00			`not_special_token_list.append(token)`