41 lines
1.0 KiB
Python
41 lines
1.0 KiB
Python
|
|
import re
|
||
|
|
from typing import Generator
|
||
|
|
from ..Enums import TokenType
|
||
|
|
|
||
|
|
|
||
|
|
class NanoSocratesSplitter:
    """Split raw text into alternating BPE and SPECIAL segments.

    Special tokens are located with a caller-supplied compiled regex;
    every stretch of text between (and around) the matches is emitted
    as ordinary BPE text.
    """

    def __init__(
        self,
        special_token_regex: re.Pattern
    ) -> None:
        """Store the compiled pattern that recognizes special tokens."""
        self.__special_token_regex = special_token_regex

    def split_text(
        self, corpus: str
    ) -> "Generator[tuple[str, TokenType], None, None]":
        """Yield ``(text, TokenType)`` pairs covering *corpus* in order.

        Non-special stretches are tagged ``TokenType.BPE`` and regex
        matches ``TokenType.SPECIAL``; empty segments are skipped.

        BUGFIX: the text after the final special token (or the whole
        corpus when there is no match at all) is now emitted — it was
        previously dropped because the tail segment was never yielded
        after the boundary loop finished.
        """
        bpe_start = 0

        for bound_start, bound_end in self.__find_boundaries(corpus):
            # Plain text preceding this special token.
            bpe_chunk = corpus[bpe_start:bound_start]
            if bpe_chunk != "":
                yield (bpe_chunk, TokenType.BPE)

            # The special token itself.
            special_chunk = corpus[bound_start:bound_end]
            if special_chunk != "":
                yield (special_chunk, TokenType.SPECIAL)

            # Next BPE segment starts right after this match.
            bpe_start = bound_end

        # Trailing plain text after the last match (or the entire
        # corpus when no special token was found).
        tail = corpus[bpe_start:]
        if tail != "":
            yield (tail, TokenType.BPE)

    def __find_boundaries(
        self, corpus: str
    ) -> "Generator[tuple[int, int], None, None]":
        """Yield ``(start, end)`` index pairs for each special-token match."""
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())