NanoSocrates/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
2025-09-28 18:03:16 +02:00

41 lines
1.0 KiB
Python

import re
from typing import Generator
from ..Enums import TokenType
class NanoSocratesSplitter:
    """Split a text into special-token pieces and the plain text between them.

    Special tokens are whatever the compiled pattern supplied at construction
    matches. Every other non-empty stretch of the text is reported as a
    ``TokenType.BPE`` piece.
    """

    def __init__(
        self,
        special_token_regex: re.Pattern[str]
    ) -> None:
        """Store the compiled pattern used to locate special tokens.

        Args:
            special_token_regex: Compiled pattern; each non-empty match found
                in a corpus is yielded as a ``TokenType.SPECIAL`` piece.
        """
        self.__special_token_regex = special_token_regex

    def split_text(
        self, corpus: str
    ) -> Generator[tuple[str, TokenType], None, None]:
        """Yield the pieces of ``corpus`` in order, tagged with their type.

        For every match of the special-token pattern, the non-empty text
        between the previous match (or the start of ``corpus``) and this match
        is yielded as ``TokenType.BPE``, followed by the matched text as
        ``TokenType.SPECIAL``. Once all matches are consumed, any remaining
        text is yielded as a final ``TokenType.BPE`` piece, so a corpus that
        contains no special token at all is yielded whole.

        Args:
            corpus: Text to split.

        Yields:
            ``(text, token_type)`` tuples; ``text`` is never empty.
        """
        bpe_start = 0

        for bound_start, bound_end in self.__find_boundaries(corpus):
            # Plain text sitting between the previous special token and this one.
            bpe_text = corpus[bpe_start:bound_start]
            if bpe_text:
                yield (bpe_text, TokenType.BPE)

            # A pattern able to match the empty string produces zero-width
            # matches; those carry no text and are skipped.
            special_text = corpus[bound_start:bound_end]
            if special_text:
                yield (special_text, TokenType.SPECIAL)

            # The next plain-text stretch starts right after this match.
            bpe_start = bound_end

        # Text after the last match (or the whole corpus when nothing matched)
        # must not be lost.
        trailing_text = corpus[bpe_start:]
        if trailing_text:
            yield (trailing_text, TokenType.BPE)

    def __find_boundaries(
        self, corpus: str
    ) -> Generator[tuple[int, int], None, None]:
        """Yield ``(start, end)`` offsets of each special-token match.

        Offsets come from ``finditer`` on the stored pattern, in the order the
        matches occur in ``corpus``; ``end`` is exclusive.
        """
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())