41 lines
1.0 KiB
Python
41 lines
1.0 KiB
Python
|
|
import re
|
||
|
|
from typing import Generator
|
||
|
|
from ..Enums import TokenType
|
||
|
|
|
||
|
|
|
||
|
|
class NanoSocratesSplitter:
    """Split raw text into alternating BPE and SPECIAL segments.

    Special tokens are located with a caller-supplied compiled regex;
    every stretch of text between (and around) the matches is emitted
    as ordinary BPE text.
    """

    def __init__(
        self,
        special_token_regex: re.Pattern
    ) -> None:
        """Store the compiled pattern that recognizes special tokens."""
        self.__special_token_regex = special_token_regex

    def split_text(
        self, corpus: str
    ) -> "Generator[tuple[str, TokenType], None, None]":
        """Yield ``(text, TokenType)`` pairs covering *corpus* in order.

        Non-special stretches are tagged ``TokenType.BPE`` and regex
        matches ``TokenType.SPECIAL``; empty segments are skipped.

        BUGFIX: the text after the final special token (or the whole
        corpus when there is no match at all) is now emitted — it was
        previously dropped because the tail segment was never yielded
        after the boundary loop finished.
        """
        bpe_start = 0

        for bound_start, bound_end in self.__find_boundaries(corpus):
            # Plain text preceding this special token.
            bpe_chunk = corpus[bpe_start:bound_start]
            if bpe_chunk != "":
                yield (bpe_chunk, TokenType.BPE)

            # The special token itself.
            special_chunk = corpus[bound_start:bound_end]
            if special_chunk != "":
                yield (special_chunk, TokenType.SPECIAL)

            # Next BPE segment starts right after this match.
            bpe_start = bound_end

        # Trailing plain text after the last match (or the entire
        # corpus when no special token was found).
        tail = corpus[bpe_start:]
        if tail != "":
            yield (tail, TokenType.BPE)

    def __find_boundaries(
        self, corpus: str
    ) -> "Generator[tuple[int, int], None, None]":
        """Yield ``(start, end)`` index pairs for each special-token match."""
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())