2025-09-28 18:03:16 +02:00
|
|
|
import re
|
2025-10-03 11:44:44 +02:00
|
|
|
from collections import deque
|
2025-09-28 18:03:16 +02:00
|
|
|
from typing import Generator
|
|
|
|
|
from ..Enums import TokenType
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NanoSocratesSplitter:
|
|
|
|
|
|
|
|
|
|
def __init__(
|
2025-10-03 11:44:44 +02:00
|
|
|
self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
|
2025-09-28 18:03:16 +02:00
|
|
|
) -> None:
|
2025-10-03 01:00:36 +02:00
|
|
|
# attention the regex got already compiled
|
2025-09-28 18:03:16 +02:00
|
|
|
self.__special_token_regex = special_token_regex
|
2025-10-03 11:44:44 +02:00
|
|
|
self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding
|
2025-09-28 18:03:16 +02:00
|
|
|
|
|
|
|
|
def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
|
2025-10-03 11:44:44 +02:00
|
|
|
"""Split a text using a regex given
|
2025-10-03 01:00:36 +02:00
|
|
|
Args:
|
|
|
|
|
corpus (str): all the corpus string to split
|
|
|
|
|
Yields:
|
|
|
|
|
Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
|
|
|
|
|
TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
|
|
|
|
|
"""
|
2025-09-28 18:03:16 +02:00
|
|
|
|
|
|
|
|
bpe_start = 0
|
2025-10-03 11:44:44 +02:00
|
|
|
bpe_end = len(corpus) # this can be deleted!
|
2025-09-28 18:03:16 +02:00
|
|
|
|
2025-10-03 01:00:36 +02:00
|
|
|
for special_token_start, special_token_end in self.__find_boundaries(corpus):
|
2025-09-28 18:03:16 +02:00
|
|
|
|
2025-10-03 01:00:36 +02:00
|
|
|
# FIND BPE
|
|
|
|
|
bpe_end = special_token_start
|
2025-09-28 18:03:16 +02:00
|
|
|
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
|
|
|
|
|
if BPE_TOKEN_TEXT != "":
|
2025-10-03 13:26:58 +02:00
|
|
|
for WORD in self.__split_words(BPE_TOKEN_TEXT):
|
|
|
|
|
yield (WORD, TokenType.BPE)
|
2025-09-28 18:03:16 +02:00
|
|
|
|
2025-10-03 01:00:36 +02:00
|
|
|
# FIND SPECIAL TOKEN
|
|
|
|
|
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
|
2025-09-28 18:03:16 +02:00
|
|
|
if SPECIAL_TOKEN_TEXT != "":
|
|
|
|
|
yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
|
|
|
|
|
|
2025-10-03 01:00:36 +02:00
|
|
|
# now save the new bpe start point
|
|
|
|
|
# it will used in the next interaction
|
|
|
|
|
bpe_start = special_token_end
|
|
|
|
|
|
|
|
|
|
def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
|
|
|
|
|
"""
|
|
|
|
|
Find each time the start and end (not included) of the special token
|
|
|
|
|
Args:
|
|
|
|
|
corpus (str): the string where the special token will be searched
|
|
|
|
|
Yields:
|
|
|
|
|
Generator[tuple[int, int]]: Note the end is not included
|
2025-10-03 11:44:44 +02:00
|
|
|
"""
|
2025-09-28 18:03:16 +02:00
|
|
|
for match in self.__special_token_regex.finditer(corpus):
|
|
|
|
|
start = match.start()
|
|
|
|
|
end = match.end()
|
|
|
|
|
|
|
|
|
|
yield (start, end)
|
2025-10-03 11:44:44 +02:00
|
|
|
|
2025-10-03 01:00:36 +02:00
|
|
|
# make the last boundary be the end of corpus
|
|
|
|
|
# eof = len(corpus)
|
|
|
|
|
# yield(eof,eof)
|
|
|
|
|
|
2025-10-03 13:26:58 +02:00
|
|
|
def __split_words(self, bpe_piece: str) -> Generator[str]:
|
|
|
|
|
|
|
|
|
|
END_OF_STRING = len(bpe_piece)
|
|
|
|
|
bound_start = 0
|
|
|
|
|
bound_end = END_OF_STRING + 1
|
|
|
|
|
for i in range(0, END_OF_STRING):
|
|
|
|
|
|
|
|
|
|
CANDIDATE_CHAR = bpe_piece[i]
|
|
|
|
|
|
|
|
|
|
if CANDIDATE_CHAR != " ":
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
|
|
bound_end = i
|
|
|
|
|
|
|
|
|
|
yield bpe_piece[bound_start:bound_end]
|
|
|
|
|
|
|
|
|
|
bound_start = bound_end
|
|
|
|
|
bound_end = END_OF_STRING + 1
|
|
|
|
|
|
|
|
|
|
yield bpe_piece[bound_start:bound_end]
|
|
|
|
|
|
2025-10-03 11:44:44 +02:00
|
|
|
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
|
2025-10-03 01:00:36 +02:00
|
|
|
|
2025-10-03 11:44:44 +02:00
|
|
|
not_special_token_list: list[int] = []
|
2025-10-03 01:00:36 +02:00
|
|
|
for token in corpus:
|
|
|
|
|
if token > self.__max_bpe_token_id:
|
|
|
|
|
|
|
|
|
|
if len(not_special_token_list) > 0:
|
|
|
|
|
yield (not_special_token_list, TokenType.BPE)
|
|
|
|
|
not_special_token_list = []
|
|
|
|
|
|
2025-10-03 11:44:44 +02:00
|
|
|
yield ([token], TokenType.SPECIAL)
|
2025-10-03 01:00:36 +02:00
|
|
|
continue
|
2025-09-28 18:03:16 +02:00
|
|
|
|
2025-10-03 11:44:44 +02:00
|
|
|
not_special_token_list.append(token)
|