From e8894504c60d648698cf29d2cb72cd7a1a1edebd Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 11:44:44 +0200
Subject: [PATCH] Fixed a bug where a token (int) was yielded instead of a
 list of int

---
 .../Libs/BPE/Classes/NanoSocratesSplitter.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index 399fa77..6e0abc2 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -1,23 +1,20 @@
 import re
-from collections import deque 
+from collections import deque
 from typing import Generator
 
 from ..Enums import TokenType
-
 
 class NanoSocratesSplitter:
     def __init__(
-        self,
-        special_token_regex: re.Pattern,
-        max_bpe_token_id: int = 255
+        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
     ) -> None:
 
         # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
-        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
+        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding
 
     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
-        """ Split a text using a regex given
+        """Split a text using a regex given
         Args:
             corpus (str): all the corpus string to split
         Yields:
@@ -26,7 +23,7 @@ class NanoSocratesSplitter:
         """
 
         bpe_start = 0
-        bpe_end = len(corpus) # this can be deleted!
+        bpe_end = len(corpus)  # this can be deleted!
 
         for special_token_start, special_token_end in self.__find_boundaries(corpus):
 
@@ -45,7 +42,6 @@ class NanoSocratesSplitter:
            # it will used in the next interaction
            bpe_start = special_token_end
 
-
    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find each time the start and end (not included) of the special token
@@ -53,21 +49,20 @@ class NanoSocratesSplitter:
        Args:
            corpus (str): the string where the special token will be searched
        Yields:
            Generator[tuple[int, int]]: Note the end is not included
-
        """
 
        for match in self.__special_token_regex.finditer(corpus):
 
            start = match.start()
            end = match.end()
 
            yield (start, end)
-
 
        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)
 
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
 
-    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
-        not_special_token_list : list[int]= []
+        not_special_token_list: list[int] = []
 
        for token in corpus:
 
            if token > self.__max_bpe_token_id:
@@ -75,8 +70,7 @@ class NanoSocratesSplitter:
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []
 
-                yield (token, TokenType.SPECIAL)
+                yield ([token], TokenType.SPECIAL)
                continue
-
-            not_special_token_list.append(token)
+            not_special_token_list.append(token)
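
Why the fix matters (a minimal standalone sketch, not part of the patch): the
TokenType enum below is a stand-in for the project's ..Enums.TokenType, and
the generator mirrors only the split_tokens() lines visible in the hunks
above. Before the change, the SPECIAL branch yielded a bare int, so any
consumer that treats every yielded chunk as a list[int] (for example,
flattening with list.extend) would raise "TypeError: 'int' object is not
iterable" on the first special token:

    from enum import Enum, auto

    class TokenType(Enum):  # assumption: stand-in for ..Enums.TokenType
        BPE = auto()
        SPECIAL = auto()

    def split_tokens(corpus: list[int], max_bpe_token_id: int = 255):
        """Mirror of the patched method body shown in the hunks above."""
        not_special_token_list: list[int] = []
        for token in corpus:
            if token > max_bpe_token_id:
                # flush the pending BPE run (may be empty, as in the patch)
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []
                yield ([token], TokenType.SPECIAL)  # the fix: [token], not token
                continue
            not_special_token_list.append(token)

    flat: list[int] = []
    for chunk, kind in split_tokens([72, 105, 300]):
        flat.extend(chunk)  # safe only because chunk is now always a list
    print(flat)  # [72, 105, 300]

With the old `yield (token, TokenType.SPECIAL)` the extend() call above would
fail on the special token; wrapping the id as [token] also makes the method
match its declared return type, Generator[tuple[list[int], TokenType]].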