Fixed a bug where a token (int) was yielded instead of a list of int

Christian Risi 2025-10-03 11:44:44 +02:00
parent 845d645348
commit e8894504c6


@@ -4,20 +4,17 @@ from typing import Generator
 from ..Enums import TokenType


 class NanoSocratesSplitter:
     def __init__(
-        self,
-        special_token_regex: re.Pattern,
-        max_bpe_token_id: int = 255
+        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
     ) -> None:
         # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
-        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
+        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
-        """ Split a text using a regex given
+        """Split a text using a regex given

         Args:
             corpus (str): all the corpus string to split

         Yields:
@@ -45,7 +42,6 @@ class NanoSocratesSplitter:
             # it will used in the next interaction
             bpe_start = special_token_end

-
     def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
         """
         Find each time the start and end (not included) of the special token
@@ -64,10 +60,9 @@ class NanoSocratesSplitter:
         # eof = len(corpus)
         # yield(eof,eof)

-    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
-        not_special_token_list : list[int]= []
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
+        not_special_token_list: list[int] = []
         for token in corpus:

             if token > self.__max_bpe_token_id:
@@ -75,8 +70,7 @@ class NanoSocratesSplitter:
                 yield (not_special_token_list, TokenType.BPE)
                 not_special_token_list = []

-                yield (token, TokenType.SPECIAL)
+                yield ([token], TokenType.SPECIAL)
                 continue

             not_special_token_list.append(token)
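
For context, here is a minimal, self-contained sketch of the fixed split_tokens behaviour. It is an illustration, not the project's actual module: the TokenType enum, the emptiness guard on the BPE buffer, and the trailing flush after the loop are assumptions added so the snippet runs standalone; only the [token] wrapping on the SPECIAL branch is the change made by this commit.

# Minimal sketch, assuming a two-member TokenType enum and the 255 cutoff
# shown in the diff; the corpus values below are made up for illustration.
from enum import Enum, auto
from typing import Generator


class TokenType(Enum):
    BPE = auto()
    SPECIAL = auto()


def split_tokens(
    corpus: list[int], max_bpe_token_id: int = 255
) -> Generator[tuple[list[int], TokenType], None, None]:
    not_special_token_list: list[int] = []
    for token in corpus:
        if token > max_bpe_token_id:
            if not_special_token_list:  # assumed guard: skip empty BPE runs
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []
            # The fix: wrap the special token in a list so every yielded
            # payload is a list[int], never a bare int.
            yield ([token], TokenType.SPECIAL)
            continue
        not_special_token_list.append(token)
    if not_special_token_list:  # assumed trailing flush of the last BPE run
        yield (not_special_token_list, TokenType.BPE)


for payload, kind in split_tokens([10, 20, 300, 30]):
    # Before the fix, the SPECIAL payload was the bare int 300, so consumers
    # expecting a uniform list[int] (e.g. calling len() or extending another
    # list) broke on special tokens.
    print(kind.name, payload)
# BPE [10, 20]
# SPECIAL [300]
# BPE [30]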