From e8894504c60d648698cf29d2cb72cd7a1a1edebd Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 11:44:44 +0200
Subject: [PATCH] Fixed a bug where a token (int) was yielded instead of a
 list of int

---
 .../Libs/BPE/Classes/NanoSocratesSplitter.py | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index 399fa77..6e0abc2 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -1,23 +1,20 @@
 import re
-from collections import deque 
+from collections import deque
 from typing import Generator
 
 from ..Enums import TokenType
-
 
 class NanoSocratesSplitter:
     def __init__(
-        self,
-        special_token_regex: re.Pattern,
-        max_bpe_token_id: int = 255
+        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
     ) -> None:
 
         # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
-        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
+        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding
 
     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
-        """ Split a text using a regex given
+        """Split a text using a regex given
         Args:
             corpus (str): all the corpus string to split
         Yields:
@@ -26,7 +23,7 @@ class NanoSocratesSplitter:
         """
 
         bpe_start = 0
-        bpe_end = len(corpus) # this can be deleted!
+        bpe_end = len(corpus)  # this can be deleted!
 
         for special_token_start, special_token_end in self.__find_boundaries(corpus):
 
@@ -45,7 +42,6 @@ class NanoSocratesSplitter:
            # it will used in the next interaction
            bpe_start = special_token_end
 
-
    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find each time the start and end (not included) of the special token
@@ -53,21 +49,20 @@ class NanoSocratesSplitter:
        Args:
            corpus (str): the string where the special token will be searched
        Yields:
            Generator[tuple[int, int]]: Note the end is not included
-
        """
 
        for match in self.__special_token_regex.finditer(corpus):
 
            start = match.start()
            end = match.end()
 
            yield (start, end)
-
 
        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)
 
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
 
-    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
-        not_special_token_list : list[int]= []
+        not_special_token_list: list[int] = []
 
        for token in corpus:
 
            if token > self.__max_bpe_token_id:
@@ -75,8 +70,7 @@ class NanoSocratesSplitter:
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []
 
-                yield (token, TokenType.SPECIAL)
+                yield ([token], TokenType.SPECIAL)
                continue
-
-            not_special_token_list.append(token)
+            not_special_token_list.append(token)
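
Why the fix matters (a minimal standalone sketch, not part of the patch): the
TokenType enum below is a stand-in for the project's ..Enums.TokenType, and
the generator mirrors only the split_tokens() lines visible in the hunks
above. Before the change, the SPECIAL branch yielded a bare int, so any
consumer that treats every yielded chunk as a list[int] (for example,
flattening with list.extend) would raise "TypeError: 'int' object is not
iterable" on the first special token:

    from enum import Enum, auto

    class TokenType(Enum):  # assumption: stand-in for ..Enums.TokenType
        BPE = auto()
        SPECIAL = auto()

    def split_tokens(corpus: list[int], max_bpe_token_id: int = 255):
        """Mirror of the patched method body shown in the hunks above."""
        not_special_token_list: list[int] = []
        for token in corpus:
            if token > max_bpe_token_id:
                # flush the pending BPE run (may be empty, as in the patch)
                yield (not_special_token_list, TokenType.BPE)
                not_special_token_list = []
                yield ([token], TokenType.SPECIAL)  # the fix: [token], not token
                continue
            not_special_token_list.append(token)

    flat: list[int] = []
    for chunk, kind in split_tokens([72, 105, 300]):
        flat.extend(chunk)  # safe only because chunk is now always a list
    print(flat)  # [72, 105, 300]

With the old `yield (token, TokenType.SPECIAL)` the extend() call above would
fail on the special token; wrapping the id as [token] also makes the method
match its declared return type, Generator[tuple[list[int], TokenType]].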