Fix bugs and correct splitting semantics

This commit is contained in:
Christian Risi
2025-10-03 13:26:58 +02:00
parent 6b9cb7cd35
commit c5c0c61f79
5 changed files with 134 additions and 129 deletions

View File

@@ -31,7 +31,8 @@ class NanoSocratesSplitter:
bpe_end = special_token_start
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
if BPE_TOKEN_TEXT != "":
yield (BPE_TOKEN_TEXT, TokenType.BPE)
for WORD in self.__split_words(BPE_TOKEN_TEXT):
yield (WORD, TokenType.BPE)
# FIND SPECIAL TOKEN
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
@@ -60,6 +61,27 @@ class NanoSocratesSplitter:
# eof = len(corpus)
# yield(eof,eof)
def __split_words(self, bpe_piece: str) -> Generator[str]:
    """Split a run of BPE text into words, cutting before each space.

    Every space begins a new word, so each word after the first keeps
    its leading space (GPT-style pre-tokenisation: ``"ab cd"`` yields
    ``"ab"`` then ``" cd"``).

    Bug fixes vs. the previous version: a *leading* space no longer
    yields a spurious empty string, and an empty ``bpe_piece`` yields
    nothing instead of ``""``.

    Args:
        bpe_piece: non-special-token text extracted from the corpus.

    Yields:
        The non-empty word substrings of ``bpe_piece``, in order.
    """
    bound_start = 0
    for index, character in enumerate(bpe_piece):
        if character != " ":
            continue
        # Cut *before* the space so the space travels with the next word.
        # index == bound_start only for a leading space: skip the empty cut.
        if index > bound_start:
            yield bpe_piece[bound_start:index]
        bound_start = index
    # Tail word (or the whole piece when it contains no space).
    if bound_start < len(bpe_piece):
        yield bpe_piece[bound_start:]
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
not_special_token_list: list[int] = []