Fixed bug for UTF-8 conversion
parent ccacea18d8
commit 89a0a1f4bb
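The core of the fix: list(map(ord, text)) yields Unicode code points, which can exceed 255 and collide with the ID range used for learned merge tokens, while text.encode("utf-8") yields the raw bytes a byte-level BPE expects. A minimal standalone illustration (not code from the repo):

text = "café €5"
# Old behaviour: code points; '€' maps to 8364, which lands inside the
# merge-token ID range (>= 256) and corrupts the vocabulary.
print(list(map(ord, text)))        # [99, 97, 102, 233, 32, 8364, 53]
# New behaviour: raw UTF-8 bytes, always in 0..255.
print(list(text.encode("utf-8")))  # [99, 97, 102, 195, 169, 32, 226, 130, 172, 53]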
@@ -15,6 +15,7 @@ class NanoSocraTrainer:
         chunk_size: int,
         merge_treshold: int = 0,
         max_iterations: int = 0,
+        print_after_iterations: int = 1
     ) -> None:
         # Bytes
         BYTE_RESERVED_TOKENS = 256
@@ -26,6 +27,7 @@ class NanoSocraTrainer:
         self.__chunk_size = chunk_size
         self.__merge_treshold = merge_treshold
         self.__special_token_regex = special_regex_maker(special_vocabulary)
+        self.__print_after_iterations = print_after_iterations

     def trainBPE(
         self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None
@@ -61,7 +63,9 @@ class NanoSocraTrainer:

         FILE = open(out_path, "w")

-        for _, _, output in self.__round_train(input_path, BPE, cached):
+        last_memory = None
+        for _, memory, output in self.__round_train(input_path, BPE, cached):
+            last_memory = memory
             FILE.write(output)

         FILE.close()
@@ -71,6 +75,21 @@ class NanoSocraTrainer:

         NEW_VOC_SIZE = BPE.vocabulary_size

+        if current_iteration % self.__print_after_iterations == 0:
+            DELIMITER = "==============="
+
+            DEBUG = "\n".join([
+                DELIMITER,
+                f"ITERATION: {current_iteration}",
+                DELIMITER,
+                f"\tVocabulary size: {BPE.vocabulary_size}\n",
+                f"\tFrequencies:\n{last_memory.frequencies}\n",
+                f"\tvocabulary:\n{BPE.vocabulary}",
+                DELIMITER,
+                ""
+            ])
+            print(DEBUG)
+
         if LAST_VOC_SIZE == NEW_VOC_SIZE:
             exit = True
             continue
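For context, the debug dump above is gated by the new print_after_iterations parameter, and last_memory is tracked in trainBPE precisely so its frequencies can be printed here. A tiny standalone sketch of the gating pattern (values are hypothetical):

print_after_iterations = 2      # hypothetical setting
for current_iteration in range(1, 7):
    if current_iteration % print_after_iterations == 0:
        print(f"ITERATION: {current_iteration}")   # prints at 2, 4 and 6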
@@ -137,7 +156,7 @@ class NanoSocraTrainer:
     def __make_list_ids(self, corpus: str, cached: bool):

         if not cached:
-            return list(map(ord, corpus))
+            return list(corpus.encode("utf-8"))

         REDUCED_CORPUS_LEN = len(corpus) -1
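This change also makes BYTE_RESERVED_TOKENS = 256 consistent with the base IDs: every value produced by corpus.encode("utf-8") is a byte in 0..255, so merge tokens can start at 256 without clashes. A quick sanity check (assuming the reserved range is used this way):

BYTE_RESERVED_TOKENS = 256
ids = list("héllo wörld".encode("utf-8"))
assert all(0 <= i < BYTE_RESERVED_TOKENS for i in ids)  # holds for any input string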
@@ -90,7 +90,7 @@ class NanoSocratesBPE(Encoder):

     def encode(self, piece: str) -> list[int]:

-        current_piece = list(map(ord, piece))
+        current_piece = list(piece.encode("utf-8"))
         new_piece = self.__round_encode(current_piece)

         while len(current_piece) != len(new_piece):
@@ -128,7 +128,7 @@ class NanoSocratesBPE(Encoder):

         return NEW_PIECE

-    # TODO: decode
+    # TODO: Remake decode to take a list of token IDs
     def decode(self, token_id: int) -> str:

         token_stack: list[int] = [token_id]
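A sketch of what the TODO'd list-based decode might look like, assuming IDs below 256 are raw bytes and the vocabulary maps each merged ID to its two child IDs; the name decode_ids and the vocabulary layout are assumptions for illustration, not the repo's actual API:

def decode_ids(ids: list[int], vocabulary: dict[int, tuple[int, int]]) -> str:
    out = bytearray()
    stack = list(reversed(ids))          # process IDs left to right
    while stack:
        token_id = stack.pop()
        if token_id < 256:               # raw byte: emit directly
            out.append(token_id)
        else:                            # merged token: expand its children
            left, right = vocabulary[token_id]
            stack.extend((right, left))  # left child pops first
    return out.decode("utf-8")           # reassemble multi-byte characters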