Refactored to remove tokens that can't be compressed anymore

Christian Risi 2025-10-01 19:42:22 +02:00
parent fbbe6226bb
commit 7cfaf601b4


@@ -105,18 +105,29 @@ class NanoSocraTraineRam:
         return BPE
     def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
         DATA_LEN = len(data)
+        NEW_DATA = []
         counter = 0
         memory = NanoSocratesBatchMemoryBPE({}, 0)
-        for piece, index in zip(data, range(0, DATA_LEN)):
+        while len(data) > 0:
             counter += 1
-            last_batch = index == DATA_LEN - 1
+            last_batch = len(data) == 1
+            piece = data.pop()
             bpe, memory, output = bpe.fit(piece, memory, last_batch)
-            data[index] = output
             if counter % int(1E6) == 0:
                 print(f"Fitted: {counter}/{DATA_LEN}")
-        return (bpe, data, memory)
+            if len(output) < 2:
+                continue
+            NEW_DATA.append(output)
+        return (bpe, NEW_DATA, memory)
     def __gather_data_from_file(self, path: Path) -> list[list[int]]:
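
The refactor stops rewriting `data` in place and instead collects outputs into `NEW_DATA`, dropping any piece whose fitted output shrinks below two tokens: a single-token piece has no adjacent pair left to merge, so carrying it into the next round is wasted work. A minimal sketch of that filtering rule, with a hypothetical helper name that is not part of the repository:

    # Sketch only: illustrates why len(output) < 2 pieces are discarded.
    def filter_compressible(pieces: list[list[int]]) -> list[list[int]]:
        # A piece reduced to fewer than 2 tokens offers no pair to merge,
        # so it is excluded from the next training round.
        return [piece for piece in pieces if len(piece) >= 2]

    rounds = [[1, 2, 3], [7], [4, 5]]
    print(filter_compressible(rounds))  # [[1, 2, 3], [4, 5]]

Popping pieces off `data` as they are consumed also lets the exhausted list shrink during the round, rather than keeping both the old and the filtered corpora alive at once.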