From 7cfaf601b411ea4e6ad5c929793f5aad7b8b127a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 19:42:22 +0200 Subject: [PATCH] Refactored to remove tokens that can't be compressed anymore --- .../Libs/BPE/Classes/NanoSocraTraineRam.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py index 9c4f444..aca820e 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py @@ -105,18 +105,29 @@ class NanoSocraTraineRam: return BPE def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + DATA_LEN = len(data) + NEW_DATA = [] + counter = 0 memory = NanoSocratesBatchMemoryBPE({}, 0) - for piece, index in zip(data, range(0, DATA_LEN)): + while len(data) > 0: + counter += 1 + last_batch = len(data) == 1 - last_batch = index == DATA_LEN - 1 + piece = data.pop() bpe, memory, output = bpe.fit(piece, memory, last_batch) - data[index] = output + if counter % int(1E6) == 0: + print(f"Fitted: {counter}/{DATA_LEN}") - return (bpe, data, memory) + if len(output) < 2: + continue + + NEW_DATA.append(output) + + return (bpe, NEW_DATA, memory) def __gather_data_from_file(self, path: Path) -> list[list[int]]: