From 3fe4e45ceb32c842d782e2347cbc0b03f09362ef Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:50:37 +0200 Subject: [PATCH] Fixed a bug while joining frequencies --- .../Libs/BPE/Classes/NanoSocraTrainerPool.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 167b433..74a596f 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -21,15 +21,17 @@ from ..Utils import ( load_json, ) + def split(a, n): k, m = divmod(len(a), n) - return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): bpe, data = object - NEW_DATA: list[list[int]]= [] + NEW_DATA: list[list[int]] = [] memory = NanoSocratesBatchMemoryBPE({}, 0) @@ -144,7 +146,7 @@ class NanoSocraTrainerPool: def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): - NEW_DATA : list[list[int]] = [] + NEW_DATA: list[list[int]] = [] MEMORY = NanoSocratesBatchMemoryBPE({}, 0) @@ -159,7 +161,9 @@ class NanoSocraTrainerPool: data_chunks = split(data, CPU_COUNT) JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] - JOB_RESULTS: list[tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]] + JOB_RESULTS: list[ + tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE] + ] with Pool() as pool: JOB_RESULTS = pool.map(fit_funct, JOBS) @@ -169,14 +173,20 @@ class NanoSocraTrainerPool: NEW_DATA.extend(job_output) for key, value in job_memory.frequencies.items(): - MEMORY.frequencies[key] = value + frequency = MEMORY.frequencies.get(key) + + if frequency is None: + frequency = 0 + MEMORY.frequencies[key] = 0 + + frequency += value + MEMORY.frequencies[key] = frequency del job_output del job_memory print(f"Joined {i + 1} out of {CPU_COUNT}") - # Get new token bpe.fit([], MEMORY, True)