From aa765b4555c7a447b06d688b515b93a71330b9a7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:45 +0200 Subject: [PATCH] Added time checking --- .../Libs/BPE/Classes/NanoSocraTrainerPool.py | 57 ++++++++++++++++++- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 74a596f..cdd7a95 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -5,6 +5,7 @@ from multiprocessing import Pool import os from pathlib import Path import re +import time from ..Classes import ( NanoSocratesBPE, NanoSocratesChunker, @@ -49,6 +50,22 @@ def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): return (bpe, NEW_DATA, memory) +def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]): + + bpe, data = object + + NEW_DATA: list[list[int]] = [] + + for piece in data: + output = bpe.encode_intermediate(piece) + + if len(output) < 2: + continue + + # We are sure of its type + NEW_DATA.append(output) # type: ignore + + return NEW_DATA class NanoSocraTrainerPool: @@ -96,6 +113,8 @@ class NanoSocraTrainerPool: exit = False current_iteration = 0 data = self.__gather_data_from_file(path) + data = self.__encode_from_cache(BPE, data) + while not exit: @@ -105,8 +124,9 @@ class NanoSocraTrainerPool: last_memory = None + start = time.time_ns() _, data, last_memory = self.__round_train(BPE, data) - + end = time.time_ns() NEW_VOC_SIZE = BPE.vocabulary_size VOCABULARY = BPE.vocabulary @@ -122,8 +142,8 @@ class NanoSocraTrainerPool: DELIMITER, f"ITERATION: {current_iteration}", DELIMITER, - f"\tVocabulary size: {BPE.vocabulary_size}\n", - f"\tvocabulary:\n{BPE.vocabulary}", + f"\tVocabulary size: {BPE.vocabulary_size - 256}\n", + f"\tTime elapsed: {(end - start)/1E9}s", DELIMITER, "", ] @@ -214,6 +234,37 @@ class NanoSocraTrainerPool: return DATA + def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]): + + NEW_DATA : list[list[int]]= [] + + CPU_COUNT = os.process_cpu_count() + + if CPU_COUNT is None: + raise Exception() + + VOCABULARY = bpe.vocabulary + + data_chunks = split(data, CPU_COUNT) + JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] + + JOB_RESULTS: list[list[list[int]]] + + with Pool() as pool: + JOB_RESULTS = pool.map(split_encode, JOBS) + + for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS): + job_output = res + NEW_DATA.extend(job_output) + + del job_output + + print(f"Joined {i + 1} out of {CPU_COUNT}") + + print(f"Sentences from {len(data)} to {len(NEW_DATA)}") + + return NEW_DATA + def __increment_counter(self, counter: int): # What if overflows???