From 96cbf4eabb034f38c65651bf10f3c554b5c1d3f1 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Tue, 7 Oct 2025 20:09:51 +0200
Subject: [PATCH] wip Batcher

---
 Project_Model/Libs/Batch/Classes/Batcher.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Project_Model/Libs/Batch/Classes/Batcher.py b/Project_Model/Libs/Batch/Classes/Batcher.py
index 66c1d60..f53d239 100644
--- a/Project_Model/Libs/Batch/Classes/Batcher.py
+++ b/Project_Model/Libs/Batch/Classes/Batcher.py
@@ -3,7 +3,7 @@ from pathlib import Path
 import Project_Model.Libs.BPE as BPE
 #from BPE import TokeNanoCore as Tokenizer
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
-
+import random
 
 class Batcher:
     def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None:
@@ -38,9 +38,14 @@ class Batcher:
 
             yield output
 
-    def __random_subset_rdfs(self, batch: pd.DataFrame):
+    def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
+        rng = random.Random(seed)
+
+        def to_list(x):
+            return x.split(SpecialToken.START_TRIPLE.value)[1:]
+
         batch["RDFs"] = batch["RDFs"].map(
-            lambda x: x.split(SpecialToken.START_TRIPLE.value)[1:]
+            to_list
         )
 
     def __rdf2txt_transformation(self, batch: pd.DataFrame):