wip Batcher

This commit is contained in:
GassiGiuseppe 2025-10-07 20:09:51 +02:00
parent f801afe0e4
commit 96cbf4eabb

View File

@ -3,7 +3,7 @@ from pathlib import Path
import Project_Model.Libs.BPE as BPE
#from BPE import TokeNanoCore as Tokenizer
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
import random
class Batcher:
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None:
@ -38,9 +38,14 @@ class Batcher:
yield output
def __random_subset_rdfs(self, batch: pd.DataFrame, seed: int = 0) -> None:
    """Split every "RDFs" cell of ``batch`` into a list of triple strings.

    The "RDFs" column is overwritten in place; nothing is returned.

    Args:
        batch: DataFrame holding an "RDFs" column of strings in which
            triples are delimited by ``SpecialToken.START_TRIPLE.value``.
        seed: Seed for the method-local random generator, so that any
            sampling built on it is reproducible per call.
    """
    # NOTE(review): rng is created but not consumed anywhere in this
    # method yet; presumably the random subset selection suggested by the
    # method name is still to be added (commit is marked "wip") -- confirm.
    rng = random.Random(seed)

    def to_list(x):
        # Element 0 of the split is whatever precedes the first marker
        # (not a triple), hence the [1:].
        return x.split(SpecialToken.START_TRIPLE.value)[1:]

    batch["RDFs"] = batch["RDFs"].map(to_list)
def __rdf2txt_transformation(self, batch: pd.DataFrame):