This commit is contained in:
Christian Risi 2025-10-07 20:45:10 +02:00
commit 0560bc439a

View File

@ -3,7 +3,7 @@ from pathlib import Path
import Project_Model.Libs.BPE as BPE import Project_Model.Libs.BPE as BPE
#from BPE import TokeNanoCore as Tokenizer #from BPE import TokeNanoCore as Tokenizer
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
import random
class Batcher: class Batcher:
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None: def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None:
@ -38,9 +38,14 @@ class Batcher:
yield output yield output
def __random_subset_rdfs(self, batch: pd.DataFrame): def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
rng = random.Random(seed)
def to_list(x):
return x.split(SpecialToken.START_TRIPLE.value)[1:]
batch["RDFs"] = batch["RDFs"].map( batch["RDFs"] = batch["RDFs"].map(
lambda x: x.split(SpecialToken.START_TRIPLE.value)[1:] to_list
) )
def __rdf2txt_transformation(self, batch: pd.DataFrame): def __rdf2txt_transformation(self, batch: pd.DataFrame):