import pandas as pd
from BPE import TokeNanoCore as Tokenizer


class Batcher:
    def __init__(self, dataset_path: str, batch_size: int, tokenizer: Tokenizer) -> None:
        # dataset columns: ABSTRACT, TRIPLE
        # tasks:
        #   rdf2text:   X: TRIPLE,   Y: ABSTRACT
        #   text2rdf:   X: ABSTRACT, Y: TRIPLE
        #   masking (calls the masker): X: incomplete triple, Y: complete triple (as in an exam)
        #   completion: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
        # self._DATASET = pd.read_csv(dataset_path)
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer
    def get_batch(self):
        for batch in pd.read_csv(self._dataset_path, chunksize=int(self._batch_size / 3)):  # now we support 3 tasks
            # each chunk starts as raw ["Abstract"] and ["Triples"] strings;
            # it is transformed once per task and the results are shuffled together
            # tokenize the strings:
            tokenized_batch = batch.copy()
            tokenized_batch[["Abstract", "Triples"]] = batch[["Abstract", "Triples"]].map(
                lambda t: self._tokenizer.encode(t)
            )  # elementwise map over both columns; still to be tested
            rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
            txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
            # TODO: masking and completion transformations are not implemented yet
            yield pd.concat([rdf2txt_batch, txt2rdf_batch]).sample(frac=1).reset_index(drop=True)

    def __rdf2txt_transformation(self, batch: pd.DataFrame):
        # rename ["Triples"] to ["X"] and ["Abstract"] to ["Y"], then return just those columns
        batch = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
        return batch[["X", "Y"]]  # .sample(frac=1).reset_index(drop=True)

    def __txt2rdf_transformation(self, batch: pd.DataFrame):
        # inverse task: ["Abstract"] becomes ["X"], ["Triples"] becomes ["Y"]
        batch = batch.rename(columns={"Abstract": "X", "Triples": "Y"})
        return batch[["X", "Y"]]  # .sample(frac=1).reset_index(drop=True)

    def __masking(self, batch: pd.DataFrame):
        # masking task (X: incomplete triple, Y: complete triple) is not implemented yet
        raise NotImplementedError
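

# --- usage sketch (illustrative only) ---
# A minimal example of how Batcher is meant to be driven, assuming the TokeNanoCore
# tokenizer imported from BPE can be constructed with no arguments and that the CSV at
# "dataset.csv" has "Abstract" and "Triples" columns; both the path and the constructor
# call are placeholders, not part of this module.
if __name__ == "__main__":
    tokenizer = Tokenizer()  # assumption: default constructor
    batcher = Batcher("dataset.csv", batch_size=32, tokenizer=tokenizer)
    for xy_batch in batcher.get_batch():
        # each yielded DataFrame holds tokenized "X" and "Y" columns, shuffled across tasks
        print(xy_batch.head())
        break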