import pandas as pd
|
||
|
|
|
||
|
|
from BPE import TokeNanoCore as Tokenizer
|
||
|
|
|
||
|
|
class Batcher:
|
||
|
|
|
||
|
|
def __init__(self, dataset_path: str, batch_size:int, tokenizer: Tokenizer) -> None:
|
||
|
|
# ABSTRACT, TRIPLE
|
||
|
|
# tasks:
|
||
|
|
# rdf2text: X: TRIPLE, Y: ABSTRACT
|
||
|
|
# text2rdf: X: ABSTRACT, X:TRIPLE
|
||
|
|
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
|
||
|
|
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
|
||
|
|
# self._DATASET = pd.read_csv(dataset_path)
|
||
|
|
self._dataset_path = dataset_path
|
||
|
|
self._batch_size = batch_size
|
||
|
|
self._tokenizer = tokenizer
|
||
|
|
|
||
|
|
def get_batch(self):
|
||
|
|
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/3)): #now we support 3 task
|
||
|
|
# each batch get 4 transformation for the 4 tasks and then shuffled
|
||
|
|
# now a batch is ["Abstract"], ["Triples"]
|
||
|
|
# tokenize the strings:
|
||
|
|
tokenized_batch
|
||
|
|
tokenized_batch[["Abstract","Triples"]] = batch[["Abstract","Triples"]].map(lambda t: self._tokenizer.encode(t))
|
||
|
|
# ??? i hope this works, later will be tested
|
||
|
|
rdf2_txt_batch = self.__rdf2txt_transformation(tokenized_batch)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
||
|
|
# rename ["Triples"] as ["X"]
|
||
|
|
# rename ["Abstract"] as ["Y"]
|
||
|
|
# return just them
|
||
|
|
batch = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
|
||
|
|
return batch[["X", "Y"]] #.sample(frac=1).reset_index(drop=True)
|
||
|
|
|
||
|
|
|
||
|
|
def __txt2rdf_transformation(self, batch: pd.DataFrame):
|
||
|
|
batch = batch.rename(columns={ "Abstract": "X","Triples": "Y"})
|
||
|
|
return batch[["X", "Y"]]# .sample(frac=1).reset_index(drop=True)
|
||
|
|
|
||
|
|
def __masking()
|