"""Batching utilities: stream a CSV dataset and build task-specific (X, Y)
frames for multi-task training over (Abstract, Triples) pairs."""

from typing import TYPE_CHECKING, Iterator

import pandas as pd

if TYPE_CHECKING:
    # Imported only for static type checking so this module stays importable
    # without the project-local BPE package at runtime.
    from BPE import TokeNanoCore as Tokenizer


class Batcher:
    """Streams tokenized batches from a CSV with "Abstract" and "Triples" columns.

    Planned tasks (from the original author's notes):
      - rdf2text:    X = Triples,  Y = Abstract
      - text2rdf:    X = Abstract, Y = Triples
      - masking:     X = incomplete triple, Y = complete triple  (not implemented)
      - completion:  X = triple subset,     Y = related subset   (not implemented)
    """

    def __init__(self, dataset_path: str, batch_size: int, tokenizer: "Tokenizer") -> None:
        """Store streaming configuration.

        Args:
            dataset_path: Path to a CSV file with "Abstract" and "Triples" columns.
            batch_size: Target number of rows per combined batch.
            tokenizer: Object exposing ``encode(text)``; applied element-wise.
        """
        # The dataset is streamed lazily in get_batch() rather than read
        # eagerly here (an eager pd.read_csv was commented out in the original).
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer

    def get_batch(self) -> Iterator[pd.DataFrame]:
        """Yield tokenized (X, Y) frames, one per CSV chunk.

        Each chunk is batch_size // 3 rows because the original design combines
        three task-views per batch; only the rdf2text view is wired up so far.

        BUG FIX vs. original: the original assigned into an undefined
        ``tokenized_batch`` (NameError) and then discarded the transformed
        frame without yielding it, so callers received nothing.
        """
        # max(1, ...) because pandas rejects chunksize < 1 (batch_size < 3).
        chunk_rows = max(1, int(self._batch_size / 3))
        for chunk in pd.read_csv(self._dataset_path, chunksize=chunk_rows):
            tokenized = chunk.copy()
            # DataFrame.map applies the tokenizer element-wise (pandas >= 2.1).
            tokenized[["Abstract", "Triples"]] = tokenized[
                ["Abstract", "Triples"]
            ].map(self._tokenizer.encode)
            yield self.__rdf2txt_transformation(tokenized)
            # TODO: also yield __txt2rdf_transformation / masking / completion
            # views and shuffle across tasks, per the class docstring.

    def __rdf2txt_transformation(self, batch: pd.DataFrame) -> pd.DataFrame:
        """rdf2text view: X = Triples, Y = Abstract."""
        renamed = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
        return renamed[["X", "Y"]]

    def __txt2rdf_transformation(self, batch: pd.DataFrame) -> pd.DataFrame:
        """text2rdf view: X = Abstract, Y = Triples."""
        renamed = batch.rename(columns={"Abstract": "X", "Triples": "Y"})
        return renamed[["X", "Y"]]

    def __masking(self) -> None:
        # NOTE(review): the original source is truncated here ("def __masking()"
        # with no colon or body — a SyntaxError); stubbed so the module imports.
        raise NotImplementedError("masking task is not implemented yet")