import pandas as pd
from BPE import TokeNanoCore as Tokenizer


class Batcher:
    def __init__(self, dataset_path: str, batch_size: int, tokenizer: Tokenizer) -> None:
        # dataset columns: ABSTRACT, TRIPLE
        # tasks:
        #   rdf2text:   X: TRIPLE,   Y: ABSTRACT
        #   text2rdf:   X: ABSTRACT, Y: TRIPLE
        #   masking (calls the masker): X: incomplete triple, Y: complete triple (as in an exam)
        #   completion: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
        # self._DATASET = pd.read_csv(dataset_path)
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer
    def get_batch(self):
        for batch in pd.read_csv(self._dataset_path, chunksize=int(self._batch_size / 3)):  # now we support 3 tasks
            # each chunk starts as raw ["Abstract"] and ["Triples"] strings;
            # it is transformed once per task and the results are shuffled together
            # tokenize the strings:
            tokenized_batch = batch.copy()
            tokenized_batch[["Abstract", "Triples"]] = batch[["Abstract", "Triples"]].map(
                lambda t: self._tokenizer.encode(t)
            )  # elementwise map over both columns; still to be tested
            rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
            txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
            # TODO: masking and completion transformations are not implemented yet
            yield pd.concat([rdf2txt_batch, txt2rdf_batch]).sample(frac=1).reset_index(drop=True)

    def __rdf2txt_transformation(self, batch: pd.DataFrame):
        # rename ["Triples"] to ["X"] and ["Abstract"] to ["Y"], then return just those columns
        batch = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
        return batch[["X", "Y"]]  # .sample(frac=1).reset_index(drop=True)

    def __txt2rdf_transformation(self, batch: pd.DataFrame):
        # inverse task: ["Abstract"] becomes ["X"], ["Triples"] becomes ["Y"]
        batch = batch.rename(columns={"Abstract": "X", "Triples": "Y"})
        return batch[["X", "Y"]]  # .sample(frac=1).reset_index(drop=True)

    def __masking(self, batch: pd.DataFrame):
        # masking task (X: incomplete triple, Y: complete triple) is not implemented yet
        raise NotImplementedError
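

# --- usage sketch (illustrative only) ---
# A minimal example of how Batcher is meant to be driven, assuming the TokeNanoCore
# tokenizer imported from BPE can be constructed with no arguments and that the CSV at
# "dataset.csv" has "Abstract" and "Triples" columns; both the path and the constructor
# call are placeholders, not part of this module.
if __name__ == "__main__":
    tokenizer = Tokenizer()  # assumption: default constructor
    batcher = Batcher("dataset.csv", batch_size=32, tokenizer=tokenizer)
    for xy_batch in batcher.get_batch():
        # each yielded DataFrame holds tokenized "X" and "Y" columns, shuffled across tasks
        print(xy_batch.head())
        break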