# ---------------------------------------------------------------------------
# Project_Model/Libs/Batch/Classes/Batcher.py
# ---------------------------------------------------------------------------
from typing import TYPE_CHECKING, Iterator

import pandas as pd

if TYPE_CHECKING:
    # Only needed for the annotation on ``Batcher.__init__``; nothing in this
    # module uses the tokenizer class itself at runtime.
    from BPE import TokeNanoCore as Tokenizer


class Batcher:
    """Stream a CSV dataset in chunks and turn each chunk into (X, Y) pairs.

    The CSV must expose an ``Abstract`` and a ``Triples`` column.

    Planned tasks (from the original design notes):

    * rdf2text: X = triples, Y = abstract
    * text2rdf: X = abstract, Y = triples
    * masking (through a masker): X = incomplete triple, Y = complete triple
    * completion: X = subset of triples, Y = related subset of triples

    Only the rdf2text transformation is wired into :meth:`get_batch` so far;
    the text2rdf helper exists but is not called yet, and masking is a stub.
    """

    # A full batch is meant to be shared between this many tasks, so each CSV
    # chunk holds ``batch_size / _SUPPORTED_TASKS`` rows.
    _SUPPORTED_TASKS = 3

    def __init__(self, dataset_path: str, batch_size: int, tokenizer: "Tokenizer") -> None:
        """Store the configuration; the dataset is not read until iteration.

        Args:
            dataset_path: Path of the CSV file to stream.
            batch_size: Target number of rows of a full multi-task batch.
            tokenizer: Object whose ``encode`` method is applied to every
                ``Abstract`` and ``Triples`` cell.
        """
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer

    def get_batch(self) -> Iterator[pd.DataFrame]:
        """Yield one tokenized rdf2text frame per CSV chunk.

        Each yielded frame has the columns ``X`` (encoded triples) and ``Y``
        (encoded abstract) and keeps the row index of the chunk it came from.

        Yields:
            A ``DataFrame`` with at most ``batch_size / 3`` rows (never fewer
            than one row per chunk).

        Raises:
            KeyError: If the CSV lacks the ``Abstract`` or ``Triples`` column.
        """
        # ``read_csv`` rejects a chunk size below 1, which the plain division
        # would produce for batch sizes smaller than the task count.
        chunk_rows = max(1, int(self._batch_size / self._SUPPORTED_TASKS))
        encode = self._tokenizer.encode

        # The context manager closes the file even when the consumer stops
        # iterating before the dataset is exhausted.
        with pd.read_csv(self._dataset_path, chunksize=chunk_rows) as reader:
            for chunk in reader:
                # Element-wise encoding, column by column (``Series.map``).
                tokenized = pd.DataFrame(
                    {
                        column: chunk[column].map(encode)
                        for column in ("Abstract", "Triples")
                    }
                )
                # TODO: add the remaining task transformations and shuffle the
                # merged rows once they are implemented.
                yield self.__rdf2txt_transformation(tokenized)

    def __rdf2txt_transformation(self, batch: pd.DataFrame) -> pd.DataFrame:
        """Return ``batch`` reduced to X = ``Triples`` and Y = ``Abstract``."""
        renamed = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
        return renamed[["X", "Y"]]

    def __txt2rdf_transformation(self, batch: pd.DataFrame) -> pd.DataFrame:
        """Return ``batch`` reduced to X = ``Abstract`` and Y = ``Triples``."""
        renamed = batch.rename(columns={"Abstract": "X", "Triples": "Y"})
        return renamed[["X", "Y"]]

    def __masking(self) -> None:
        """Placeholder for the masking task, which is not implemented yet."""
        raise NotImplementedError("masking transformation is not implemented yet")


# ---------------------------------------------------------------------------
# Project_Model/Libs/Batch/Enums/TaskType.py
# ---------------------------------------------------------------------------
from enum import Enum, auto


class TaskType(Enum):
    """Identifiers of the training tasks a batch can be built for."""

    RDF2TXT = auto()
    TEXT2RDF = auto()
    MASKING = auto()
    # Spelling kept as-is: renaming the member would break existing callers.
    COMPLETATION = auto()