diff --git a/Project_Model/Libs/Batch/Classes/Batcher.py b/Project_Model/Libs/Batch/Classes/Batcher.py
index 6cf68d4..bc09436 100644
--- a/Project_Model/Libs/Batch/Classes/Batcher.py
+++ b/Project_Model/Libs/Batch/Classes/Batcher.py
@@ -1,12 +1,13 @@
+import random
+from typing import Generator
 import pandas as pd
-from pathlib import Path
+
 import Project_Model.Libs.BPE as BPE
-#from BPE import TokeNanoCore as Tokenizer
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
 from TokenCompletation import TokenCompletationTransformer
 from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
-import random
+
 
 class Batcher:
     def __init__(self, dataset_path: str, batch_size: int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
@@ -16,7 +17,6 @@ class Batcher:
         # text2rdf: X: ABSTRACT, Y: TRIPLE
         # masking (call masker): X: incomplete_triple, Y: complete_triple (as example)
         # completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
-        # self._DATASET = pd.read_csv(dataset_path)
         self._dataset_path = dataset_path
         self._batch_size = batch_size
         self._tokenizer = tokenizer
@@ -26,29 +26,28 @@ class Batcher:
         eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
         self._token_completation = TokenCompletationTransformer(sotl, eos)
 
-    def get_batch(self):
+
+    def get_batch(self) -> Generator[pd.DataFrame, None, None]:
         for batch in pd.read_csv(self._dataset_path, chunksize=int(self._batch_size / 4)):  # now we support 4 tasks
-            # each batch get 4 transformation for the 4 tasks and then shuffled
-            # now a batch is ["Abstract"], ["Triples"]
-            # tokenize the strings:
-            # batch = batch.drop(columns=['MovieID'])
+
             tokenized_batch = pd.DataFrame()
-            # bho = batch.map(lambda x: self._tokenizer.encode(x))
-            tokenized_batch[["Abstract","RDFs"]] = batch[["Abstract","RDFs"]].map(
-                lambda t: self._tokenizer.encode(t))
-
+            tokenized_batch[["Abstract", "RDFs"]] = (
+                batch[["Abstract", "RDFs"]]
+                .map(lambda t: self._tokenizer.encode(t))
+            )
             rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
             txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
             mask_batch = self.__masking_trasformation(tokenized_batch)
-            completation = self.__token_completation_task(tokenized_batch)
+            completation_batch = self.__token_completation_task(tokenized_batch)
 
-            output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation],ignore_index=True)
-            output.sample(frac=1).reset_index(drop=True)
+            output = pd.concat([rdf2txt_batch, txt2rdf_batch, mask_batch, completation_batch], ignore_index=True)
+            output = output.sample(frac=1).reset_index(drop=True)
             yield output
 
-    def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
+    def __random_subset_rdfs(self, batch: pd.DataFrame, seed: int = 0):
+        # WIP: reproducible random subset of each row's RDF triples
        rng = random.Random(seed)
 
         def to_list(x):
@@ -59,16 +58,13 @@ class Batcher:
         )
 
     def __rdf2txt_transformation(self, batch: pd.DataFrame):
-        # rename ["Triples"] as ["X"]
-        # rename ["Abstract"] as ["Y"]
-        # return just them
         batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
-        return batch[["X", "Y"]] #.sample(frac=1).reset_index(drop=True)
+        return batch[["X", "Y"]]
 
     def __txt2rdf_transformation(self, batch: pd.DataFrame):
         batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
-        return batch[["X", "Y"]]# .sample(frac=1).reset_index(drop=True)
+        return batch[["X", "Y"]]
 
     def __masking_trasformation(self, batch: pd.DataFrame):
         # mask_sequence: List[int] -> Tuple[List[int], List[int]]
@@ -92,6 +88,7 @@ class Batcher:
 DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
 VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
+from pathlib import Path
 VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
 SPECIAL_LIST = BPE.default_special_tokens()
 TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
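
Review notes: `SpecialToken` is imported from two different modules (`Scripts.Libs.CleaningPipeline.special_token` and `Project_Model.Libs.BPE.Enums.SpecialToken`); the second import shadows the first, so one of the two should be dropped. Likewise, the re-added `from pathlib import Path` now sits mid-module next to its only use; moving it into the top-of-file import block would be tidier. The behavioural fix in `get_batch` is the important one: `DataFrame.sample(frac=1)` returns a shuffled copy rather than shuffling in place, so the old code discarded the result and yielded batches in unshuffled task order. A minimal standalone sketch of the corrected pattern, on toy data rather than the project's CSV:

```python
import pandas as pd

df = pd.DataFrame({"X": [1, 2, 3, 4], "Y": ["a", "b", "c", "d"]})

# sample(frac=1) returns a shuffled copy and leaves `df` untouched;
# without reassignment the result is silently discarded, which was
# the bug fixed in get_batch above.
shuffled = df.sample(frac=1, random_state=0).reset_index(drop=True)

assert list(df["X"]) == [1, 2, 3, 4]          # original order intact
assert sorted(shuffled["X"]) == [1, 2, 3, 4]  # same rows, new order
```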
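And a hedged driver sketch for the smoke-test constants at the bottom of the file; the default `SpannedMasker()` construction and the batch size of 32 are illustrative assumptions, since neither appears in this diff:

```python
# Hypothetical wiring, reusing the names defined in Batcher.py above
# (Batcher, SpannedMasker, DATASET_PATH, TOKENANO). SpannedMasker's
# real constructor signature is not shown in this diff.
masker = SpannedMasker()
batcher = Batcher(DATASET_PATH, batch_size=32, tokenizer=TOKENANO, masker=masker)

# Each CSV chunk holds batch_size / 4 rows; after the four task
# transformations (rdf2txt, txt2rdf, masking, token completion) are
# concatenated and shuffled, a yielded frame has ~batch_size rows.
for batch in batcher.get_batch():
    print(batch[["X", "Y"]].head())
    break
```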