typo in Batcher
parent 9b0c57c238
commit c9a50d50b7
@@ -1,12 +1,13 @@
 import random
+from typing import Generator
 import pandas as pd
 from pathlib import Path

 import Project_Model.Libs.BPE as BPE
 #from BPE import TokeNanoCore as Tokenizer
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
 from TokenCompletation import TokenCompletationTransformer
 from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
 import random

 class Batcher:

     def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
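A side note on the annotation this import supports (the diff above is kept verbatim): `typing.Generator` takes three parameters, `Generator[YieldType, SendType, ReturnType]`, and only gained defaults for the last two in Python 3.13, so on earlier interpreters `Generator[pd.DataFrame]` raises `TypeError` when the `def` line is evaluated. A sketch of the conventional spellings, assuming a pre-3.13 interpreter:

```python
from typing import Generator, Iterator

import pandas as pd

# Full form: yields DataFrames, accepts no sent values, returns nothing.
def get_batch_full() -> Generator[pd.DataFrame, None, None]:
    yield pd.DataFrame()

# Shorthand: Iterator[...] covers the common yield-only case.
def get_batch_short() -> Iterator[pd.DataFrame]:
    yield pd.DataFrame()
```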
@@ -16,7 +17,6 @@ class Batcher:
         # text2rdf: X: ABSTRACT, X:TRIPLE
         # masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
         # completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
-        # self._DATASET = pd.read_csv(dataset_path)
         self._dataset_path = dataset_path
         self._batch_size = batch_size
         self._tokenizer = tokenizer
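The task list in these comments is what drives the batch arithmetic in `get_batch` below: the CSV is read in chunks of `batch_size/4` rows because every row is expanded into four task variants (rdf2txt, text2rdf, masking, completation), bringing each yielded batch back to roughly `batch_size` rows. (The inline `#now we support 3 task` comment in the loop looks stale; four transformations are produced.) A quick sanity check with illustrative numbers, not values from the repo:

```python
batch_size = 64
chunk_rows = int(batch_size / 4)   # rows pulled from the CSV per chunk -> 16
tasks = 4                          # rdf2txt, txt2rdf, masking, completation
rows_per_yield = chunk_rows * tasks
assert rows_per_yield == batch_size  # 16 * 4 == 64
```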
@@ -26,29 +26,28 @@ class Batcher:
         eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
         self._token_completation = TokenCompletationTransformer(sotl,eos)

-    def get_batch(self):
-        for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
-            # each batch get 4 transformation for the 4 tasks and then shuffled
-            # now a batch is ["Abstract"], ["Triples"]
-            # tokenize the strings:
-            # batch = batch.drop(columns=['MovieID'])
-            tokenized_batch = pd.DataFrame()
-            # bho = batch.map(lambda x: self._tokenizer.encode(x))
-            tokenized_batch[["Abstract","RDFs"]] = batch[["Abstract","RDFs"]].map(
-                lambda t: self._tokenizer.encode(t))
+    def get_batch(self)-> Generator[pd.DataFrame]:
+        for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
+
+            tokenized_batch = pd.DataFrame()
+            tokenized_batch[["Abstract","RDFs"]] = (
+                batch[["Abstract","RDFs"]]
+                .map(lambda t: self._tokenizer.encode(t))
+            )

             rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
             txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
             mask_batch = self.__masking_trasformation(tokenized_batch)
-            completation = self.__token_completation_task(tokenized_batch)
+            completation_batch = self.__token_completation_task(tokenized_batch)

-            output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation],ignore_index=True)
-            output.sample(frac=1).reset_index(drop=True)
+            output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True)
+            output = output.sample(frac=1).reset_index(drop=True)
             yield output

     def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
         # WIP
         rng = random.Random(seed)

         def to_list(x):
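The rename aside, the substantive fix in this hunk is the shuffle line: `DataFrame.sample` and `reset_index` both return new frames rather than mutating in place, so the old `output.sample(frac=1).reset_index(drop=True)` computed a shuffled copy and discarded it, yielding batches in task-sorted order. A minimal reproduction of the pitfall:

```python
import pandas as pd

df = pd.DataFrame({"X": [1, 2, 3, 4]})

df.sample(frac=1, random_state=0).reset_index(drop=True)  # no-op: result discarded
print(df["X"].tolist())        # [1, 2, 3, 4] -- still in the original order

df = df.sample(frac=1, random_state=0).reset_index(drop=True)  # reassign, as the fix does
print(df["X"].tolist())        # shuffled order
```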
@@ -59,16 +58,13 @@ class Batcher:
         )

     def __rdf2txt_transformation(self, batch: pd.DataFrame):
-        # rename ["Triples"] as ["X"]
-        # rename ["Abstract"] as ["Y"]
-        # return just them
         batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
-        return batch[["X", "Y"]] #.sample(frac=1).reset_index(drop=True)
+        return batch[["X", "Y"]]

     def __txt2rdf_transformation(self, batch: pd.DataFrame):
         batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
-        return batch[["X", "Y"]]# .sample(frac=1).reset_index(drop=True)
+        return batch[["X", "Y"]]

     def __masking_trasformation(self, batch: pd.DataFrame):
         # mask_sequence: List[int] -> Tuple[List[int], List[int]]
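Dropping the commented-out `.sample(...)` trailers is safe for two reasons worth spelling out: `rename` already returns a new DataFrame (the per-task transformations never mutate their input), and per-task shuffling was redundant once `get_batch` shuffles the concatenated output. A toy check of the copy behaviour (frame contents are illustrative):

```python
import pandas as pd

batch = pd.DataFrame({"Abstract": ["a"], "RDFs": ["r"]})

rdf2txt = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]
txt2rdf = batch.rename(columns={"Abstract": "X", "RDFs": "Y"})[["X", "Y"]]

# rename returned copies, so the source frame is untouched and both
# task views coexist without clobbering each other's column names.
print(list(batch.columns))    # ['Abstract', 'RDFs']
print(list(rdf2txt.columns))  # ['X', 'Y']
```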
@@ -92,6 +88,7 @@ class Batcher:
 DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
 VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"

+from pathlib import Path
 VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
 SPECIAL_LIST = BPE.default_special_tokens()
 TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
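For completeness, this is roughly how the script-level objects above would feed the class. It is a sketch based only on the signatures visible in this diff; the `SpannedMasker` constructor arguments and the batch size are placeholders, not the repo's actual values:

```python
# Sketch, assuming the imports and constants defined earlier in this file.
masker = SpannedMasker()               # placeholder: real constructor args unknown
batcher = Batcher(DATASET_PATH, batch_size=64, tokenizer=TOKENANO, masker=masker)

for batch in batcher.get_batch():      # each yielded DataFrame holds X/Y pairs
    print(batch.shape)                 # ~batch_size rows spanning the four tasks
    break
```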