typo in Batcher
parent 9b0c57c238
commit c9a50d50b7
@@ -1,12 +1,13 @@
-import random
+from typing import Generator
 import pandas as pd
+from pathlib import Path
 import Project_Model.Libs.BPE as BPE
+#from BPE import TokeNanoCore as Tokenizer
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
 from TokenCompletation import TokenCompletationTransformer
 from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
-import random
 
 class Batcher:
 
     def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
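
Note (reviewer addition, not part of the commit): `from typing import Generator` backs the new `get_batch` annotation below, but the one-parameter form `Generator[pd.DataFrame]` only subscripts cleanly on Python 3.13+; earlier versions raise a TypeError at definition time because `typing.Generator` expects all three type arguments, and `Iterator` is the usual shorthand when the generator only yields. A minimal sketch with illustrative names:

from typing import Generator, Iterator
import pandas as pd

def read_chunks(path: str, rows: int) -> Generator[pd.DataFrame, None, None]:
    # explicit three-parameter form: yield type, send type, return type
    yield from pd.read_csv(path, chunksize=rows)

def read_chunks_short(path: str, rows: int) -> Iterator[pd.DataFrame]:
    # equivalent shorthand when nothing is sent into the generator
    yield from pd.read_csv(path, chunksize=rows)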
@@ -16,7 +17,6 @@ class Batcher:
         # text2rdf: X: ABSTRACT, X:TRIPLE
         # masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
         # completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
-        # self._DATASET = pd.read_csv(dataset_path)
         self._dataset_path = dataset_path
         self._batch_size = batch_size
         self._tokenizer = tokenizer
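
Note: dropping the commented-out `# self._DATASET = pd.read_csv(dataset_path)` matches the design — `Batcher` keeps only the path, and `get_batch` below streams the CSV chunk by chunk rather than loading it whole. A sketch of the two access patterns (function names are illustrative):

import pandas as pd

def load_eager(path: str) -> pd.DataFrame:
    return pd.read_csv(path)                      # whole file resident in memory

def load_streamed(path: str, rows: int):
    yield from pd.read_csv(path, chunksize=rows)  # one DataFrame chunk at a time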
@@ -26,29 +26,28 @@ class Batcher:
         eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
         self._token_completation = TokenCompletationTransformer(sotl,eos)
 
-    def get_batch(self):
+    def get_batch(self)-> Generator[pd.DataFrame]:
         for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
-            # each batch get 4 transformation for the 4 tasks and then shuffled
-            # now a batch is ["Abstract"], ["Triples"]
-            # tokenize the strings:
-            # batch = batch.drop(columns=['MovieID'])
             tokenized_batch = pd.DataFrame()
-            # bho = batch.map(lambda x: self._tokenizer.encode(x))
-            tokenized_batch[["Abstract","RDFs"]] = batch[["Abstract","RDFs"]].map(
-                lambda t: self._tokenizer.encode(t))
+            tokenized_batch[["Abstract","RDFs"]] = (
+                batch[["Abstract","RDFs"]]
+                .map(lambda t: self._tokenizer.encode(t))
+            )
 
             rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
             txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
             mask_batch = self.__masking_trasformation(tokenized_batch)
-            completation = self.__token_completation_task(tokenized_batch)
+            completation_batch = self.__token_completation_task(tokenized_batch)
 
-            output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation],ignore_index=True)
-            output.sample(frac=1).reset_index(drop=True)
+            output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True)
+            output = output.sample(frac=1).reset_index(drop=True)
             yield output
 
 
     def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
+        # WIP
         rng = random.Random(seed)
 
         def to_list(x):
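
Note: the two real fixes in this hunk are the `completation` → `completation_batch` rename and the shuffle assignment. `DataFrame.sample` is not in-place, so the old bare `output.sample(frac=1).reset_index(drop=True)` built a shuffled copy and threw it away. A minimal repro:

import pandas as pd

df = pd.DataFrame({"X": [1, 2, 3], "Y": [4, 5, 6]})

df.sample(frac=1, random_state=0)       # returns a shuffled copy; df is unchanged
df = df.sample(frac=1, random_state=0).reset_index(drop=True)  # copy kept, index renumbered

The `chunksize=int(self._batch_size/4)` also works out: four task views are concatenated per chunk, so each yielded frame holds roughly `batch_size` rows (the trailing comment still says 3 tasks).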
@@ -59,16 +58,13 @@ class Batcher:
         )
 
     def __rdf2txt_transformation(self, batch: pd.DataFrame):
-        # rename ["Triples"] as ["X"]
-        # rename ["Abstract"] as ["Y"]
-        # return just them
         batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
-        return batch[["X", "Y"]] #.sample(frac=1).reset_index(drop=True)
+        return batch[["X", "Y"]]
 
 
     def __txt2rdf_transformation(self, batch: pd.DataFrame):
         batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
-        return batch[["X", "Y"]]# .sample(frac=1).reset_index(drop=True)
+        return batch[["X", "Y"]]
 
     def __masking_trasformation(self, batch: pd.DataFrame):
         # mask_sequence: List[int] -> Tuple[List[int], List[int]]
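
Note: the rdf2txt/txt2rdf views differ only in which column becomes `X`; a toy run of the two renames kept by this hunk (the real frames hold token-id lists, not strings):

import pandas as pd

chunk = pd.DataFrame({"Abstract": [[5, 6]], "RDFs": [[7, 8]]})

rdf2txt = chunk.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]  # X=RDFs, Y=Abstract
txt2rdf = chunk.rename(columns={"Abstract": "X", "RDFs": "Y"})[["X", "Y"]]  # X=Abstract, Y=RDFs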
@@ -92,6 +88,7 @@ class Batcher:
 DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
 VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
 
+from pathlib import Path
 VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
 SPECIAL_LIST = BPE.default_special_tokens()
 TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
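
Note: with the late `from pathlib import Path`, the smoke-test block is self-contained. A hedged sketch of how the pieces plug together — the `SpannedMasker` arguments and the batch size are assumptions, since neither appears in this diff:

masker = SpannedMasker()                                # hypothetical: real arguments not shown in this diff
batcher = Batcher(DATASET_PATH, 512, TOKENANO, masker)  # 512 is an arbitrary batch size

first = next(batcher.get_batch())                       # one shuffled multi-task batch
print(first[["X", "Y"]].head())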