WIP Batcher
This commit is contained in:
parent
9b5bb6d5f8
commit
490edcfd53
44
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
44
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from BPE import TokeNanoCore as Tokenizer
|
||||||
|
|
||||||
|
class Batcher:
    """Stream a CSV dataset in chunks and turn each chunk into a tokenized batch.

    The CSV must expose the columns ``"Abstract"`` and ``"Triples"``; these are
    the only columns this class reads.

    Planned tasks (design notes):
        rdf2text:   X = triples,           Y = abstract
        text2rdf:   X = abstract,          Y = triples
        masking:    X = incomplete triple, Y = complete triple (via a masker)
        completion: X = triple subset,     Y = related triple subset

    Only the rdf2text and text2rdf transformations are implemented so far;
    masking is a stub and completion has no method yet.
    """

    # A requested batch is split among this many tasks, so each CSV chunk
    # holds ``batch_size // _SUPPORTED_TASKS`` source rows.
    _SUPPORTED_TASKS = 3

    def __init__(self, dataset_path: str, batch_size: int, tokenizer: "Tokenizer") -> None:
        """Store the configuration; the dataset is read lazily by ``get_batch``.

        Args:
            dataset_path: Path of the CSV file to stream.
            batch_size: Target number of rows of a full multi-task batch.
            tokenizer: Object exposing ``encode(text)``; it is applied to every
                cell of the "Abstract" and "Triples" columns.
        """
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer

    def get_batch(self):
        """Yield one tokenized rdf2text batch per chunk of the CSV.

        Yields:
            pd.DataFrame: columns ``"X"`` (encoded triples) and ``"Y"``
            (encoded abstracts) for the rows of the current chunk.
        """
        encode = self._tokenizer.encode
        # Never ask pandas for a zero-row chunk: read_csv rejects chunksize < 1.
        rows_per_chunk = max(1, self._batch_size // self._SUPPORTED_TASKS)

        # The context manager closes the file even if the consumer stops early.
        with pd.read_csv(self._dataset_path, chunksize=rows_per_chunk) as reader:
            for chunk in reader:
                # Encode each text cell; the chunk's row index is preserved.
                tokenized_batch = pd.DataFrame(
                    {
                        column: chunk[column].map(encode)
                        for column in ("Abstract", "Triples")
                    }
                )
                # TODO: merge in the other task transformations and shuffle
                # once they are implemented.
                yield self.__rdf2txt_transformation(tokenized_batch)

    def __rdf2txt_transformation(self, batch: pd.DataFrame):
        """Return ``batch`` as an rdf2text sample: "Triples" -> X, "Abstract" -> Y."""
        batch = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
        return batch[["X", "Y"]]

    def __txt2rdf_transformation(self, batch: pd.DataFrame):
        """Return ``batch`` as a text2rdf sample: "Abstract" -> X, "Triples" -> Y."""
        batch = batch.rename(columns={"Abstract": "X", "Triples": "Y"})
        return batch[["X", "Y"]]

    def __masking(self, batch: pd.DataFrame):
        """Placeholder for the masking task (incomplete triple -> complete triple).

        Raises:
            NotImplementedError: always; the task is not implemented yet.
        """
        raise NotImplementedError("masking transformation is not implemented yet")
|
||||||
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
from enum import Enum, auto
|
||||||
|
|
||||||
|
class TaskType(Enum):
    """Identifiers of the training tasks a batch can be built for."""

    # Explicit values match what consecutive auto() calls assign (1, 2, 3, 4).
    RDF2TXT = 1
    TEXT2RDF = 2
    MASKING = 3
    COMPLETATION = 4
|
||||||
Loading…
x
Reference in New Issue
Block a user