Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder
This commit is contained in:
commit
3021a51961
44
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
44
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
@ -0,0 +1,44 @@
|
||||
import pandas as pd
|
||||
|
||||
from BPE import TokeNanoCore as Tokenizer
|
||||
|
||||
class Batcher:
    """Stream a CSV dataset in chunks and turn each chunk into training batches.

    The CSV must expose an ``Abstract`` and a ``Triples`` column; both are
    tokenized with the supplied tokenizer before any task transformation.

    Planned tasks (X is the model input, Y the expected output):

    * rdf2text:   X = triples,            Y = abstract
    * text2rdf:   X = abstract,           Y = triples
    * masking:    X = incomplete triple,  Y = complete triple (not implemented)
    * completion: X = triple subset,      Y = related triple subset
      (not implemented)
    """

    def __init__(self, dataset_path: str, batch_size: int, tokenizer: "Tokenizer") -> None:
        """Store the configuration; the dataset itself is read lazily.

        Args:
            dataset_path: Path of the CSV file to stream.
            batch_size: Target number of rows per batch, shared by the tasks.
            tokenizer: Object exposing ``encode(text)``, applied to every cell
                of the ``Abstract`` and ``Triples`` columns.
        """
        # The file is NOT loaded here: get_batch() streams it chunk by chunk
        # so that the whole dataset never has to fit in memory.
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer

    def get_batch(self):
        """Yield one tokenized ``X``/``Y`` DataFrame per CSV chunk.

        Each chunk is tokenized once, passed through every implemented task
        transformation (rdf2txt, then txt2rdf) and the results are stacked
        in that order with a fresh 0..n-1 index.

        Yields:
            pandas.DataFrame with exactly the columns ``X`` and ``Y``.
        """
        # The batch budget is split across three planned tasks. pandas rejects
        # chunksize < 1, so a tiny batch_size still reads one row at a time.
        chunk_size = max(1, int(self._batch_size // 3))
        text_columns = ["Abstract", "Triples"]

        # The context manager closes the underlying file even when the
        # consumer abandons the generator before the dataset is exhausted.
        with pd.read_csv(self._dataset_path, chunksize=chunk_size) as reader:
            for chunk in reader:
                # Column-wise Series.map keeps this independent of the pandas
                # version (DataFrame.map only exists in recent releases).
                tokenized_batch = chunk[text_columns].apply(
                    lambda column: column.map(self._tokenizer.encode)
                )

                rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
                txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)

                # TODO: add the masking task once __masking is implemented,
                # and shuffle the rows if the training loop requires it.
                yield pd.concat([rdf2txt_batch, txt2rdf_batch], ignore_index=True)

    def __rdf2txt_transformation(self, batch: pd.DataFrame):
        """Return the rdf2text view of ``batch``: ``Triples`` -> X, ``Abstract`` -> Y."""
        batch = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
        return batch[["X", "Y"]]

    def __txt2rdf_transformation(self, batch: pd.DataFrame):
        """Return the text2rdf view of ``batch``: ``Abstract`` -> X, ``Triples`` -> Y."""
        batch = batch.rename(columns={"Abstract": "X", "Triples": "Y"})
        return batch[["X", "Y"]]

    def __masking(self, batch: pd.DataFrame):
        """Placeholder for the masking task (incomplete triple -> complete triple).

        Raises:
            NotImplementedError: always, until the masker is wired in.
        """
        raise NotImplementedError("masking transformation is not implemented yet")
|
||||
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
@ -0,0 +1,8 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
class TaskType(Enum):
    """Kinds of training task a batch can be built for.

    Members are numbered from 1 in declaration order.
    """

    RDF2TXT = 1
    TEXT2RDF = 2
    MASKING = 3
    COMPLETATION = 4
|
||||
Loading…
x
Reference in New Issue
Block a user