typo in Batcher

This commit is contained in:
GassiGiuseppe 2025-10-08 11:39:08 +02:00
parent 9b0c57c238
commit c9a50d50b7

View File

@ -1,12 +1,13 @@
import random
from typing import Generator
import pandas as pd
from pathlib import Path
import Project_Model.Libs.BPE as BPE
#from BPE import TokeNanoCore as Tokenizer
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
from TokenCompletation import TokenCompletationTransformer
from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
import random
class Batcher:
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
@ -16,7 +17,6 @@ class Batcher:
# text2rdf: X: ABSTRACT, Y: TRIPLE
# masking (calls the masker): X: incomplete_triple, Y: complete_triple (exam-style)
# completion: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
# self._DATASET = pd.read_csv(dataset_path)
self._dataset_path = dataset_path
self._batch_size = batch_size
self._tokenizer = tokenizer
@ -26,29 +26,28 @@ class Batcher:
eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
self._token_completation = TokenCompletationTransformer(sotl,eos)
def get_batch(self):
def get_batch(self)-> Generator[pd.DataFrame]:
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
# each batch gets 4 transformations, one per task, and is then shuffled
# a batch is currently ["Abstract"], ["Triples"]
# tokenize the strings:
# batch = batch.drop(columns=['MovieID'])
tokenized_batch = pd.DataFrame()
# bho = batch.map(lambda x: self._tokenizer.encode(x))
tokenized_batch[["Abstract","RDFs"]] = batch[["Abstract","RDFs"]].map(
lambda t: self._tokenizer.encode(t))
tokenized_batch[["Abstract","RDFs"]] = (
batch[["Abstract","RDFs"]]
.map(lambda t: self._tokenizer.encode(t))
)
rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
mask_batch = self.__masking_trasformation(tokenized_batch)
completation = self.__token_completation_task(tokenized_batch)
completation_batch = self.__token_completation_task(tokenized_batch)
output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation],ignore_index=True)
output.sample(frac=1).reset_index(drop=True)
output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True)
output = output.sample(frac=1).reset_index(drop=True)
yield output
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
# WIP
rng = random.Random(seed)
def to_list(x):
@ -59,16 +58,13 @@ class Batcher:
)
def __rdf2txt_transformation(self, batch: pd.DataFrame):
# rename ["Triples"] as ["X"]
# rename ["Abstract"] as ["Y"]
# return just them
batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
return batch[["X", "Y"]] #.sample(frac=1).reset_index(drop=True)
return batch[["X", "Y"]]
def __txt2rdf_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
return batch[["X", "Y"]]# .sample(frac=1).reset_index(drop=True)
return batch[["X", "Y"]]
def __masking_trasformation(self, batch: pd.DataFrame):
# mask_sequence: List[int] -> Tuple[List[int], List[int]]
@ -92,6 +88,7 @@ class Batcher:
DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
from pathlib import Path
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
SPECIAL_LIST = BPE.default_special_tokens()
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)