import pandas as pd
|
||
|
|
|
||
|
|
from BPE import TokeNanoCore as Tokenizer
|
||
|
|
|
||
|
|
class Batcher:
|
||
|
|
|
||
|
|
def __init__(self, dataset_path: str, batch_size:int, tokenizer: Tokenizer) -> None:
|
||
|
|
# ABSTRACT, TRIPLE
|
||
|
|
# tasks:
|
||
|
|
# rdf2text: X: TRIPLE, Y: ABSTRACT
|
||
|
|
# text2rdf: X: ABSTRACT, X:TRIPLE
|
||
|
|
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
|
||
|
|
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
|
||
|
|
# self._DATASET = pd.read_csv(dataset_path)
|
||
|
|
self._dataset_path = dataset_path
|
||
|
|
self._batch_size = batch_size
|
||
|
|
self._tokenizer = tokenizer
|
||
|
|
|
||
|
|
def get_batch(self):
|
||
|
|
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/3)): #now we support 3 task
|
||
|
|
# each batch get 4 transformation for the 4 tasks and then shuffled
|
||
|
|
# now a batch is ["Abstract"], ["Triples"]
|
||
|
|
# tokenize the strings:
|
||
|
|
tokenized_batch
|
||
|
|
tokenized_batch[["Abstract","Triples"]] = batch[["Abstract","Triples"]].map(lambda t: self._tokenizer.encode(t))
|
||
|
|
# ??? i hope this works, later will be tested
|
||
|
|
rdf2_txt_batch = self.__rdf2txt_transformation(tokenized_batch)
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
||
|
|
# rename ["Triples"] as ["X"]
|
||
|
|
# rename ["Abstract"] as ["Y"]
|
||
|
|
# return just them
|
||
|
|
batch = batch.rename(columns={"Triples": "X", "Abstract": "Y"})
|
||
|
|
return batch[["X", "Y"]] #.sample(frac=1).reset_index(drop=True)
|
||
|
|
|
||
|
|
|
||
|
|
def __txt2rdf_transformation(self, batch: pd.DataFrame):
|
||
|
|
batch = batch.rename(columns={ "Abstract": "X","Triples": "Y"})
|
||
|
|
return batch[["X", "Y"]]# .sample(frac=1).reset_index(drop=True)
|
||
|
|
|
||
|
|
def __masking()
|