Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder
This commit is contained in:
commit
0560bc439a
@ -3,7 +3,7 @@ from pathlib import Path
|
|||||||
import Project_Model.Libs.BPE as BPE
|
import Project_Model.Libs.BPE as BPE
|
||||||
#from BPE import TokeNanoCore as Tokenizer
|
#from BPE import TokeNanoCore as Tokenizer
|
||||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
import random
|
||||||
class Batcher:
|
class Batcher:
|
||||||
|
|
||||||
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None:
|
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None:
|
||||||
@ -38,9 +38,14 @@ class Batcher:
|
|||||||
yield output
|
yield output
|
||||||
|
|
||||||
|
|
||||||
def __random_subset_rdfs(self, batch: pd.DataFrame):
|
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
|
||||||
|
rng = random.Random(seed)
|
||||||
|
|
||||||
|
def to_list(x):
|
||||||
|
return x.split(SpecialToken.START_TRIPLE.value)[1:]
|
||||||
|
|
||||||
batch["RDFs"] = batch["RDFs"].map(
|
batch["RDFs"] = batch["RDFs"].map(
|
||||||
lambda x: x.split(SpecialToken.START_TRIPLE.value)[1:]
|
to_list
|
||||||
)
|
)
|
||||||
|
|
||||||
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user