Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder
This commit is contained in:
commit
0560bc439a
@ -3,7 +3,7 @@ from pathlib import Path
|
||||
import Project_Model.Libs.BPE as BPE
|
||||
#from BPE import TokeNanoCore as Tokenizer
|
||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||
|
||||
import random
|
||||
class Batcher:
|
||||
|
||||
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore) -> None:
|
||||
@ -38,9 +38,14 @@ class Batcher:
|
||||
yield output
|
||||
|
||||
|
||||
def __random_subset_rdfs(self, batch: pd.DataFrame):
|
||||
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
|
||||
rng = random.Random(seed)
|
||||
|
||||
def to_list(x):
|
||||
return x.split(SpecialToken.START_TRIPLE.value)[1:]
|
||||
|
||||
batch["RDFs"] = batch["RDFs"].map(
|
||||
lambda x: x.split(SpecialToken.START_TRIPLE.value)[1:]
|
||||
to_list
|
||||
)
|
||||
|
||||
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user