Update the cleaning pipeline with a new method that also filters by number of films;
also update the signature of the pipeline.
This commit is contained in:
parent
c2f9344c82
commit
bbadd4c521
@ -73,6 +73,10 @@ class PipelineApplier():
|
|||||||
return RDF
|
return RDF
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_movie_list(self, starting_offset: int, ending_offset: int) -> None:
    """Keep only the rows of ``self.MOVIE_FILTER`` in ``[starting_offset, ending_offset)``.

    The window is positional (``.iloc``). ``ending_offset`` is capped at the
    current number of rows, so an oversized end keeps everything from
    ``starting_offset`` onwards. The selected rows are stored back on
    ``self.MOVIE_FILTER`` as a copy, replacing the previous value.

    Args:
        starting_offset: 0-based position of the first row to keep.
        ending_offset: Position just past the last row to keep.

    Raises:
        ValueError: If either offset is negative. A negative value would be
            interpreted by ``.iloc`` as counting from the end and would
            silently select a different window than the one requested.
    """
    if starting_offset < 0 or ending_offset < 0:
        raise ValueError(
            "starting_offset and ending_offset must be non-negative, got "
            f"{starting_offset} and {ending_offset}"
        )

    # Cap the end so the slice never asks for more rows than are available.
    end = min(len(self.MOVIE_FILTER), ending_offset)
    # .copy() detaches the result from the original frame, so later writes
    # do not target a view of the full filter.
    self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()
|
||||||
|
|
||||||
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
||||||
# dataset has SubjectURI RelationshipURI ObjectURI
|
# dataset has SubjectURI RelationshipURI ObjectURI
|
||||||
# want to drop the '' in them
|
# want to drop the '' in them
|
||||||
|
|||||||
@ -10,13 +10,19 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
class Pipeline():
|
class Pipeline():
|
||||||
def __init__(self):
|
def __init__(self,
|
||||||
|
mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv",
|
||||||
|
bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt",
|
||||||
|
text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv",
|
||||||
|
completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv",
|
||||||
|
|
||||||
|
):
|
||||||
self.sql_endpoint = SqlEndpoint()
|
self.sql_endpoint = SqlEndpoint()
|
||||||
# classes to manage the tasks' datasets
|
# classes to manage the tasks' datasets
|
||||||
self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
|
self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
|
||||||
self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
|
self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
|
||||||
self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
|
self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
|
||||||
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
|
self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path)
|
||||||
|
|
||||||
# prepare the filter
|
# prepare the filter
|
||||||
# the filter applier needs to know the frequency of Movies and Relationships across the whole Dataset
|
# the filter applier needs to know the frequency of Movies and Relationships across the whole Dataset
|
||||||
@ -113,6 +119,9 @@ class Pipeline():
|
|||||||
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
|
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
|
||||||
self.sql_endpoint.movie_ids = movie_list
|
self.sql_endpoint.movie_ids = movie_list
|
||||||
|
|
||||||
|
def reduce_movie_list(self, starting_offset: int, ending_offset: int):
    """Forward the requested offset window to the filter applier.

    Both offsets are passed through unchanged to
    ``self.filter_applier.reduce_movie_list``; nothing is returned.
    """
    applier = self.filter_applier
    applier.reduce_movie_list(starting_offset, ending_offset)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# there are a lot of settings to manage
|
# there are a lot of settings to manage
|
||||||
@ -121,11 +130,11 @@ class Pipeline():
|
|||||||
# in the use_toy_dataset , to change the toy dataset
|
# in the use_toy_dataset , to change the toy dataset
|
||||||
# in _get_cleaned_movie_rows: to change how the pipeline behaves
|
# in _get_cleaned_movie_rows: to change how the pipeline behaves
|
||||||
|
|
||||||
pipeline = Pipeline()
|
#pipeline = Pipeline()
|
||||||
|
|
||||||
# pipeline.use_toy_dataset()
|
# pipeline.use_toy_dataset()
|
||||||
# pipeline.execute_task_bpe_corpus()
|
# pipeline.execute_task_bpe_corpus()
|
||||||
# pipeline.execute_task_rdf_mask()
|
# pipeline.execute_task_rdf_mask()
|
||||||
# pipeline.execute_tasks_rdf_text()
|
# pipeline.execute_tasks_rdf_text()
|
||||||
# pipeline.execute_task_rdf_completation()
|
# pipeline.execute_task_rdf_completation()
|
||||||
pipeline.execute_all_task()
|
# pipeline.execute_all_task()
|
||||||
Loading…
x
Reference in New Issue
Block a user