From bbadd4c521fdc2ef4b867d12b597c4703af336d5 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Sat, 4 Oct 2025 19:00:05 +0200 Subject: [PATCH] update cleaning pipeline with a new method to filter also by number of films, also updated the signature of the pipeline --- Scripts/DataCleaning/filter.py | 4 ++++ Scripts/DataCleaning/pipeline.py | 23 ++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py index 50d6ead..317ea6b 100644 --- a/Scripts/DataCleaning/filter.py +++ b/Scripts/DataCleaning/filter.py @@ -73,6 +73,10 @@ class PipelineApplier(): return RDF + def reduce_movie_list(self, starting_offset:int , ending_offset:int): + end = min(len(self.MOVIE_FILTER), ending_offset) + self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy() + def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame: # dataset has SubjectURI RelationshipURI ObjectURI # want to drop the '' in them diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py index eb5b2f7..153f127 100644 --- a/Scripts/DataCleaning/pipeline.py +++ b/Scripts/DataCleaning/pipeline.py @@ -10,13 +10,19 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co import pandas as pd class Pipeline(): - def __init__(self): + def __init__(self, + mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv", + bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt", + text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv", + completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv", + + ): self.sql_endpoint = SqlEndpoint() # classes to manage taskes' datasets - self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv") - self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt") - self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") - self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") + self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path) + self.task_bpe_corpus = BPE_corpus(bpe_corpus_path) + self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path) + self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path) # prepare the filter # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset @@ -113,6 +119,9 @@ class Pipeline(): movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] self.sql_endpoint.movie_ids = movie_list + def reduce_movie_list(self, starting_offset:int , ending_offset:int): + self.filter_applier.reduce_movie_list(starting_offset,ending_offset) + # there are a lot of settings to manage @@ -121,11 +130,11 @@ class Pipeline(): # in the use_toy_dataset , to change the toy dataset # in _get_cleaned_movie_rows: to change how the pipeline behave -pipeline = Pipeline() +#pipeline = Pipeline() # pipeline.use_toy_dataset() # pipeline.execute_task_bpe_corpus() # pipeline.execute_task_rdf_mask() # pipeline.execute_tasks_rdf_text() # pipeline.execute_task_rdf_completation() -pipeline.execute_all_task() \ No newline at end of file +# pipeline.execute_all_task() \ No newline at end of file