From 64e355e80c7d6c84747448194098a7305eb9cf43 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Tue, 30 Sep 2025 15:00:07 +0200 Subject: [PATCH 1/2] Added regex to delete new lines and * from ObjectURI --- Scripts/DataCleaning/filter.py | 6 ++++++ Scripts/DataCleaning/pipeline.py | 2 ++ 2 files changed, 8 insertions(+) diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py index 50d6ead..c555e3d 100644 --- a/Scripts/DataCleaning/filter.py +++ b/Scripts/DataCleaning/filter.py @@ -182,3 +182,9 @@ class PipelineApplier(): # as input two dataframe, one with 2 column return None + def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame: + RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string") + .str.replace(r"\r?\n+", ", ", regex=True) # newlines -> ", " + .str.replace(r"\*", "", regex=True)) # delete all asterisks + + return RDF \ No newline at end of file diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py index eb5b2f7..48a0af3 100644 --- a/Scripts/DataCleaning/pipeline.py +++ b/Scripts/DataCleaning/pipeline.py @@ -92,6 +92,8 @@ class Pipeline(): # other filter # RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + # regex on ObjectURI + RDF = self.filter_applier.regex_on_objects(RDF) if RDF.empty: continue RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE From 69fba7c3e97825d48ab9ad1d48173e1bc69be913 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Sat, 4 Oct 2025 21:33:09 +0200 Subject: [PATCH 2/2] new utility to generate a csv debug file of the output of the pipeline --- .../data_output_models/debug_csv.py | 21 +++++++++++++++++++ Scripts/DataCleaning/pipeline.py | 13 ++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 Scripts/DataCleaning/data_output_models/debug_csv.py diff --git a/Scripts/DataCleaning/data_output_models/debug_csv.py b/Scripts/DataCleaning/data_output_models/debug_csv.py new file mode 100644 index 0000000..c120765 --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/debug_csv.py @@ -0,0 +1,21 @@ +import pandas as pd + +class Debug_csv(): + def __init__(self, output_path:str): + + + self.output = open(output_path, "w") + # then the first row as header + header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"] + self.output.write(",".join(header) + "\n") + + def close(self): + self.output.close() + + def write(self, RDF: pd.DataFrame): + """ + Args: + RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"] + """ + + RDF.to_csv(self.output, index=False, header=False) \ No newline at end of file diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py index 48a0af3..0106b10 100644 --- a/Scripts/DataCleaning/pipeline.py +++ b/Scripts/DataCleaning/pipeline.py @@ -6,6 +6,7 @@ from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_ from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset +from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv import pandas as pd @@ -115,6 +116,13 @@ class Pipeline(): movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] self.sql_endpoint.movie_ids = movie_list + def generate_csv_debug_file(self, debug_path:str): + debug_csv = Debug_csv(debug_path) + + for RDF in self._get_cleaned_movie_rows(): + debug_csv.write(RDF) + + debug_csv.close() # there are a lot of settings to manage @@ -125,9 +133,10 @@ class Pipeline(): pipeline = Pipeline() -# pipeline.use_toy_dataset() +pipeline.use_toy_dataset() # pipeline.execute_task_bpe_corpus() # pipeline.execute_task_rdf_mask() # pipeline.execute_tasks_rdf_text() # pipeline.execute_task_rdf_completation() -pipeline.execute_all_task() \ No newline at end of file +# pipeline.execute_all_task() +pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv") \ No newline at end of file