From 64e355e80c7d6c84747448194098a7305eb9cf43 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Tue, 30 Sep 2025 15:00:07 +0200
Subject: [PATCH 1/2] Added regex to replace newlines with ", " and delete * in ObjectURI

---
 Scripts/DataCleaning/filter.py   | 6 ++++++
 Scripts/DataCleaning/pipeline.py | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py
index 50d6ead..c555e3d 100644
--- a/Scripts/DataCleaning/filter.py
+++ b/Scripts/DataCleaning/filter.py
@@ -182,3 +182,9 @@ class PipelineApplier():
         # as input two dataframe, one with 2 column  
         return None
 
+    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Clean RDF["ObjectURI"] in place (line-break runs -> ", ", asterisks removed) and return RDF."""
+        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
+                   .str.replace(r"(?:\r?\n)+", ", ", regex=True)   # a whole LF/CRLF run -> one ", "
+                   .str.replace(r"\*", "", regex=True))        # delete all asterisks
+        return RDF
\ No newline at end of file
diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
index eb5b2f7..48a0af3 100644
--- a/Scripts/DataCleaning/pipeline.py
+++ b/Scripts/DataCleaning/pipeline.py
@@ -92,6 +92,8 @@ class Pipeline():
             # other filter
             #
             RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
+            # regex on ObjectURI
+            RDF = self.filter_applier.regex_on_objects(RDF)
             if RDF.empty:
                 continue
             RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE

From 69fba7c3e97825d48ab9ad1d48173e1bc69be913 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Sat, 4 Oct 2025 21:33:09 +0200
Subject: [PATCH 2/2] new utility to generate a csv debug file of the output of
 the pipeline

---
 .../data_output_models/debug_csv.py           | 21 +++++++++++++++++++
 Scripts/DataCleaning/pipeline.py              | 13 ++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100644 Scripts/DataCleaning/data_output_models/debug_csv.py

diff --git a/Scripts/DataCleaning/data_output_models/debug_csv.py b/Scripts/DataCleaning/data_output_models/debug_csv.py
new file mode 100644
index 0000000..c120765
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/debug_csv.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+class Debug_csv():
+    """Append pipeline DataFrames to one CSV file for debugging; usable as a context manager."""
+    def __init__(self, output_path:str):
+        self.output = open(output_path, "w", encoding="utf-8", newline="")  # newline="" as pandas expects for handles
+        header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        pd.DataFrame(columns=header).to_csv(self.output, index=False)  # header row only, same terminator as the data rows
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc_info):
+        self.close()  # always release the handle; any exception still propagates
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """Append the rows of RDF (expected columns: the header above) without index or header."""
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file
diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
index 48a0af3..0106b10 100644
--- a/Scripts/DataCleaning/pipeline.py
+++ b/Scripts/DataCleaning/pipeline.py
@@ -6,6 +6,7 @@ from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_
 from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
 from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
 from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
+from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
 
 import pandas as pd
 
@@ -115,6 +116,13 @@ class Pipeline():
         movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
         self.sql_endpoint.movie_ids = movie_list
 
+    def generate_csv_debug_file(self, debug_path:str):  # dump every cleaned DataFrame to a CSV at debug_path
+        debug_csv = Debug_csv(debug_path)
+        try:
+            for RDF in self._get_cleaned_movie_rows():
+                debug_csv.write(RDF)
+        finally:
+            debug_csv.close()  # runs even if the pipeline raises, so the file handle is never leaked
 
 
 # there are a lot of settings to manage
@@ -125,9 +133,10 @@ class Pipeline():
 
 pipeline = Pipeline()
 
-# pipeline.use_toy_dataset()
+pipeline.use_toy_dataset()  # NOTE(review): presumably limits the run to a small fixed movie list - confirm it should stay enabled
 # pipeline.execute_task_bpe_corpus()
 # pipeline.execute_task_rdf_mask()
 # pipeline.execute_tasks_rdf_text()
 # pipeline.execute_task_rdf_completation()
-pipeline.execute_all_task()
\ No newline at end of file
+# pipeline.execute_all_task()
+pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")  # NOTE(review): module-level call, runs on import; path is relative to the working directory
\ No newline at end of file