Added file to execute the complete cleaning pipeline
This commit is contained in:
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
|
||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||
import pandas as pd
|
||||
|
||||
class BPE_corpus():
    """Accumulate raw text into a single corpus file for BPE training.

    The output file is opened on construction and must be finalized with
    ``close()``, which appends the corpus-end special token before
    releasing the handle. The class also works as a context manager so
    the handle cannot leak if an exception interrupts the pipeline.
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): path of the corpus file to (over)write.
        """
        # Explicit encoding: the default text encoding is platform-dependent.
        self.output_handler = open(output_path, "w", encoding="utf-8")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always finalize, even when the with-body raised.
        self.close()
        return False

    def close(self):
        # Add the corpus-end marker before closing so consumers can detect
        # a complete (non-truncated) corpus.
        self.output_handler.write(SpecialToken.CORPUS_END.value)
        self.output_handler.close()

    def write_from_str(self, output: str):
        """Append *output* to the corpus; empty strings are skipped."""
        if not output:
            return
        self.output_handler.write(output)

    def write_from_df(self, df: pd.DataFrame):
        """Convert *df* to raw text via the shared helper and append it."""
        self.write_from_str(get_raw_from_dataframe(df))
|
||||
@@ -0,0 +1,26 @@
|
||||
import pandas as pd
|
||||
|
||||
class RDF_completation_task_dataset():
    """
    Write the CSV for the fourth task, which is "Predicting subsequent
    triples based on a given context".

    Each RDF is saved as str.
    CSV Composition: ["MovieID","RDF"]
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): destination path of the CSV file.
        """
        # newline="" is required when writing CSV data (avoids doubled
        # line endings on Windows); encoding is made explicit so the
        # output does not depend on the platform default.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # Write the header once; subsequent rows are appended headerless.
        header = ["MovieID", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Flush and release the underlying file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of *RDF* to the CSV, without repeating the header.

        Args:
            RDF (pd.DataFrame): ["MovieID","RDF"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import pandas as pd
|
||||
|
||||
# do not worry about circular dependencies, this class will never call something else
|
||||
from Scripts.DataCleaning.filter import PipelineApplier
|
||||
|
||||
class RDF_mask_task_dataset():
    """
    Write the CSV for the third task, which is "Predicting a masked
    component within an RDF triple".

    For each RDF three rows are emitted, each with a different component
    (subject, relationship, object) removed.
    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): destination path of the CSV file.
        """
        # These helpers are only used by this class but belong to a lower
        # layer, so they are bound here instead of re-implemented.
        # (No circular-dependency risk: this class never calls back into
        # the pipeline.)
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" is required when writing CSV data; encoding is made
        # explicit so the output does not depend on the platform default.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # Write the header once; subsequent rows are appended headerless.
        header = ["MovieID", "IncompleteRDF", "Missing", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Flush and release the underlying file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Emit three masked rows (subject, relationship, object — in this
        order, which is part of the output format) for each input RDF.

        Args:
            RDF (pd.DataFrame): expected to contain at least
                ["MovieID","SubjectURI","RelationshipURI","ObjectURI"].
        """
        rdf_complete = self._build_triple(RDF)

        # One frame per masked component, built uniformly instead of the
        # previous copy-pasted trio.
        masked_frames = []
        for column in ("SubjectURI", "RelationshipURI", "ObjectURI"):
            incomplete = self._build_incomplete_triple(RDF.drop(columns=[column]))
            masked_frames.append(pd.DataFrame({
                "MovieID": RDF["MovieID"],
                "IncompleteRDF": incomplete,
                "Missing": RDF[column],
                "RDF": rdf_complete,
            }))

        output_df = pd.concat(masked_frames, ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)
|
||||
|
||||
|
||||
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import pandas as pd
|
||||
|
||||
class RDF_text_task_dataset():
    """
    Write the CSV for the first two tasks: "Generating structured RDF
    triples from natural language text" and the reverse direction.

    In the CSV all the RDFs of a movie are saved together as one string.
    CSV Composition: ["MovieID","RDFs","Abstract"]
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): destination path of the CSV file.
        """
        # newline="" is required when writing CSV data (avoids doubled
        # line endings on Windows); encoding is made explicit so the
        # output does not depend on the platform default.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # Write the header once; subsequent rows are appended headerless.
        header = ["MovieID", "RDFs", "Abstract"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Flush and release the underlying file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of *RDF* to the CSV, without repeating the header.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||
Reference in New Issue
Block a user