NanoSocrates/Scripts/DataCleaning/data_output_models/rdf_mask_task.py

import pandas as pd

# do not worry about circular dependencies, this class will never call something else
from Scripts.DataCleaning.legacy.filter import PipelineApplier

class RDF_mask_task_dataset():
    """
        Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
        The CSV is like: for each RDF there will be 3 rows, where every time one of the components
        (subject, relationship, object) is missing.
        CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]

        The header row is written as soon as the object is created; every call to
        write() appends rows to the same file. The object can be used as a context
        manager so that the file is closed even when an error occurs:

            with RDF_mask_task_dataset(path) as writer:
                writer.write(rdf_chunk)
    """

    # column order of the produced CSV
    COLUMNS = ("MovieID", "IncompleteRDF", "Missing", "RDF")
    # input columns that get masked, in the order their groups of rows are written
    MASKED_COLUMNS = ("SubjectURI", "RelationshipURI", "ObjectURI")

    def __init__(self, output_path: str):
        """
            Open `output_path` for writing (truncating it) and write the header row.
        """
        # these methods will only be used by this class, but they belong in a lower level
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" leaves the line endings to the csv writer used by pandas
        # (otherwise "\r\n" would be translated a second time on some platforms);
        # the explicit encoding makes the output independent from the locale
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        try:
            # then the first row as header
            self._write_header()
        except Exception:
            # do not leak the handle if the header could not be written
            self.output.close()
            raise

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def _write_header(self):
        # an empty frame is used so that the header goes through the same csv
        # writer (same quoting and line terminator) as the data rows
        pd.DataFrame(columns=list(self.COLUMNS)).to_csv(self.output, index=False)

    def close(self):
        """
            Close the output file. Calling it more than once is harmless.
        """
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
            Append to the CSV three rows for every row of `RDF`: one with the subject
            masked, one with the relationship masked and one with the object masked.
            Rows are grouped by masked component (all subjects first, then all
            relationships, then all objects).

            `RDF` must contain the columns "MovieID", "SubjectURI", "RelationshipURI"
            and "ObjectURI".
        """
        rdf_complete = self._build_triple(RDF)

        frames = [
            pd.DataFrame({
                "MovieID": RDF["MovieID"],
                # the triple is rebuilt from the frame without the masked column
                "IncompleteRDF": self._build_incomplete_triple(RDF.drop(columns=[masked])),
                "Missing": RDF[masked],
                "RDF": rdf_complete,
            })
            for masked in self.MASKED_COLUMNS
        ]

        output_df = pd.concat(frames, ignore_index=True)
        # the header was already written when the file was opened
        output_df.to_csv(self.output, index=False, header=False)
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`import pandas as pd`

			`# do not worry about circular dependencies, this class will never call something else`
updated the mask rdf_mask_task. however since the model will build the mask itself, it is deprecated 2025-10-05 14:56:33 +02:00			`from Scripts.DataCleaning.legacy.filter import PipelineApplier`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00
			`class RDF_mask_task_dataset():`
			`"""`
			`Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".`
			`The CSV is like: for each RDF there will be 3 rows, where every time one of the components is missing.`
			`CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]`
			`"""`
			`def __init__(self, output_path:str):`

			`# these methods will only be used by this class, but they belong in a lower level`
			`self._build_triple = PipelineApplier.build_triple`
			`self._build_incomplete_triple = PipelineApplier.build_incomplete_triple`

			`self.output = open(output_path, "w")`
			`# then the first row as header`
			`header = ["MovieID","IncompleteRDF","Missing","RDF"]`
			`self.output.write(",".join(header) + "\n")`

			`def close(self):`
			`self.output.close()`

			`def write(self, RDF: pd.DataFrame):`
			`rdf_complete = self._build_triple(RDF)`

			`rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))`
			`rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))`
			`rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))`
			`####`
			`df_subject = pd.DataFrame({`
			`"MovieID": RDF["MovieID"],`
			`"IncompleteRDF": rdf_without_subject,`
			`"Missing": RDF["SubjectURI"],`
			`"RDF": rdf_complete,`
			`})`

			`df_relationship = pd.DataFrame({`
			`"MovieID": RDF["MovieID"],`
			`"IncompleteRDF": rdf_without_relationship,`
			`"Missing": RDF["RelationshipURI"],`
			`"RDF": rdf_complete,`
			`})`

			`df_object = pd.DataFrame({`
			`"MovieID": RDF["MovieID"],`
			`"IncompleteRDF": rdf_without_object,`
			`"Missing": RDF["ObjectURI"],`
			`"RDF": rdf_complete,`
			`})`


			`output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)`
			`output_df.to_csv(self.output, index=False, header=False)`