import pandas as pd

# Do not worry about circular dependencies: this class never calls anything
# else from that module.
from Scripts.DataCleaning.legacy.filter import PipelineApplier


class RDF_mask_task_dataset():
    """Write the CSV for the third task: predicting a masked RDF component.

    For every RDF triple three rows are written; in each row one of the
    components (subject, relationship, object) is missing from
    ``IncompleteRDF`` and reported in ``Missing``.

    CSV composition: ["MovieID", "IncompleteRDF", "Missing", "RDF"]

    The instance owns an open file handle. Either call :meth:`close` when
    done, or use the instance as a context manager::

        with RDF_mask_task_dataset(path) as dataset:
            dataset.write(rdf_chunk)
    """

    # Column order of the generated CSV.
    COLUMNS = ["MovieID", "IncompleteRDF", "Missing", "RDF"]

    # Columns that are masked, one at a time, in the order their row groups
    # are written for each call to ``write``.
    MASKED_COLUMNS = ("SubjectURI", "RelationshipURI", "ObjectURI")

    def __init__(self, output_path: str) -> None:
        """Open ``output_path`` for writing and emit the CSV header row.

        Args:
            output_path: Destination of the CSV file. An existing file is
                truncated.
        """
        # These helpers are only used by this class, but they belong to a
        # lower level, so they are borrowed from PipelineApplier.
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" is required by pandas when handing it a text-mode file
        # object (otherwise line endings are doubled on Windows); UTF-8 is
        # pinned so the output does not depend on the platform default.
        self.output = open(output_path, "w", newline="", encoding="utf-8")

        # Write the header through pandas so it uses the same line
        # terminator and quoting rules as the data rows.
        pd.DataFrame(columns=self.COLUMNS).to_csv(self.output, index=False)

    def __enter__(self) -> "RDF_mask_task_dataset":
        """Return the dataset itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the output file when leaving the ``with`` block."""
        self.close()

    def close(self) -> None:
        """Close the underlying output file."""
        self.output.close()

    def write(self, RDF: pd.DataFrame) -> None:
        """Append the masked-triple rows for ``RDF`` to the CSV.

        Args:
            RDF: Frame that must contain the columns "MovieID",
                "SubjectURI", "RelationshipURI" and "ObjectURI".

        Rows are written grouped by masked component: first all rows with
        the subject missing, then the relationship, then the object.
        """
        # NOTE(review): build_triple / build_incomplete_triple are presumed
        # to return one value per input row, aligned with RDF's index --
        # confirm against PipelineApplier.
        rdf_complete = self._build_triple(RDF)

        masked_frames = []
        for masked_column in self.MASKED_COLUMNS:
            incomplete = self._build_incomplete_triple(
                RDF.drop(columns=[masked_column])
            )
            masked_frames.append(
                pd.DataFrame({
                    "MovieID": RDF["MovieID"],
                    "IncompleteRDF": incomplete,
                    "Missing": RDF[masked_column],
                    "RDF": rdf_complete,
                })
            )

        output_df = pd.concat(masked_frames, ignore_index=True)
        # The header was already written in __init__.
        output_df.to_csv(self.output, index=False, header=False)