59 lines
2.2 KiB
Python

import pandas as pd
# do not worry about circular dependencies, this class will never call something else
from Scripts.DataCleaning.filter import PipelineApplier
class RDF_mask_task_dataset():
"""
Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
The CSV is like: for each RDF there will be 3 rows, where every time one of the componments is missing.
CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
"""
def __init__(self, output_path:str):
# this methods will only be used by this class, but they belong in a lower level
self._build_triple = PipelineApplier.build_triple
self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
self.output = open(output_path, "w")
# then the first row as header
header = ["MovieID","IncompleteRDF","Missing","RDF"]
self.output.write(",".join(header) + "\n")
def close(self):
self.output.close()
def write(self, RDF: pd.DataFrame):
rdf_complete = self._build_triple(RDF)
rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
####
df_subject = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_subject,
"Missing": RDF["SubjectURI"],
"RDF": rdf_complete,
})
df_relationship = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_relationship,
"Missing": RDF["RelationshipURI"],
"RDF": rdf_complete,
})
df_object = pd.DataFrame({
"MovieID": RDF["MovieID"],
"IncompleteRDF": rdf_without_object,
"Missing": RDF["ObjectURI"],
"RDF": rdf_complete,
})
output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
output_df.to_csv(self.output, index=False, header=False)