2025-09-29 15:21:26 +02:00
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
# Do not worry about circular dependencies: this class will never call anything else.
|
2025-10-05 14:56:33 +02:00
|
|
|
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
2025-09-29 15:21:26 +02:00
|
|
|
|
|
|
|
|
class RDF_mask_task_dataset:
    """
    Write the CSV for the third task: "Predicting a masked component within an RDF triple".

    For every RDF triple the CSV holds 3 rows; in each row a different
    component (subject, relationship, object) is the masked one.
    Within a single ``write`` call the rows are grouped by masked component:
    first all subject-masked rows, then relationship-masked, then object-masked.

    CSV composition: ["MovieID","IncompleteRDF","Missing","RDF"]

    The instance owns an open file handle; release it with ``close()`` or by
    using the instance as a context manager (``with RDF_mask_task_dataset(p) as ds:``).
    """

    # Column order of the output CSV (also the header row).
    _HEADER = ("MovieID", "IncompleteRDF", "Missing", "RDF")
    # Input columns that get masked, in the order their rows are emitted.
    _MASKABLE_COLUMNS = ("SubjectURI", "RelationshipURI", "ObjectURI")

    def __init__(self, output_path: str):
        """
        Open ``output_path`` for writing (truncating it) and emit the header row.

        Args:
            output_path: destination path of the CSV file.
        """
        # these methods will only be used by this class, but they belong in a lower level
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" is required when a text handle is handed to DataFrame.to_csv,
        # otherwise platforms that translate "\n" end up with doubled line endings.
        # The explicit encoding keeps the output independent of the machine locale.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        try:
            # then the first row as header; written through pandas so that it uses
            # the same line terminator as the data rows appended by write()
            pd.DataFrame(columns=list(self._HEADER)).to_csv(self.output, index=False)
        except Exception:
            # do not leak the handle if the header cannot be written
            self.output.close()
            raise

    def __enter__(self):
        """Return the dataset itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the output file when leaving the ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Close the underlying output file."""
        self.output.close()

    def write(self, RDF: pd.DataFrame) -> None:
        """
        Append the masked-triple rows for one chunk of RDF triples.

        Args:
            RDF: frame that provides at least the columns "MovieID",
                "SubjectURI", "RelationshipURI" and "ObjectURI".

        Raises:
            KeyError: if one of the required columns is missing.
        """
        if RDF.empty:
            # nothing to emit; skip the triple builders entirely
            return

        rdf_complete = self._build_triple(RDF)

        frames = []
        for masked_column in self._MASKABLE_COLUMNS:
            # the incomplete triple is built from the frame without the masked column
            rdf_incomplete = self._build_incomplete_triple(
                RDF.drop(columns=[masked_column])
            )
            frames.append(
                pd.DataFrame(
                    {
                        "MovieID": RDF["MovieID"],
                        "IncompleteRDF": rdf_incomplete,
                        "Missing": RDF[masked_column],
                        "RDF": rdf_complete,
                    }
                )
            )

        output_df = pd.concat(frames, ignore_index=True)
        # the header was already written by __init__
        output_df.to_csv(self.output, index=False, header=False)
|
|
|
|
|
|
|
|
|
|
|