Added file to execute the complete cleaning pipeline
This commit is contained in:
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
@@ -0,0 +1,21 @@
|
||||
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
|
||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||
import pandas as pd
|
||||
|
||||
class BPE_corpus():
    """Accumulate raw text into a single corpus file for BPE training.

    The output file is opened on construction and must be finalized with
    ``close()``, which appends the corpus-end special token before
    releasing the handle. The class also works as a context manager so
    the handle cannot leak if an exception interrupts the pipeline.
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): path of the corpus file to (over)write.
        """
        # Explicit encoding: the default text encoding is platform-dependent.
        self.output_handler = open(output_path, "w", encoding="utf-8")

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Always finalize, even when the with-body raised.
        self.close()
        return False

    def close(self):
        # Add the corpus-end marker before closing so consumers can detect
        # a complete (non-truncated) corpus.
        self.output_handler.write(SpecialToken.CORPUS_END.value)
        self.output_handler.close()

    def write_from_str(self, output: str):
        """Append *output* to the corpus; empty strings are skipped."""
        if not output:
            return
        self.output_handler.write(output)

    def write_from_df(self, df: pd.DataFrame):
        """Convert *df* to raw text via the shared helper and append it."""
        self.write_from_str(get_raw_from_dataframe(df))
|
||||
@@ -0,0 +1,26 @@
|
||||
import pandas as pd
|
||||
|
||||
class RDF_completation_task_dataset():
    """
    Write the CSV for the fourth task, which is "Predicting subsequent
    triples based on a given context".

    Each RDF is saved as str.
    CSV Composition: ["MovieID","RDF"]
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): destination path of the CSV file.
        """
        # newline="" is required when writing CSV data (avoids doubled
        # line endings on Windows); encoding is made explicit so the
        # output does not depend on the platform default.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # Write the header once; subsequent rows are appended headerless.
        header = ["MovieID", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Flush and release the underlying file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of *RDF* to the CSV, without repeating the header.

        Args:
            RDF (pd.DataFrame): ["MovieID","RDF"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import pandas as pd
|
||||
|
||||
# do not worry about circular dependencies, this class will never call something else
|
||||
from Scripts.DataCleaning.filter import PipelineApplier
|
||||
|
||||
class RDF_mask_task_dataset():
    """
    Write the CSV for the third task, which is "Predicting a masked
    component within an RDF triple".

    For each RDF three rows are emitted, each with a different component
    (subject, relationship, object) removed.
    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): destination path of the CSV file.
        """
        # These helpers are only used by this class but belong to a lower
        # layer, so they are bound here instead of re-implemented.
        # (No circular-dependency risk: this class never calls back into
        # the pipeline.)
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" is required when writing CSV data; encoding is made
        # explicit so the output does not depend on the platform default.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # Write the header once; subsequent rows are appended headerless.
        header = ["MovieID", "IncompleteRDF", "Missing", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Flush and release the underlying file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Emit three masked rows (subject, relationship, object — in this
        order, which is part of the output format) for each input RDF.

        Args:
            RDF (pd.DataFrame): expected to contain at least
                ["MovieID","SubjectURI","RelationshipURI","ObjectURI"].
        """
        rdf_complete = self._build_triple(RDF)

        # One frame per masked component, built uniformly instead of the
        # previous copy-pasted trio.
        masked_frames = []
        for column in ("SubjectURI", "RelationshipURI", "ObjectURI"):
            incomplete = self._build_incomplete_triple(RDF.drop(columns=[column]))
            masked_frames.append(pd.DataFrame({
                "MovieID": RDF["MovieID"],
                "IncompleteRDF": incomplete,
                "Missing": RDF[column],
                "RDF": rdf_complete,
            }))

        output_df = pd.concat(masked_frames, ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)
|
||||
|
||||
|
||||
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
@@ -0,0 +1,26 @@
|
||||
import pandas as pd
|
||||
|
||||
class RDF_text_task_dataset():
    """
    Write the CSV for the first two tasks: "Generating structured RDF
    triples from natural language text" and the reverse direction.

    In the CSV all the RDFs of a movie are saved together as one string.
    CSV Composition: ["MovieID","RDFs","Abstract"]
    """

    def __init__(self, output_path: str):
        """
        Args:
            output_path (str): destination path of the CSV file.
        """
        # newline="" is required when writing CSV data (avoids doubled
        # line endings on Windows); encoding is made explicit so the
        # output does not depend on the platform default.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # Write the header once; subsequent rows are appended headerless.
        header = ["MovieID", "RDFs", "Abstract"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Flush and release the underlying file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of *RDF* to the CSV, without repeating the header.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||
Reference in New Issue
Block a user