151 lines
7.0 KiB
Python
151 lines
7.0 KiB
Python
from movie_filter import MovieFilter
|
|
from relationship_filter import RelationshipFilter
|
|
from rdf_filter import RdfFilter
|
|
from cleaner import PipelineApplier
|
|
|
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
|
|
|
import pandas as pd
|
|
|
|
# Relationship URIs handed to RelationshipFilter.delete_relationship_uri_by_list
# when a Pipeline is constructed.
#
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python into a single (wrong) URI, which is what used
# to happen to "dbp-dbp:infoa" / "dbp-dbp:infob". Entries are kept unique.
RELATIONSHIP_FILTER_LIST = [
    "dbp-dbp:wikiPageUsesTemplate", "w3:2000/01/rdf-schema#label", "dbp-dbo:abstract",
    "dbp-dbo:wikiPageID", "dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
    "w3:2002/07/owl#sameAs", "dbp-dbp:image", "dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
    "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
    "dbp-dbp:id", "dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
    "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
    "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
    "dbp-dbp:wordnet_type", "dbp-dbp:length", "dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
    "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "dbp-dbp:bgcolor", "dbp-dbp:prevTitle",
    "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
    "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
    "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
    "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
    "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
    "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
    "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
    "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
    "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
    "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
    "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
    "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
    "dbp-dbp:website",
]
|
|
|
|
"""
|
|
SELECT DISTINCT field3
|
|
FROM debug
|
|
"""
|
|
|
|
class Pipeline():
    """Drive the movie RDF cleaning pipeline and feed the task datasets.

    The constructor builds the movie / relationship / RDF filters, the
    ``PipelineApplier`` and the three output writers, then applies the baseline
    frequency filters. The ``execute_*`` methods stream cleaned per-movie
    frames into the writers.

    NOTE(review): every ``execute_*`` method closes all three writers when it
    finishes (also on error), so presumably only one ``execute_*`` call is
    meaningful per instance -- confirm against the writer classes.
    """

    def __init__(self) -> None:
        """Create the filters and writers and apply the baseline filters."""
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()

        # One writer per output task, all under the temporary dataset folder.
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        self._movie_filter.frequency_filter(50, 3000)
        self._relationship_filter.frequency_filter(25, 2395627)  # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)

    def other_filter(self):
        """Apply the additional, optional movie filters on top of the baseline."""
        self._movie_filter.relation_filter("purl:dc/terms/subject", 5, 100)
        self._movie_filter.filter_by_director()
        self._movie_filter.filter_by_english_movies()
        # The most important films have a budget relationship.
        self._movie_filter.relation_filter("dbp-dbp:budget", 1, 100)
        # Used to cut the selection down to about 2000 movies.
        self._movie_filter.relation_filter("dbp-dbp:released", 1, 100)

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned frame per chunk produced by the RDF filter.

        Each frame coming from ``yield_movie_abbreviated_rdfs`` goes through
        ``drop_na_from_dataset``, ``regex_on_objects`` and
        ``rdf_add_special_token`` (in that order); frames that end up empty
        are skipped.
        """
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()

        for rdf in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids, rel_ids):
            rdf = self._pipeline.drop_na_from_dataset(rdf)
            rdf = self._pipeline.regex_on_objects(rdf)
            rdf = self._pipeline.rdf_add_special_token(rdf)

            if rdf.empty:
                continue
            yield rdf

    def execute_task_bpe_corpus(self):
        """Write the ``Triple`` / ``Abstract`` columns of every frame to the BPE corpus.

        All writers are closed afterwards, even if writing fails part-way.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self._pipeline.rebuild_by_movie(rdf)
                rdf = rdf[["Triple", "Abstract"]]
                self.task_bpe_corpus.write_from_df(rdf)
        finally:
            self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write every frame, rebuilt by movie, to the RDF-text dataset.

        All writers are closed afterwards, even if writing fails part-way.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self._pipeline.rebuild_by_movie(rdf)
                self.task_rdf_text.write(rdf)
        finally:
            self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write the ``MovieID`` / ``Triple`` columns to the RDF-completion dataset.

        All writers are closed afterwards, even if writing fails part-way.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf["Triple"] = self._pipeline.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])
        finally:
            self._end_file_handler()

    def _end_file_handler(self):
        """Close the three task writers."""
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def execute_all_task(self):
        """Feed all three task writers in a single pass over the cleaned frames.

        All writers are closed afterwards, even if writing fails part-way.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                # The completion task needs the un-rebuilt rows, so work on a
                # copy before the frame is rebuilt by movie below.
                completation_rdf = rdf.copy()
                completation_rdf["Triple"] = self._pipeline.build_triple(completation_rdf)
                self.task_rdf_completation.write(completation_rdf[["MovieID", "Triple"]])

                rdf = self._pipeline.rebuild_by_movie(rdf)

                self.task_rdf_text.write(rdf)
                self.task_bpe_corpus.write_from_df(rdf[["Triple", "Abstract"]])
        finally:
            self._end_file_handler()

    def use_toy_dataset(self):
        """Replace the movie selection with a fixed list of ten movie IDs."""
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})

    def generate_csv_debug_file(self, debug_path: str):
        """Dump every cleaned frame to a ``Debug_csv`` writer at ``debug_path``.

        The debug writer is closed afterwards, even if writing fails part-way.
        """
        debug_csv = Debug_csv(debug_path)

        try:
            for rdf in self._get_cleaned_movie_rows():
                debug_csv.write(rdf)
        finally:
            debug_csv.close()
|
|
|
|
|
|
# Script entry point. Guarded so that importing this module (for example to
# reuse Pipeline) does not run the whole data-cleaning job and rewrite the
# output files as a side effect.
if __name__ == "__main__":
    pipe = Pipeline()
    # Optional: restrict the run to the small hand-picked movie list.
    # pipe.use_toy_dataset()
    pipe.other_filter()
    pipe.execute_all_task()
    # Optional: dump the cleaned rows for inspection instead.
    # pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")