from movie_filter import MovieFilter
from relationship_filter import RelationshipFilter
from rdf_filter import RdfFilter
from cleaner import PipelineApplier

from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv

import pandas as pd

# Relationship URIs handed to RelationshipFilter.delete_relationship_uri_by_list
# when a Pipeline is constructed.
# NOTE(review): "dbp-dbp:n" appears twice; the duplicate is kept on purpose so
# the list passed to the filter is unchanged -- confirm it is redundant before
# removing it.
RELATIONSHIP_FILTER_LIST = [
    "dbp-dbp:wikiPageUsesTemplate",
    "w3:2000/01/rdf-schema#label",
    "dbp-dbo:abstract",
    "dbp-dbo:wikiPageID",
    "dbp-dbo:wikiPageRevisionID",
    "dbp-dbo:wikiPageDisambiguates",
    "w3:2002/07/owl#sameAs",
    "dbp-dbp:image",
    "dbp-dbo:wikiPageLength",
    "w3:2000/01/rdf-schema#comment",
    "dbp-dbo:thumbnail",
    "foaf:depiction",
    "w3:1999/02/22-rdf-syntax-ns#type",
    "dbp-dbp:id",
    "dbp-dbp:totalWidth",
    "w3:ns/prov#wasDerivedFrom",
    "dbp-dbp:n",
    "dbp-dbp:alt",
    "dbp-dbo:soundRecording",
    "dbp-dbp:align",
    "dbp-dbp:format",
    "dbp-dbp:n",
    "dbp-dbp:filename",
    "dbp-dbp:wikt",
    "foaf:isPrimaryTopicOf",
    "dbp-dbp:quote",
    "foaf:homepage",
    "dbp-dbp:wordnet_type",
    "dbp-dbp:length",
]


class Pipeline():
    """Wire the movie/relationship/RDF filters to the dataset writers.

    Construction creates the three filters, the ``PipelineApplier`` and the
    three task writers (BPE corpus, RDF-to-text, RDF completion), then applies
    the default frequency filters and removes every relationship listed in
    ``RELATIONSHIP_FILTER_LIST``.
    """

    def __init__(self) -> None:
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()

        # Output writers, one per task; each targets a file under Tmp/.
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset(
            "./Assets/Dataset/Tmp/rdf_completation.csv"
        )

        # Default filtering applied to every run.
        self._movie_filter.frequency_filter(50, 3000)
        self._relationship_filter.frequency_filter(50, 2395627)  # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)

    def other_filter(self) -> None:
        """Apply the optional per-relation filters on subject and director."""
        self._movie_filter.relation_filter("purl:dc/terms/subject", 5, 100)
        self._movie_filter.relation_filter("dbp-dbo:director", 1, 100)

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned RDF table per batch produced by the RDF filter.

        Each table goes through ``drop_na_from_dataset``, ``regex_on_objects``
        and ``rdf_add_special_token`` (in that order); tables that end up empty
        are skipped. Presumably each table is a pandas DataFrame for a single
        movie -- the code only relies on ``.empty``, ``.copy()`` and column
        indexing.
        """
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()

        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids, rel_ids):
            RDF = self._pipeline.drop_na_from_dataset(RDF)
            RDF = self._pipeline.regex_on_objects(RDF)
            RDF = self._pipeline.rdf_add_special_token(RDF)

            # The emptiness check stays after the cleaning steps, as before.
            if RDF.empty:
                continue
            yield RDF

    def execute_task_bpe_corpus(self) -> None:
        """Write the ``Triple``/``Abstract`` columns of every table to the BPE corpus."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self._pipeline.rebuild_by_movie(RDF)
                RDF = RDF[["Triple", "Abstract"]]
                self.task_bpe_corpus.write_from_df(RDF)
        finally:
            # Close the writers even when a batch fails, so handles do not leak.
            self._end_file_handler()

    def execute_tasks_rdf_text(self) -> None:
        """Write every table, rebuilt by movie, to the RDF-to-text dataset."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self._pipeline.rebuild_by_movie(RDF)
                self.task_rdf_text.write(RDF)
        finally:
            # Close the writers even when a batch fails, so handles do not leak.
            self._end_file_handler()

    def execute_task_rdf_completation(self) -> None:
        """Write the ``MovieID``/``Triple`` columns to the RDF completion dataset."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF["Triple"] = self._pipeline.build_triple(RDF)
                self.task_rdf_completation.write(RDF[["MovieID", "Triple"]])
        finally:
            # Close the writers even when a batch fails, so handles do not leak.
            self._end_file_handler()

    def _end_file_handler(self) -> None:
        """Close all three task writers.

        Every ``execute_*`` method calls this, so after any one of them returns
        all three writers have been closed.
        """
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def execute_all_task(self) -> None:
        """Feed all three task writers from a single pass over the cleaned tables."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                # The completion task builds its triples on a copy so the
                # table passed to rebuild_by_movie below is left untouched.
                completation_RDF = RDF.copy()
                completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
                self.task_rdf_completation.write(completation_RDF[["MovieID", "Triple"]])

                RDF = self._pipeline.rebuild_by_movie(RDF)

                self.task_rdf_text.write(RDF)
                self.task_bpe_corpus.write_from_df(RDF[["Triple", "Abstract"]])
        finally:
            # Close the writers even when a batch fails, so handles do not leak.
            self._end_file_handler()

    def use_toy_dataset(self) -> None:
        """Replace the movie filter's ``MOVIE_FILTER`` with ten hand-picked movie ids."""
        # Chosen movies:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})

    def generate_csv_debug_file(self, debug_path: str) -> None:
        """Write every cleaned table to a ``Debug_csv`` created at ``debug_path``.

        Args:
            debug_path: Path passed to the ``Debug_csv`` constructor.
        """
        debug_csv = Debug_csv(debug_path)
        try:
            for RDF in self._get_cleaned_movie_rows():
                debug_csv.write(RDF)
        finally:
            # Close the debug writer even if a batch raises.
            debug_csv.close()


# Script entry: runs on import/execution, exactly as in the original file.
pipe = Pipeline()
pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")