import re

import pandas as pd

from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
from Scripts.DataCleaning.filter import PipelineApplier
# task dataset builders
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset


class Pipeline:
    """Read movie rows from the SQL endpoint, filter them and feed the task writers.

    Each ``execute_*`` method iterates over ``_get_cleaned_movie_rows()``,
    writes every chunk to one or more task datasets and finally closes all
    four writers through ``_end_file_handler()``.

    NOTE(review): every ``execute_*`` method closes *all* four writers, even
    the ones it did not write to. Whether a writer can be used again after
    ``close()`` depends on the writer classes -- confirm before chaining
    several ``execute_*`` calls on the same instance.
    """

    def __init__(
        self,
        mask_task_dataset_path: str = "./Assets/Dataset/Tmp/rdf_mask.csv",
        bpe_corpus_path: str = "./Assets/Dataset/Tmp/corpus.txt",
        text_to_rdf_task_dataset_path: str = "./Assets/Dataset/Tmp/rdf_text.csv",
        completation_rdf_task_dataset_path: str = "./Assets/Dataset/Tmp/rdf_completation.csv",
    ):
        """Create the SQL endpoint, the task writers and the configured filters.

        Args:
            mask_task_dataset_path: Path handed to ``RDF_mask_task_dataset``.
            bpe_corpus_path: Path handed to ``BPE_corpus``.
            text_to_rdf_task_dataset_path: Path handed to
                ``RDF_text_task_dataset``.
            completation_rdf_task_dataset_path: Path handed to
                ``RDF_completation_task_dataset``.
        """
        self.sql_endpoint = SqlEndpoint()

        # classes that manage the tasks' datasets
        self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
        self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
        self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
        self.task_rdf_completation = RDF_completation_task_dataset(
            completation_rdf_task_dataset_path
        )

        # prepare the filter
        # the filter applier needs to know the frequency of movies and
        # relationships across the whole dataset
        self.filter_applier = PipelineApplier()
        movie_count = self.sql_endpoint.get_movies_id_count()
        relationship_count = self.sql_endpoint.get_relationship_count()
        # NOTE(review): the two numeric arguments are presumably the lower and
        # upper frequency bounds -- confirm against PipelineApplier.
        self.filter_applier.generate_frequency_movie_filter(movie_count, 50, 3000)
        self.filter_applier.generate_frequency_relationship_filter(
            relationship_count, 50, 2395627
        )

        # prepare the filter on the relationship URIs you want to delete
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate",
            "w3:2000/01/rdf-schema#label",
            "dbp-dbo:abstract",
            "dbp-dbo:wikiPageID",
            "dbp-dbo:wikiPageRevisionID",
            "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs",
            "dbp-dbp:image",
            "dbp-dbo:wikiPageLength",
            "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail",
            "foaf:depiction",
            "w3:1999/02/22-rdf-syntax-ns#type",
        ]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

    def execute_task_bpe_corpus(self) -> None:
        """Write the ``Triple``/``Abstract`` columns of every rebuilt chunk to the BPE corpus."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self.filter_applier.rebuild_by_movie(rdf)
                self.task_bpe_corpus.write_from_df(rdf[["Triple", "Abstract"]])
        finally:
            # close the writers even when a chunk fails, so no handler leaks
            self._end_file_handler()

    def execute_task_rdf_mask(self) -> None:
        """Write every cleaned chunk, unchanged, to the RDF mask task dataset."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(rdf)
        finally:
            # close the writers even when a chunk fails, so no handler leaks
            self._end_file_handler()

    def execute_tasks_rdf_text(self) -> None:
        """Rebuild every cleaned chunk by movie and write it to the RDF text task dataset."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self.filter_applier.rebuild_by_movie(rdf)
                self.task_rdf_text.write(rdf)
        finally:
            # close the writers even when a chunk fails, so no handler leaks
            self._end_file_handler()

    def execute_task_rdf_completation(self) -> None:
        """Add a ``Triple`` column to every chunk and write ``MovieID``/``Triple``."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf["Triple"] = self.filter_applier.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])
        finally:
            # close the writers even when a chunk fails, so no handler leaks
            self._end_file_handler()

    def execute_all_task(self) -> None:
        """Feed all four task writers from a single pass over the cleaned chunks."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                # the mask task receives the chunk before the Triple column is added
                self.task_rdf_mask.write(rdf)

                rdf["Triple"] = self.filter_applier.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])

                grouped = self.filter_applier.group_by_movie_from_triple(
                    rdf[["MovieID", "Triple", "Abstract"]]
                )
                self.task_rdf_text.write(grouped)
                self.task_bpe_corpus.write_from_df(grouped[["Triple", "Abstract"]])
        finally:
            # close the writers even when a chunk fails, so no handler leaks
            self._end_file_handler()

    def _end_file_handler(self) -> None:
        """Close the four task writers."""
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def _get_cleaned_movie_rows(self):
        """Yield the filtered, non-empty chunks produced by the SQL endpoint.

        Each chunk returned by ``get_abbreviated_dataset_by_movie_id()`` goes
        through NA dropping and the movie/relationship frequency filters;
        chunks that end up empty are skipped, the others get their special
        tokens added and are yielded.

        NOTE(review): chunks are presumably pandas DataFrames (the code relies
        on ``.empty`` and on column selection) -- confirm against SqlEndpoint.
        """
        for rdf in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            rdf = self.filter_applier.drop_na_from_dataset(rdf)
            rdf = self.filter_applier.filter_by_frequency_movie_id(rdf)
            rdf = self.filter_applier.filter_by_frequency_relationship(rdf)
            # other filter
            # NOTE(review): the banned-list filter is built in __init__ but its
            # application below appears to be disabled -- confirm this is intended.
            # rdf = self.filter_applier.delete_relationship_by_list_filter(rdf)
            if rdf.empty:
                continue
            # WARNING: THIS MUST BE DONE AFTER THE FILTER BY FREQUENCY
            rdf = self.filter_applier.rdf_add_special_token(rdf)
            yield rdf

    def use_toy_dataset(self) -> None:
        """Replace ``sql_endpoint.movie_ids`` with a fixed list of ten movie ids."""
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        movie_list = [
            117248, 147074, 113621, 1123, 117586,
            90177, 71587, 138952, 144137, 148025,
        ]
        self.sql_endpoint.movie_ids = movie_list

    def reduce_movie_list(self, starting_offset: int, ending_offset: int) -> None:
        """Forward both offsets to ``filter_applier.reduce_movie_list``."""
        self.filter_applier.reduce_movie_list(starting_offset, ending_offset)


# there are a lot of settings to manage
# you only need to change settings:
# in the init for file paths, frequency filter limits, banned relationship URIs
# in use_toy_dataset, to change the toy dataset
# in _get_cleaned_movie_rows, to change how the pipeline behaves

# pipeline = Pipeline()

# pipeline.use_toy_dataset()
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
# pipeline.execute_all_task()