diff --git a/Scripts/DataCleaning/pipeline/pipeline.py b/Scripts/DataCleaning/pipeline/pipeline.py index eb8ba8c..349a859 100644 --- a/Scripts/DataCleaning/pipeline/pipeline.py +++ b/Scripts/DataCleaning/pipeline/pipeline.py @@ -35,6 +35,9 @@ RELATIONSHIP_FILTER_LIST = [ "dbp-dbp:website" ] +RELATIONSHIP_WHITE_LIST = [ + "dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject" + ] """ SELECT DISTINCT field3 FROM debug @@ -66,6 +69,7 @@ class Pipeline(): def _get_cleaned_movie_rows(self): movie_ids = self._movie_filter.get_movie_id() rel_ids = self._relationship_filter.get_relationship_id() + # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST) for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids): RDF = self._pipeline.drop_na_from_dataset(RDF) @@ -147,5 +151,5 @@ class Pipeline(): pipe = Pipeline() #pipe.use_toy_dataset() pipe.other_filter() -pipe.execute_all_task() -# pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv") \ No newline at end of file +# pipe.execute_all_task() +pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv") \ No newline at end of file diff --git a/Scripts/DataCleaning/pipeline/relationship_filter.py b/Scripts/DataCleaning/pipeline/relationship_filter.py index 205d792..c5cd09e 100644 --- a/Scripts/DataCleaning/pipeline/relationship_filter.py +++ b/Scripts/DataCleaning/pipeline/relationship_filter.py @@ -26,6 +26,19 @@ class RelationshipFilter: def get_relationship_id(self): return self.RELATIONSHIP_FILTER + def get_relationship_id_from_white_list(self, relationship_list: list[str]): + ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER)) + uri_placeholder = ",".join(["?"] * len(relationship_list)) + filter_query = f""" + SELECT RelationshipID + FROM ParsedRelationships + WHERE RelationshipID IN ({ids_placeholder}) + AND RelationshipURI IN ({uri_placeholder}); + """ + params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list) + return self.sql_endpoint.get_dataframe_from_query(filter_query, params) + + def delete_relationship_uri_by_list(self, filter_list: list[str]): ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))