Added possibility to whitelist relationships

This commit is contained in:
GassiGiuseppe 2025-10-12 12:26:26 +02:00
parent e9d30b3cea
commit 856c693650
2 changed files with 19 additions and 2 deletions

View File

@ -35,6 +35,9 @@ RELATIONSHIP_FILTER_LIST = [
"dbp-dbp:website"
]
RELATIONSHIP_WHITE_LIST = [
"dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
]
"""
SELECT DISTINCT field3
FROM debug
@ -66,6 +69,7 @@ class Pipeline():
def _get_cleaned_movie_rows(self):
movie_ids = self._movie_filter.get_movie_id()
rel_ids = self._relationship_filter.get_relationship_id()
# rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)
for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
RDF = self._pipeline.drop_na_from_dataset(RDF)
@ -147,5 +151,5 @@ class Pipeline():
pipe = Pipeline()
#pipe.use_toy_dataset()
pipe.other_filter()
pipe.execute_all_task()
# pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")

View File

@ -26,6 +26,19 @@ class RelationshipFilter:
def get_relationship_id(self):
return self.RELATIONSHIP_FILTER
def get_relationship_id_from_white_list(self, relationship_list: list[str]):
ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
uri_placeholder = ",".join(["?"] * len(relationship_list))
filter_query = f"""
SELECT RelationshipID
FROM ParsedRelationships
WHERE RelationshipID IN ({ids_placeholder})
AND RelationshipURI IN ({uri_placeholder});
"""
params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list)
return self.sql_endpoint.get_dataframe_from_query(filter_query, params)
def delete_relationship_uri_by_list(self, filter_list: list[str]):
ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))