Movie filters updated

This commit is contained in:
GassiGiuseppe 2025-10-06 10:57:50 +02:00
parent 7307916891
commit 0373460105
2 changed files with 43 additions and 3 deletions

View File

@ -45,5 +45,44 @@ class MovieFilter:
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
def filter_by_director(self):
    """Keep only the movies that have at least one director relationship.

    Reads the current ``MovieID`` column of ``self.MOVIE_FILTER``, selects
    the distinct IDs among them that appear in ``RDFs`` joined to a
    ``ParsedRelationships`` row whose ``RelationshipURI`` is one of the
    director predicates, and replaces ``self.MOVIE_FILTER`` with the
    DataFrame returned by ``self.sql_endpoint.get_dataframe_from_query``.

    Every value (movie IDs and predicate URIs) is bound through ``?``
    placeholders, so the SQL text never depends on how Python renders a
    tuple or on quoting inside the URIs.
    """
    # NOTE(review): these URIs carry no "<PRED>" prefix, whereas
    # filter_by_english_movies uses "<PRED>"-prefixed ones — confirm which
    # form ParsedRelationships.RelationshipURI actually stores.
    director_uris = ["dbp-dbo:director", "dbp-dbp:director"]
    movie_ids = self.MOVIE_FILTER["MovieID"].to_list()

    movie_placeholders = ",".join(["?"] * len(movie_ids))
    director_placeholders = ",".join(["?"] * len(director_uris))

    filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        JOIN ParsedRelationships USING (RelationshipID)
        WHERE RDFs.MovieID IN ({movie_placeholders})
        AND ParsedRelationships.RelationshipURI IN ({director_placeholders});
    """
    # Parameter order must follow the order of the placeholders in the query.
    params = tuple(movie_ids) + tuple(director_uris)
    self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
def filter_by_english_movies(self):
    """Keep only the movies whose language relationship points to English.

    Reads the current ``MovieID`` column of ``self.MOVIE_FILTER``, selects
    the distinct IDs among them that have an ``RDFs`` row joined to a
    language predicate in ``ParsedRelationships`` and to one of the English
    objects in ``ParsedObjects``, and replaces ``self.MOVIE_FILTER`` with
    the DataFrame returned by
    ``self.sql_endpoint.get_dataframe_from_query``.

    All values are bound through ``?`` placeholders. Formatting a Python
    tuple into the SQL text breaks for a one-element list, because
    ``('x',)`` carries a trailing comma that is not valid SQL.
    """
    movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
    language_predicates = ["<PRED>dbp-dbp:language"]
    english_objects = ["<OBJ>English", "<OBJ>dbp-dbr:English_language"]

    movie_placeholders = ",".join(["?"] * len(movie_ids))
    predicate_placeholders = ",".join(["?"] * len(language_predicates))
    object_placeholders = ",".join(["?"] * len(english_objects))

    filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_placeholders})
        AND ParsedRelationships.RelationshipURI IN ({predicate_placeholders})
        AND ParsedObjects.ObjectURI IN ({object_placeholders});
    """
    # Parameter order must follow the order of the placeholders in the query.
    params = tuple(movie_ids) + tuple(language_predicates) + tuple(english_objects)
    self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
# movie_filter = MovieFilter()
# movie_filter.frequency_filter(5,10)

View File

@ -40,7 +40,8 @@ class Pipeline():
def other_filter(self):
    """Apply the subject and director filters through ``self._movie_filter``."""
    # Presumably the arguments are (relationship URI, min count, max count);
    # verify against relation_filter's signature.
    self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
    # NOTE(review): the call below is repeated verbatim as a commented-out
    # line at the end of this method, and filter_by_director() also filters
    # on director predicates — confirm whether both calls are meant to run.
    self._movie_filter.relation_filter("dbp-dbo:director",1,100)
    self._movie_filter.filter_by_director()
    # self._movie_filter.relation_filter("dbp-dbo:director",1,100)
def _get_cleaned_movie_rows(self):
movie_ids = self._movie_filter.get_movie_id()
@ -110,7 +111,7 @@ class Pipeline():
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
movie_list = [106465,106466,106467,106468,106469,106470,106471,106472,106473]#[117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
def generate_csv_debug_file(self, debug_path:str):
@ -123,7 +124,7 @@ class Pipeline():
# Ad-hoc driver: build a Pipeline, run the filters, and write a debug CSV.
pipe = Pipeline()
# NOTE(review): the same call appears commented out on the next line —
# confirm whether the toy dataset is still meant to be loaded here.
pipe.use_toy_dataset()
# pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
# The argument is the debug_path passed to generate_csv_debug_file.
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")