From 0373460105a6ec18047c4cb2a326549c14d1bf8b Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Mon, 6 Oct 2025 10:57:50 +0200
Subject: [PATCH] Movie filters updated

---
Review note: this patch was edited by hand during review. The URI lists in
the two new filters are now bound as query parameters. The blob ids on the
"index" lines were not regenerated, and the whitespace of context lines was
reconstructed.

 Scripts/DataCleaning/pipeline/movie_filter.py | 58 +++++++++++++++++++
 Scripts/DataCleaning/pipeline/pipeline.py     |  7 ++-
 2 files changed, 62 insertions(+), 3 deletions(-)

diff --git a/Scripts/DataCleaning/pipeline/movie_filter.py b/Scripts/DataCleaning/pipeline/movie_filter.py
index a65b360..14e679c 100644
--- a/Scripts/DataCleaning/pipeline/movie_filter.py
+++ b/Scripts/DataCleaning/pipeline/movie_filter.py
@@ -45,5 +45,63 @@ class MovieFilter:
         self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
 
 
+    def filter_by_director(self):
+        """Keep only movies that have at least one director relationship.
+
+        Reads the ``MovieID`` column of ``self.MOVIE_FILTER`` and replaces
+        ``self.MOVIE_FILTER`` with the result of the query.
+        """
+        director_list = ['dbp-dbo:director', 'dbp-dbp:director']
+
+        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
+        movie_list_placeholder = ",".join(["?"] * len(movie_ids))
+        # Bind the URIs as parameters instead of formatting tuple(...) into
+        # the SQL text, which relies on Python's tuple repr.
+        director_placeholder = ",".join(["?"] * len(director_list))
+
+        filter_query = f"""
+        SELECT DISTINCT RDFs.MovieID
+        FROM RDFs
+        JOIN ParsedRelationships USING (RelationshipID)
+        WHERE RDFs.MovieID IN ({movie_list_placeholder})
+        AND ParsedRelationships.RelationshipURI IN ({director_placeholder});
+        """
+
+        # Parameter order must follow the placeholder order in the query.
+        params = (*movie_ids, *director_list)
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+
+    def filter_by_english_movies(self):
+        """Keep only movies whose language relationship points to English.
+
+        Reads the ``MovieID`` column of ``self.MOVIE_FILTER`` and replaces
+        ``self.MOVIE_FILTER`` with the result of the query.
+        """
+        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
+        movie_list_placeholder = ",".join(["?"] * len(movie_ids))
+
+        relationship = ["dbp-dbp:language"]
+        objects_list = ["English", "dbp-dbr:English_language"]
+        # tuple(relationship) renders as ('dbp-dbp:language',) and the trailing
+        # comma is not valid SQL, so bind these values as parameters instead.
+        relationship_placeholder = ",".join(["?"] * len(relationship))
+        objects_placeholder = ",".join(["?"] * len(objects_list))
+
+        filter_query = f"""
+        SELECT DISTINCT RDFs.MovieID
+        FROM RDFs
+        INNER JOIN ParsedRelationships USING (RelationshipID)
+        INNER JOIN ParsedObjects USING (ObjectID)
+        WHERE RDFs.MovieID IN ({movie_list_placeholder})
+        AND ParsedRelationships.RelationshipURI IN ({relationship_placeholder})
+        AND ParsedObjects.ObjectURI IN ({objects_placeholder});
+        """
+        # Parameter order must follow the placeholder order in the query.
+        params = (*movie_ids, *relationship, *objects_list)
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+
+
 # movie_filter = MovieFilter()
 # movie_filter.frequency_filter(5,10)
\ No newline at end of file
diff --git a/Scripts/DataCleaning/pipeline/pipeline.py b/Scripts/DataCleaning/pipeline/pipeline.py
index 42c3aad..44c9a94 100644
--- a/Scripts/DataCleaning/pipeline/pipeline.py
+++ b/Scripts/DataCleaning/pipeline/pipeline.py
@@ -40,7 +40,8 @@ class Pipeline():
 
     def other_filter(self):
         self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
-        self._movie_filter.relation_filter("dbp-dbo:director",1,100)
+        self._movie_filter.filter_by_director()
+        # self._movie_filter.relation_filter("dbp-dbo:director",1,100)
 
     def _get_cleaned_movie_rows(self):
         movie_ids = self._movie_filter.get_movie_id()
@@ -110,7 +111,7 @@ class Pipeline():
         # Django Unchained : 138952
         # Spirited Away : 144137
         # Knives Out : 148025
-        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        movie_list = [106465,106466,106467,106468,106469,106470,106471,106472,106473]#[117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
         self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
 
     def generate_csv_debug_file(self, debug_path:str):
@@ -123,7 +124,7 @@ class Pipeline():
 
 
 pipe = Pipeline()
-pipe.use_toy_dataset()
+# pipe.use_toy_dataset()
 pipe.other_filter()
 # pipe.execute_all_task()
 pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
\ No newline at end of file