From a93e61b8c18bbdd2761dc85e32cd6c3f7eb0113c Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Tue, 7 Oct 2025 00:54:00 +0200 Subject: [PATCH] Update ETL --- Scripts/DataCleaning/pipeline/movie_filter.py | 23 +++++++++++--- Scripts/DataCleaning/pipeline/pipeline.py | 31 +++++++++++++++---- Scripts/DataCleaning/pipeline/rdf_filter.py | 4 +++ 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/Scripts/DataCleaning/pipeline/movie_filter.py b/Scripts/DataCleaning/pipeline/movie_filter.py index 14e679c..6fc3ecc 100644 --- a/Scripts/DataCleaning/pipeline/movie_filter.py +++ b/Scripts/DataCleaning/pipeline/movie_filter.py @@ -67,8 +67,8 @@ class MovieFilter: movie_ids = self.MOVIE_FILTER["MovieID"].to_list() movie_list_placeholder = ",".join(["?"] * len(movie_ids)) - relationship = ["dbp-dbp:language"] - objects_list = ["English", "dbp-dbr:English_language"] + relationship = ["dbp-dbp:language"] + objects_list = ["English", "dbp-dbr:English_language"] filter_query = f""" SELECT DISTINCT RDFs.MovieID @@ -76,11 +76,26 @@ class MovieFilter: INNER JOIN ParsedRelationships USING (RelationshipID) INNER JOIN ParsedObjects USING (ObjectID) WHERE RDFs.MovieID IN ({movie_list_placeholder}) - AND ParsedRelationships.RelationshipURI IN {tuple(relationship)} + AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}') AND ParsedObjects.ObjectURI in {tuple(objects_list)}; """ + + other_query = f""" + SELECT RDFs.MovieID + FROM RDFs + INNER JOIN ParsedRelationships USING (RelationshipID) + INNER JOIN ParsedObjects USING (ObjectID) + WHERE RDFs.MovieID IN ({movie_list_placeholder}) + AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}') + GROUP BY RDFs.MovieID + HAVING + SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1 + AND + SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0; + """ + params = tuple(movie_ids) - self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params) + self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params) diff --git a/Scripts/DataCleaning/pipeline/pipeline.py b/Scripts/DataCleaning/pipeline/pipeline.py index 44c9a94..d523897 100644 --- a/Scripts/DataCleaning/pipeline/pipeline.py +++ b/Scripts/DataCleaning/pipeline/pipeline.py @@ -16,11 +16,29 @@ RELATIONSHIP_FILTER_LIST = [ "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment", "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type", "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt", - "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format", "dbp-dbp:n", + "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format", "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage", - "dbp-dbp:wordnet_type", "dbp-dbp:length" + "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note", + "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle", + "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text", + "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink", + "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point", + "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt", + "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize", + "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry", + "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa" + "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork", + "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch", + "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list", + "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j", + "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y", + "dbp-dbp:website" ] +""" +SELECT DISTINCT field3 +FROM debug +""" class Pipeline(): @@ -35,13 +53,14 @@ class Pipeline(): self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") self._movie_filter.frequency_filter(50,3000) - self._relationship_filter.frequency_filter(50, 2395627) # from 2718 to 3069 + self._relationship_filter.frequency_filter(20, 2395627) # from 2718 to 3069 self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST) def other_filter(self): self._movie_filter.relation_filter("purl:dc/terms/subject",5,100) self._movie_filter.filter_by_director() - # self._movie_filter.relation_filter("dbp-dbo:director",1,100) + self._movie_filter.filter_by_english_movies() + self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget def _get_cleaned_movie_rows(self): movie_ids = self._movie_filter.get_movie_id() @@ -126,5 +145,5 @@ class Pipeline(): pipe = Pipeline() # pipe.use_toy_dataset() pipe.other_filter() -# pipe.execute_all_task() -pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv") \ No newline at end of file +pipe.execute_all_task() +# pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv") \ No newline at end of file diff --git a/Scripts/DataCleaning/pipeline/rdf_filter.py b/Scripts/DataCleaning/pipeline/rdf_filter.py index 2c0ffd6..50be597 100644 --- a/Scripts/DataCleaning/pipeline/rdf_filter.py +++ b/Scripts/DataCleaning/pipeline/rdf_filter.py @@ -7,6 +7,10 @@ class RdfFilter: self.sql_endpoint = SqlEndpoint() + # def delete_hyperum_when_movie(self): + # purl:linguistics/gold/hypernym + # is almost ever as "dbp-dbr:Movie" or "dbp-dbr:Film" + # banned triple def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame): relationship_placeholder = ",".join(["?"] * len(REL_ID))