Update ETL

This commit is contained in:
GassiGiuseppe 2025-10-07 00:54:00 +02:00
parent 0373460105
commit a93e61b8c1
3 changed files with 48 additions and 10 deletions

View File

@ -67,8 +67,8 @@ class MovieFilter:
movie_ids = self.MOVIE_FILTER["MovieID"].to_list() movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
movie_list_placeholder = ",".join(["?"] * len(movie_ids)) movie_list_placeholder = ",".join(["?"] * len(movie_ids))
relationship = ["<PRED>dbp-dbp:language"] relationship = ["dbp-dbp:language"]
objects_list = ["<OBJ>English", "<OBJ>dbp-dbr:English_language"] objects_list = ["English", "dbp-dbr:English_language"]
filter_query = f""" filter_query = f"""
SELECT DISTINCT RDFs.MovieID SELECT DISTINCT RDFs.MovieID
@ -76,11 +76,26 @@ class MovieFilter:
INNER JOIN ParsedRelationships USING (RelationshipID) INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID) INNER JOIN ParsedObjects USING (ObjectID)
WHERE RDFs.MovieID IN ({movie_list_placeholder}) WHERE RDFs.MovieID IN ({movie_list_placeholder})
AND ParsedRelationships.RelationshipURI IN {tuple(relationship)} AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
AND ParsedObjects.ObjectURI in {tuple(objects_list)}; AND ParsedObjects.ObjectURI in {tuple(objects_list)};
""" """
other_query = f"""
SELECT RDFs.MovieID
FROM RDFs
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
WHERE RDFs.MovieID IN ({movie_list_placeholder})
AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
GROUP BY RDFs.MovieID
HAVING
SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
AND
SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
"""
params = tuple(movie_ids) params = tuple(movie_ids)
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params) self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)

View File

@ -16,11 +16,29 @@ RELATIONSHIP_FILTER_LIST = [
"w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment", "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
"dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type", "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
"dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt", "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
"dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format", "dbp-dbp:n", "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
"dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage", "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
"dbp-dbp:wordnet_type", "dbp-dbp:length" "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
"dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
"dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
"dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
"w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
"dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
"dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
"dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
"dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa"
"dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
"dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
"dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
"dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
"dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
"dbp-dbp:website"
] ]
"""
SELECT DISTINCT field3
FROM debug
"""
class Pipeline(): class Pipeline():
@ -35,13 +53,14 @@ class Pipeline():
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
self._movie_filter.frequency_filter(50,3000) self._movie_filter.frequency_filter(50,3000)
self._relationship_filter.frequency_filter(50, 2395627) # from 2718 to 3069 self._relationship_filter.frequency_filter(20, 2395627) # from 2718 to 3069
self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST) self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
def other_filter(self): def other_filter(self):
self._movie_filter.relation_filter("purl:dc/terms/subject",5,100) self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
self._movie_filter.filter_by_director() self._movie_filter.filter_by_director()
# self._movie_filter.relation_filter("dbp-dbo:director",1,100) self._movie_filter.filter_by_english_movies()
self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget
def _get_cleaned_movie_rows(self): def _get_cleaned_movie_rows(self):
movie_ids = self._movie_filter.get_movie_id() movie_ids = self._movie_filter.get_movie_id()
@ -126,5 +145,5 @@ class Pipeline():
pipe = Pipeline() pipe = Pipeline()
# pipe.use_toy_dataset() # pipe.use_toy_dataset()
pipe.other_filter() pipe.other_filter()
# pipe.execute_all_task() pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv") # pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")

View File

@ -7,6 +7,10 @@ class RdfFilter:
self.sql_endpoint = SqlEndpoint() self.sql_endpoint = SqlEndpoint()
# def delete_hyperum_when_movie(self):
# purl:linguistics/gold/hypernym
# is almost ever as "dbp-dbr:Movie" or "dbp-dbr:Film"
# banned triple
def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame): def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
relationship_placeholder = ",".join(["?"] * len(REL_ID)) relationship_placeholder = ",".join(["?"] * len(REL_ID))