Update ETL
This commit is contained in:
parent
0373460105
commit
a93e61b8c1
@ -67,8 +67,8 @@ class MovieFilter:
|
||||
movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
|
||||
movie_list_placeholder = ",".join(["?"] * len(movie_ids))
|
||||
|
||||
relationship = ["<PRED>dbp-dbp:language"]
|
||||
objects_list = ["<OBJ>English", "<OBJ>dbp-dbr:English_language"]
|
||||
relationship = ["dbp-dbp:language"]
|
||||
objects_list = ["English", "dbp-dbr:English_language"]
|
||||
|
||||
filter_query = f"""
|
||||
SELECT DISTINCT RDFs.MovieID
|
||||
@ -76,11 +76,26 @@ class MovieFilter:
|
||||
INNER JOIN ParsedRelationships USING (RelationshipID)
|
||||
INNER JOIN ParsedObjects USING (ObjectID)
|
||||
WHERE RDFs.MovieID IN ({movie_list_placeholder})
|
||||
AND ParsedRelationships.RelationshipURI IN {tuple(relationship)}
|
||||
AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
|
||||
AND ParsedObjects.ObjectURI in {tuple(objects_list)};
|
||||
"""
|
||||
|
||||
other_query = f"""
|
||||
SELECT RDFs.MovieID
|
||||
FROM RDFs
|
||||
INNER JOIN ParsedRelationships USING (RelationshipID)
|
||||
INNER JOIN ParsedObjects USING (ObjectID)
|
||||
WHERE RDFs.MovieID IN ({movie_list_placeholder})
|
||||
AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
|
||||
GROUP BY RDFs.MovieID
|
||||
HAVING
|
||||
SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
|
||||
AND
|
||||
SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
|
||||
"""
|
||||
|
||||
params = tuple(movie_ids)
|
||||
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
|
||||
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)
|
||||
|
||||
|
||||
|
||||
|
||||
@ -16,11 +16,29 @@ RELATIONSHIP_FILTER_LIST = [
|
||||
"w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
|
||||
"dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
|
||||
"dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
|
||||
"dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format", "dbp-dbp:n",
|
||||
"dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
|
||||
"dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
|
||||
"dbp-dbp:wordnet_type", "dbp-dbp:length"
|
||||
"dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
|
||||
"dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
|
||||
"dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
|
||||
"dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
|
||||
"w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
|
||||
"dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
|
||||
"dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
|
||||
"dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
|
||||
"dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
|
||||
"dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
|
||||
"dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
|
||||
"dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
|
||||
"dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
|
||||
"dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
|
||||
"dbp-dbp:website"
|
||||
]
|
||||
|
||||
"""
|
||||
SELECT DISTINCT field3
|
||||
FROM debug
|
||||
"""
|
||||
|
||||
class Pipeline():
|
||||
|
||||
@ -35,13 +53,14 @@ class Pipeline():
|
||||
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
|
||||
|
||||
self._movie_filter.frequency_filter(50,3000)
|
||||
self._relationship_filter.frequency_filter(50, 2395627) # from 2718 to 3069
|
||||
self._relationship_filter.frequency_filter(20, 2395627) # from 2718 to 3069
|
||||
self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
|
||||
|
||||
def other_filter(self):
|
||||
self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
|
||||
self._movie_filter.filter_by_director()
|
||||
# self._movie_filter.relation_filter("dbp-dbo:director",1,100)
|
||||
self._movie_filter.filter_by_english_movies()
|
||||
self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important films have a budget relationship
|
||||
|
||||
def _get_cleaned_movie_rows(self):
|
||||
movie_ids = self._movie_filter.get_movie_id()
|
||||
@ -126,5 +145,5 @@ class Pipeline():
|
||||
pipe = Pipeline()
|
||||
# pipe.use_toy_dataset()
|
||||
pipe.other_filter()
|
||||
# pipe.execute_all_task()
|
||||
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||
pipe.execute_all_task()
|
||||
# pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||
@ -7,6 +7,10 @@ class RdfFilter:
|
||||
self.sql_endpoint = SqlEndpoint()
|
||||
|
||||
|
||||
# def delete_hypernym_when_movie(self):
|
||||
# purl:linguistics/gold/hypernym
|
||||
# is almost always "dbp-dbr:Movie" or "dbp-dbr:Film"
|
||||
# banned triple
|
||||
|
||||
def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
|
||||
relationship_placeholder = ",".join(["?"] * len(REL_ID))
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user