From 64f9b41378b2b5a6c7c05bb865a902ab2606468b Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 22 Sep 2025 17:17:22 +0200 Subject: [PATCH] Built datawarehouse.py which populate the dataset --- Scripts/DatasetMerging/datawarehouse.py | 373 ++++++++++++++++++++++++ 1 file changed, 373 insertions(+) create mode 100644 Scripts/DatasetMerging/datawarehouse.py diff --git a/Scripts/DatasetMerging/datawarehouse.py b/Scripts/DatasetMerging/datawarehouse.py new file mode 100644 index 0000000..dce7584 --- /dev/null +++ b/Scripts/DatasetMerging/datawarehouse.py @@ -0,0 +1,373 @@ +import sqlite3 +import csv + +##################################################################### +# This file builds DatawareHouse/dataset.db from 1-hop csv files # +# Its Schema in . /SQL_Queries/db_creation.sql # +# The sql query used to popualate id in . /SQL_Queries/query.sql # +##################################################################### + + +# --- Global configuration --- +DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db" +MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv" +PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv" +SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv" +DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv" +REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv" + +MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8") +PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8") +SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8") +DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8") +REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8") + +CONN = sqlite3.connect(DB_NAME) +CURS = CONN.cursor() + +# MARK: SQL Definitions +# Insert MovieURI + +def insertOrigin(curs : sqlite3.Cursor ) -> bool: + + QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');" + try: + curs.execute(QUERY) + return True + except sqlite3.IntegrityError: + return False + +def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None: + + QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;" + + curs.execute(QUERY, [originName]) + originId = curs.fetchone() + if not originId: + return None + + # in this case the real id is the first element of the tuple + return originId[0] + +def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool: + + QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);" + try: + curs.execute(QUERY,[movieUri]) + return True + except sqlite3.IntegrityError: + return False + + +def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None: + + QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;" + + curs.execute(QUERY, [movieUri]) + movieId = curs.fetchone() + if not movieId: + return None + + # in this case the real id is the first element of the tuple + return movieId[0] + + +def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool: + QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);" + try: + curs.execute(QUERY,[movieId, pageId]) + return True + except sqlite3.IntegrityError: + return False + +def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None: + + QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;" + + curs.execute(QUERY, [pageId]) + movieId = curs.fetchone() + if not movieId: + return None + + # in this case the real id is the first element of the tuple + return movieId[0] + +def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool: + QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);" + try: + curs.execute(QUERY,[movieId, abstract]) + return True + except sqlite3.IntegrityError: + return False + +def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool: + QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);" + try: + curs.execute(QUERY,[subjectURI, originID]) + return True + except sqlite3.IntegrityError: + return False + +def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool: + QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);" + try: + curs.execute(QUERY,[relationshipURI]) + return True + except sqlite3.IntegrityError: + return False + +def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool: + QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);" + try: + curs.execute(QUERY,[objectURI, originID]) + return True + except sqlite3.IntegrityError: + return False + +def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None: + + QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;" + + curs.execute(QUERY, [subjectURI]) + subjectId = curs.fetchone() + if not subjectId: + return None + + # in this case the real id is the first element of the tuple + return subjectId[0] + +def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None: + + QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;" + + curs.execute(QUERY, [relationshipURI]) + relationshipId = curs.fetchone() + if not relationshipId: + return None + + # in this case the real id is the first element of the tuple + return relationshipId[0] + +def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None: + + QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;" + + curs.execute(QUERY, [objectURI]) + objectId = curs.fetchone() + if not objectId: + return None + + # in this case the real id is the first element of the tuple + return objectId[0] + +def insertRDF( + curs: sqlite3.Cursor, + movieId: int, + subjectId: int, + relationshipId: int, + objectId: int +) -> bool: + QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);" + try: + curs.execute(QUERY,[movieId,subjectId,relationshipId,objectId]) + return True + except sqlite3.IntegrityError: + return False + +# MARK: Parsing +def parseMovies(): + + CSV_READER = csv.reader(MOVIES_CSV_HANDLER) + next(CSV_READER) + for row in CSV_READER: + MOVIE = row[0] + insertMovie(CURS, MOVIE) + + +def parseWikiPageId(): + CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER) + for row in CSV_READER: + MOVIE_URI = row["subject"] + WIKI_PAGE_ID = int(row["object"]) + MOVIE_ID = selectMovieId(CURS, MOVIE_URI) + + if MOVIE_ID is None: + print(f"The MovieUri: {MOVIE_URI} has not a MovieId ") + continue + + insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID) + + +def parseAbstract(): + CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER) + for row in CSV_READER: + + WIKI_PAGE_ID = int(row["subject"]) + ABSTRACT = row["text"] + MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID) + + + if MOVIE_ID is None: + print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ") + continue + + insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT) + + +def parseRDF_Reverse(): + + REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER) + REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv') + total = 0 + + for row in REVERSE_CSV_READER: + SUBJECT = row["subject"] + RELATIONSHIP = row["relationship"] + OBJECT = row["object"] + print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}") + insertSubject(CURS,SUBJECT,REVERSE_ORIGIN_ID) + insertRelationship(CURS, RELATIONSHIP) + insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID) + + SUBJECT_ID = selectSubjectId(CURS, SUBJECT) + OBJECT_ID = selectObjectId(CURS, OBJECT) + RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP) + MOVIE_ID = selectMovieId(CURS, OBJECT) + + + skip = False + + # guard + if SUBJECT_ID is None: + print(f"No SubjectId for {SUBJECT}") + skip = True + + if OBJECT_ID is None: + print(f"No ObjectId for {OBJECT}") + skip = True + + if RELATIONSHIP_ID is None: + print(f"No RelationshipId for {RELATIONSHIP}") + skip = True + + if MOVIE_ID is None: + print(f"No MovieId for {OBJECT}") + skip = True + + if skip: + continue + + if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): + total += 1 + + print(total) + + + +def parseRDF_Dataset(): + + DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER) + DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv') + + total = 0 + rdf_idx = 0 + for row in DATASET_CSV_READER: + + SUBJECT = row["subject"] + RELATIONSHIP = row["relationship"] + OBJECT = row["object"] + + rdf_idx += 1 + + if rdf_idx % 100000 == 0: + print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}") + + insertSubject(CURS,SUBJECT,DATASET_ORIGIN_ID) + insertRelationship(CURS, RELATIONSHIP) + insertObject(CURS, OBJECT, DATASET_ORIGIN_ID) + + SUBJECT_ID = selectSubjectId(CURS, SUBJECT) + OBJECT_ID = selectObjectId(CURS, OBJECT) + RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP) + MOVIE_ID = selectMovieId(CURS, SUBJECT) + + + skip = False + + # guard + if SUBJECT_ID is None: + print(f"No SubjectId for {SUBJECT}") + skip = True + + if OBJECT_ID is None: + print(f"No ObjectId for {OBJECT}") + skip = True + + if RELATIONSHIP_ID is None: + print(f"No RelationshipId for {RELATIONSHIP}") + skip = True + + if MOVIE_ID is None: + print(f"No MovieId for {SUBJECT}") + skip = True + + if skip: + continue + + if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): + total += 1 + + print(total) + + +# MARK: Actual Code +# parseMovies() +# parseWikiPageId() +# parseAbstract() +# insertOrigin(CURS) +# parseRDF_Reverse() +# parseRDF_Dataset() + + +CONN.commit() +CONN.close() + + + +MOVIES_CSV_HANDLER.close() +PAGEID_CSV_HANDLER.close() +SUMMARY_CSV_HANDLER.close() +DATASET_CSV_HANDLER.close() +REVERSE_CSV_HANDLER.close() + + +""" +The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId +The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId +The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId +The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId +The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId +The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId +The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId +The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId +The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId +The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId +The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId +The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId +""" + +""" +The WikiPageId: 10068850 has not a MovieId +The WikiPageId: 55069615 has not a MovieId +The WikiPageId: 49510056 has not a MovieId +The WikiPageId: 4049786 has not a MovieId +The WikiPageId: 55510238 has not a MovieId +The WikiPageId: 31239628 has not a MovieId +The WikiPageId: 34757217 has not a MovieId +The WikiPageId: 64311757 has not a MovieId +The WikiPageId: 8326198 has not a MovieId +The WikiPageId: 42162164 has not a MovieId +The WikiPageId: 18502369 has not a MovieId +The WikiPageId: 58092358 has not a MovieId +The WikiPageId: 40710250 has not a MovieId +""" \ No newline at end of file