import sqlite3 import csv ##################################################################### # This file builds DatawareHouse/dataset.db from 1-hop csv files # # Its Schema in . /SQL_Queries/db_creation.sql # # The sql query used to popualate id in . /SQL_Queries/query.sql # ##################################################################### # sometimes you may need to build a new db file, here a little snippet for you # sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql # --- Global configuration --- DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db" MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv" PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv" SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv" DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv" REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv" MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8") PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8") SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8") DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8") REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8") CONN = sqlite3.connect(DB_NAME) CURS = CONN.cursor() # MARK: SQL Definitions # Insert MovieURI def insertOrigin(curs : sqlite3.Cursor ) -> bool: QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');" try: curs.execute(QUERY) return True except sqlite3.IntegrityError: return False def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None: QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;" curs.execute(QUERY, [originName]) originId = curs.fetchone() if not originId: return None # in this case the real id is the first element of the tuple return originId[0] def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool: QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);" try: curs.execute(QUERY,[movieUri]) return True except sqlite3.IntegrityError: return False def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None: QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;" curs.execute(QUERY, [movieUri]) movieId = curs.fetchone() if not movieId: return None # in this case the real id is the first element of the tuple return movieId[0] def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool: QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);" try: curs.execute(QUERY,[movieId, pageId]) return True except sqlite3.IntegrityError: return False def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None: QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;" curs.execute(QUERY, [pageId]) movieId = curs.fetchone() if not movieId: return None # in this case the real id is the first element of the tuple return movieId[0] def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool: QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);" try: curs.execute(QUERY,[movieId, abstract]) return True except sqlite3.IntegrityError: return False def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool: QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);" try: curs.execute(QUERY,[subjectURI, originID]) return True except sqlite3.IntegrityError: return False def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool: QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);" try: curs.execute(QUERY,[relationshipURI]) return True except sqlite3.IntegrityError: return False def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool: QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);" try: curs.execute(QUERY,[objectURI, originID]) return True except sqlite3.IntegrityError: return False def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None: QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;" curs.execute(QUERY, [subjectURI]) subjectId = curs.fetchone() if not subjectId: return None # in this case the real id is the first element of the tuple return subjectId[0] def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None: QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;" curs.execute(QUERY, [relationshipURI]) relationshipId = curs.fetchone() if not relationshipId: return None # in this case the real id is the first element of the tuple return relationshipId[0] def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None: QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;" curs.execute(QUERY, [objectURI]) objectId = curs.fetchone() if not objectId: return None # in this case the real id is the first element of the tuple return objectId[0] def insertRDF( curs: sqlite3.Cursor, movieId: int, subjectId: int, relationshipId: int, objectId: int ) -> bool: QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);" try: curs.execute(QUERY,[movieId,subjectId,relationshipId,objectId]) return True except sqlite3.IntegrityError: return False # MARK: Parsing def parseMovies(): CSV_READER = csv.reader(MOVIES_CSV_HANDLER) next(CSV_READER) for row in CSV_READER: MOVIE = row[0] insertMovie(CURS, MOVIE) def parseWikiPageId(): CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER) for row in CSV_READER: MOVIE_URI = row["subject"] WIKI_PAGE_ID = int(row["object"]) MOVIE_ID = selectMovieId(CURS, MOVIE_URI) if MOVIE_ID is None: print(f"The MovieUri: {MOVIE_URI} has not a MovieId ") continue insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID) def parseAbstract(): CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER) for row in CSV_READER: WIKI_PAGE_ID = int(row["subject"]) ABSTRACT = row["text"] MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID) if MOVIE_ID is None: print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ") continue insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT) def parseRDF_Reverse(): REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER) REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv') total = 0 for row in REVERSE_CSV_READER: SUBJECT = row["subject"] RELATIONSHIP = row["relationship"] OBJECT = row["object"] print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}") insertSubject(CURS,SUBJECT,REVERSE_ORIGIN_ID) insertRelationship(CURS, RELATIONSHIP) insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID) SUBJECT_ID = selectSubjectId(CURS, SUBJECT) OBJECT_ID = selectObjectId(CURS, OBJECT) RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP) MOVIE_ID = selectMovieId(CURS, OBJECT) skip = False # guard if SUBJECT_ID is None: print(f"No SubjectId for {SUBJECT}") skip = True if OBJECT_ID is None: print(f"No ObjectId for {OBJECT}") skip = True if RELATIONSHIP_ID is None: print(f"No RelationshipId for {RELATIONSHIP}") skip = True if MOVIE_ID is None: print(f"No MovieId for {OBJECT}") skip = True if skip: continue if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): total += 1 print(total) def parseRDF_Dataset(): DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER) DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv') total = 0 rdf_idx = 0 for row in DATASET_CSV_READER: SUBJECT = row["subject"] RELATIONSHIP = row["relationship"] OBJECT = row["object"] rdf_idx += 1 if rdf_idx % 100000 == 0: print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}") insertSubject(CURS,SUBJECT,DATASET_ORIGIN_ID) insertRelationship(CURS, RELATIONSHIP) insertObject(CURS, OBJECT, DATASET_ORIGIN_ID) SUBJECT_ID = selectSubjectId(CURS, SUBJECT) OBJECT_ID = selectObjectId(CURS, OBJECT) RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP) MOVIE_ID = selectMovieId(CURS, SUBJECT) skip = False # guard if SUBJECT_ID is None: print(f"No SubjectId for {SUBJECT}") skip = True if OBJECT_ID is None: print(f"No ObjectId for {OBJECT}") skip = True if RELATIONSHIP_ID is None: print(f"No RelationshipId for {RELATIONSHIP}") skip = True if MOVIE_ID is None: print(f"No MovieId for {SUBJECT}") skip = True if skip: continue if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): total += 1 print(total) # MARK: Actual Code # parseMovies() # parseWikiPageId() # parseAbstract() # insertOrigin(CURS) # parseRDF_Reverse() # parseRDF_Dataset() CONN.commit() CONN.close() MOVIES_CSV_HANDLER.close() PAGEID_CSV_HANDLER.close() SUMMARY_CSV_HANDLER.close() DATASET_CSV_HANDLER.close() REVERSE_CSV_HANDLER.close() """ The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId """ """ The WikiPageId: 10068850 has not a MovieId The WikiPageId: 55069615 has not a MovieId The WikiPageId: 49510056 has not a MovieId The WikiPageId: 4049786 has not a MovieId The WikiPageId: 55510238 has not a MovieId The WikiPageId: 31239628 has not a MovieId The WikiPageId: 34757217 has not a MovieId The WikiPageId: 64311757 has not a MovieId The WikiPageId: 8326198 has not a MovieId The WikiPageId: 42162164 has not a MovieId The WikiPageId: 18502369 has not a MovieId The WikiPageId: 58092358 has not a MovieId The WikiPageId: 40710250 has not a MovieId """