import csv
import sqlite3

#####################################################################
# This file builds DatawareHouse/dataset.db from the 1-hop CSV files
# Its schema is in ./SQL_Queries/db_creation.sql
# The SQL query used to populate the ids is in ./SQL_Queries/query.sql
#####################################################################

# Sometimes you may need to build a new db file; here is a little snippet for you:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"

# The input files and the database connection are opened once, at import
# time, and shared by every function below through these module globals.
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")

CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()


# MARK: SQL Definitions
def _try_insert(curs: sqlite3.Cursor, query: str, params: list) -> bool:
    """Execute an INSERT; return False if SQLite raises IntegrityError.

    Only IntegrityError is swallowed (presumably a duplicate row; the schema
    is not visible here). Any other database error propagates to the caller.
    """
    try:
        curs.execute(query, params)
    except sqlite3.IntegrityError:
        return False
    return True


def _select_id(curs: sqlite3.Cursor, query: str, params: list) -> int | None:
    """Return the first column of the first matching row, or None."""
    curs.execute(query, params)
    row = curs.fetchone()
    if not row:
        return None
    # fetchone() yields a tuple; the id is its first element
    return row[0]


def _report_missing(*checks) -> bool:
    """Print one line per missing id and tell whether any was missing.

    Each check is a ``(label, uri, value)`` triple; a ``value`` of None
    prints ``No <label> for <uri>``.
    """
    missing = False
    for label, uri, value in checks:
        if value is None:
            print(f"No {label} for {uri}")
            missing = True
    return missing


def insertOrigin(curs: sqlite3.Cursor) -> bool:
    """Insert the two origin names 'dataset.csv' and 'reverse.csv'.

    Returns True on success, False if SQLite raised IntegrityError.
    """
    QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    return _try_insert(curs, QUERY, [])


def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
    """Return the OriginID stored for ``originName``, or None if absent."""
    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
    return _select_id(curs, QUERY, [originName])


def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    """Insert a movie URI; False if SQLite raised IntegrityError."""
    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
    return _try_insert(curs, QUERY, [movieUri])


def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
    """Return the MovieID stored for ``movieUri``, or None if absent."""
    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
    return _select_id(curs, QUERY, [movieUri])


def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Link a movie to a Wikipedia page id; False on IntegrityError."""
    QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    return _try_insert(curs, QUERY, [movieId, pageId])


def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
    """Return the MovieID linked to ``pageId``, or None if absent."""
    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
    return _select_id(curs, QUERY, [pageId])


def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store the Wikipedia abstract of a movie; False on IntegrityError."""
    QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    return _try_insert(curs, QUERY, [movieId, abstract])


def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Insert a subject URI tagged with its origin; False on IntegrityError."""
    QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    return _try_insert(curs, QUERY, [subjectURI, originID])


def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Insert a relationship URI; False on IntegrityError."""
    QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    return _try_insert(curs, QUERY, [relationshipURI])


def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Insert an object URI tagged with its origin; False on IntegrityError."""
    QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    return _try_insert(curs, QUERY, [objectURI, originID])


def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
    """Return the SubjectID stored for ``subjectURI``, or None if absent."""
    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
    return _select_id(curs, QUERY, [subjectURI])


def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
    """Return the RelationshipID for ``relationshipURI``, or None if absent."""
    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
    return _select_id(curs, QUERY, [relationshipURI])


def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
    """Return the ObjectID stored for ``objectURI``, or None if absent."""
    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
    return _select_id(curs, QUERY, [objectURI])


def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
) -> bool:
    """Insert one (movie, subject, relationship, object) row.

    Returns True on success, False if SQLite raised IntegrityError.
    """
    QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    return _try_insert(curs, QUERY, [movieId, subjectId, relationshipId, objectId])


# The abbreviation helpers below historically always used the global CURS.
# They now accept an optional trailing cursor; when it is omitted the global
# CURS is used, so existing two-argument calls keep working.
def insert_abbreviation(
    uri, abbreviation, curs: sqlite3.Cursor | None = None
) -> bool:
    """Insert a (URI, abbreviation) pair; False on IntegrityError."""
    QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
    return _try_insert(CURS if curs is None else curs, QUERY, [uri, abbreviation])


def insert_object_abbreviation(
    object_id, abbreviation_id, curs: sqlite3.Cursor | None = None
) -> bool:
    """Link an object to an abbreviation; False on IntegrityError."""
    QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
    return _try_insert(
        CURS if curs is None else curs, QUERY, [object_id, abbreviation_id]
    )


def insert_relationship_abbreviation(
    relationship_id, abbreviation_id, curs: sqlite3.Cursor | None = None
) -> bool:
    """Link a relationship to an abbreviation; False on IntegrityError."""
    QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
    return _try_insert(
        CURS if curs is None else curs, QUERY, [relationship_id, abbreviation_id]
    )


def insert_subject_abbreviation(
    subject_id, abbreviation_id, curs: sqlite3.Cursor | None = None
) -> bool:
    """Link a subject to an abbreviation; False on IntegrityError."""
    QUERY = (
        "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
    )
    return _try_insert(
        CURS if curs is None else curs, QUERY, [subject_id, abbreviation_id]
    )


def select_abbreviation_id(uri, curs: sqlite3.Cursor | None = None) -> int | None:
    """Return the id of the first abbreviation whose URI matches ``uri``.

    ``uri`` is used as a LIKE pattern, so callers may append ``%`` for a
    prefix match.
    """
    # NOTE(review): because this is LIKE, any `_` or `%` already present in
    # the URI text also acts as a wildcard - confirm this is acceptable.
    QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
    return _select_id(CURS if curs is None else curs, QUERY, [uri])


def _link_longest_abbreviation(sections, floor, entity_id, insert_link) -> bool:
    """Link ``entity_id`` to the abbreviation of its longest known URI prefix.

    ``sections`` is the URI split on "/". URIs with 4 sections or fewer are
    ignored. Prefixes are tried from min(len(sections), 7) sections down to
    ``floor + 1`` sections, and the search stops at the first prefix that has
    an abbreviation, whether or not the link row is new.

    Returns True only when ``insert_link(entity_id, abbreviation_id)``
    actually inserted a new row.
    """
    if len(sections) <= 4:
        return False
    for index in range(min(len(sections), 7), floor, -1):
        pattern = "/".join(sections[:index]) + "%"
        abbreviation_id = select_abbreviation_id(pattern)
        if abbreviation_id is not None:
            return insert_link(entity_id, abbreviation_id)
    return False


# MARK: Parsing
def parseMovies():
    """Insert every movie URI (first column) of the movies CSV."""
    MOVIES_CSV_HANDLER.seek(0)  # allow more than one pass over the shared handle
    reader = csv.reader(MOVIES_CSV_HANDLER)
    next(reader, None)  # skip the header row; tolerate an empty file
    for row in reader:
        if not row:  # csv.reader yields [] for blank lines
            continue
        insertMovie(CURS, row[0])


def parseWikiPageId():
    """Link each movie URI ("subject") to its wiki page id ("object")."""
    PAGEID_CSV_HANDLER.seek(0)
    for row in csv.DictReader(PAGEID_CSV_HANDLER):
        movie_uri = row["subject"]
        wiki_page_id = int(row["object"])
        movie_id = selectMovieId(CURS, movie_uri)

        if movie_id is None:
            print(f"The MovieUri: {movie_uri} has not a MovieId ")
            continue

        insertWikiPageId(CURS, movie_id, wiki_page_id)


def parseAbstract():
    """Store each abstract ("text") for the movie owning the page id ("subject")."""
    SUMMARY_CSV_HANDLER.seek(0)
    for row in csv.DictReader(SUMMARY_CSV_HANDLER):
        wiki_page_id = int(row["subject"])
        abstract = row["text"]
        movie_id = selectMovieIdFromWikiPageId(CURS, wiki_page_id)

        if movie_id is None:
            print(f"The WikiPageId: {wiki_page_id} has not a MovieId ")
            continue

        insertWikiAbstract(CURS, movie_id, abstract)


def parseAbbreviations():
    """Insert every ("uri", "abbreviation") pair of the abbreviations CSV."""
    URI_ABBR_CSV_HANDLER.seek(0)
    for row in csv.DictReader(URI_ABBR_CSV_HANDLER):
        insert_abbreviation(row["uri"], row["abbreviation"])


def parseRDF_Reverse():
    """Load the triples of reverse.csv; the movie is the triple's object.

    Does nothing when the 'reverse.csv' origin is not registered. Prints
    every triple as it is read and, at the end, the number of RDF rows
    actually inserted.
    """
    origin_id = selectOrigin(CURS, "reverse.csv")
    if origin_id is None:
        return

    # The same handle is also read by parseAbbr_Reverse; without rewinding,
    # whichever pass runs second would see an exhausted file.
    REVERSE_CSV_HANDLER.seek(0)
    total = 0

    for row in csv.DictReader(REVERSE_CSV_HANDLER):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]
        print(f"RDF triplets:\n\t{subject} - {relationship} - {obj}")

        insertSubject(CURS, subject, origin_id)
        insertRelationship(CURS, relationship)
        insertObject(CURS, obj, origin_id)

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)
        movie_id = selectMovieId(CURS, obj)

        # guard: report every missing id, then skip the row
        if _report_missing(
            ("SubjectId", subject, subject_id),
            ("ObjectId", obj, object_id),
            ("RelationshipId", relationship, relationship_id),
            ("MovieId", obj, movie_id),
        ):
            continue

        if insertRDF(CURS, movie_id, subject_id, relationship_id, object_id):  # type: ignore
            total += 1

    print(total)


def parseRDF_Dataset():
    """Load the triples of dataset.csv; the movie is the triple's subject.

    Does nothing when the 'dataset.csv' origin is not registered. Prints a
    progress line every 100000 rows and, at the end, the number of RDF rows
    actually inserted.
    """
    origin_id = selectOrigin(CURS, "dataset.csv")
    if origin_id is None:
        return

    # The same handle is also read by parseAbbr_Dataset; rewind before reading.
    DATASET_CSV_HANDLER.seek(0)
    total = 0

    for rdf_idx, row in enumerate(csv.DictReader(DATASET_CSV_HANDLER), start=1):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{subject} - {relationship} - {obj}")

        insertSubject(CURS, subject, origin_id)
        insertRelationship(CURS, relationship)
        insertObject(CURS, obj, origin_id)

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)
        movie_id = selectMovieId(CURS, subject)

        # guard: report every missing id, then skip the row
        if _report_missing(
            ("SubjectId", subject, subject_id),
            ("ObjectId", obj, object_id),
            ("RelationshipId", relationship, relationship_id),
            ("MovieId", subject, movie_id),
        ):
            continue

        if insertRDF(CURS, movie_id, subject_id, relationship_id, object_id):  # type: ignore
            total += 1

    print(total)


def parseAbbr_Reverse():
    """Link subjects, relationships and objects of reverse.csv to abbreviations.

    Does nothing when the 'reverse.csv' origin is not registered. Rows whose
    subject, object or relationship id cannot be found are reported and
    skipped. Prints the number of link rows actually inserted.
    """
    if selectOrigin(CURS, "reverse.csv") is None:
        return

    # The same handle is also read by parseRDF_Reverse; rewind before reading.
    REVERSE_CSV_HANDLER.seek(0)
    total = 0

    for row in csv.DictReader(REVERSE_CSV_HANDLER):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id, then skip the row
        if _report_missing(
            ("SubjectId", subject, subject_id),
            ("ObjectId", obj, object_id),
            ("RelationshipId", relationship, relationship_id),
        ):
            continue

        # (URI sections, shortest prefix length - 1, entity id, link inserter)
        # Relationships are allowed one section shorter than the others.
        targets = (
            (subject.split("/"), 3, subject_id, insert_subject_abbreviation),
            (relationship.split("/"), 2, relationship_id, insert_relationship_abbreviation),
            (obj.split("/"), 3, object_id, insert_object_abbreviation),
        )
        for sections, floor, entity_id, insert_link in targets:
            if _link_longest_abbreviation(sections, floor, entity_id, insert_link):
                total += 1

    print(total)
def parseAbbr_Dataset():
    """Link subjects, relationships and objects of dataset.csv to abbreviations.

    Does nothing when the 'dataset.csv' origin is not registered. Rows whose
    subject, object or relationship id cannot be found are reported and
    skipped. Prints a progress line every 100000 rows and, at the end, the
    number of link rows actually inserted.
    """

    def link_longest_prefix(sections, floor, entity_id, insert_link) -> bool:
        """Link ``entity_id`` to the abbreviation of its longest known prefix.

        URIs with 4 "/"-separated sections or fewer are ignored. Prefixes are
        tried from min(len(sections), 7) sections down to ``floor + 1``, and
        the search stops at the first prefix that has an abbreviation, whether
        or not the link row is new. Returns True only when a row was inserted.
        """
        if len(sections) <= 4:
            return False
        for index in range(min(len(sections), 7), floor, -1):
            pattern = "/".join(sections[:index]) + "%"
            abbreviation_id = select_abbreviation_id(pattern)
            if abbreviation_id is not None:
                return insert_link(entity_id, abbreviation_id)
        return False

    if selectOrigin(CURS, "dataset.csv") is None:
        return

    # The same handle is also read by parseRDF_Dataset; without rewinding,
    # whichever pass runs second would see an exhausted file.
    DATASET_CSV_HANDLER.seek(0)
    total = 0

    for rdf_idx, row in enumerate(csv.DictReader(DATASET_CSV_HANDLER), start=1):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{subject} - {relationship} - {obj}")

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id, then skip the row
        skip = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            skip = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            skip = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            skip = True
        if skip:
            continue

        # (URI sections, shortest prefix length - 1, entity id, link inserter)
        # Relationships are allowed one section shorter than the others.
        targets = (
            (subject.split("/"), 3, subject_id, insert_subject_abbreviation),
            (relationship.split("/"), 2, relationship_id, insert_relationship_abbreviation),
            (obj.split("/"), 3, object_id, insert_object_abbreviation),
        )
        for sections, floor, entity_id, insert_link in targets:
            if link_longest_prefix(sections, floor, entity_id, insert_link):
                total += 1

    print(total)


# MARK: Actual Code
# Pipeline steps. Later steps look up ids inserted by earlier ones, so keep
# this order and uncomment only the steps that still have to run.
try:
    # parseMovies()
    # parseWikiPageId()
    # parseAbstract()
    # insertOrigin(CURS)
    # parseAbbreviations()
    # parseRDF_Reverse()
    # parseRDF_Dataset()
    # parseAbbr_Reverse()
    parseAbbr_Dataset()

    CONN.commit()
finally:
    # Release the database and the input files even when a step fails;
    # in that case nothing is committed.
    CONN.close()

    MOVIES_CSV_HANDLER.close()
    PAGEID_CSV_HANDLER.close()
    SUMMARY_CSV_HANDLER.close()
    DATASET_CSV_HANDLER.close()
    REVERSE_CSV_HANDLER.close()
    URI_ABBR_CSV_HANDLER.close()


# Recorded output (message format of parseWikiPageId): movie URIs found in
# the page-id CSV that had no MovieID.
"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""

# Recorded output (message format of parseAbstract): wiki page ids found in
# the summary CSV that had no MovieID.
"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""