import sqlite3
import csv

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"


# --- Helper: idempotent insert-or-select ---
def get_or_create(cursor, table, column, value, origin_id=None):
    # Try to insert the value, then fetch its ID regardless of whether the insert succeeded.
    # Subjects and Objects need origin_id; Relationships do not.
    # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
    try:
        if origin_id is not None:
            cursor.execute(
                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
                (value, int(origin_id)),
            )
        else:
            cursor.execute(
                f"INSERT INTO {table} ({column}) VALUES (?)",
                (value,),
            )
    except sqlite3.IntegrityError:
        # Row already exists, do nothing
        pass
    # Always fetch the ID (whether new or existing).
    # The ID column name is hardcoded as the table name minus its trailing "s", plus "ID":
    #   Subjects      -> SubjectID
    #   Objects       -> ObjectID
    #   Relationships -> RelationshipID
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    return cursor.fetchone()[0]  # fetchone returns a one-element tuple


# --- Load Movies ---
def load_movies():
    # Fills Movies: MovieID [PK] / MovieURI
    # MovieID is assigned by SQLite
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # MOVIES_CSV: "subject" (its only column)
    with open(MOVIES_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
            try:
                cur.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (movie_uri,))
                total += 1  # count only if a new row was added
            except sqlite3.IntegrityError:
                # already exists, skip
                pass
    conn.commit()  # suggested by dr
    conn.close()
    print(f"Movies loaded: {total}")


# --- Load Origins ---
def load_origins():
    # Fills Origins: OriginID [PK] / OriginName
    # Values: ["Dataset", "Reverse"]
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    for origin in ["Dataset", "Reverse"]:
        try:
            cur.execute("INSERT INTO Origins (OriginName) VALUES (?)", (origin,))
        except sqlite3.IntegrityError:
            pass
    conn.commit()
    conn.close()
    print("Origins loaded.")


# --- Load WikiPageIDs ---
def load_wikipageids():
    # Fills WikiPageIDs: MovieID [PK, FK] / PageID [IDX] (Wikipedia page id)
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # PAGEID_CSV: "subject","object" -> MovieURI, WikiPageID
    with open(PAGEID_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            page_id = int(row["object"])
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if movie:
                try:
                    # could be INSERT OR IGNORE instead of try/except
                    cur.execute(
                        "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
                        (movie[0], page_id),
                    )
                    total += 1
                except sqlite3.IntegrityError:
                    pass
    conn.commit()
    conn.close()
    print(f"WikiPageIDs loaded: {total}")


# --- Load Wikipedia Abstracts ---
def load_abstracts():
    # Fills WikipediaAbstracts: MovieID [PK, FK] / Abstract
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # SUMMARY_CSV: "subject","text" -> WikiPageID, abstract
    with open(SUMMARY_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            page_id = int(row["subject"])
            abstract = row["text"].strip()
            # WikiPageIDs: MovieID [PK, FK] / PageID [IDX] (Wikipedia page id)
            cur.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page_id,))
            movie = cur.fetchone()  # (MovieID,) or None
            if movie:
                try:
                    # could be INSERT OR IGNORE instead of try/except
                    cur.execute(
                        "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
                        (movie[0], abstract),
                    )
                    total += 1
                except sqlite3.IntegrityError as e:
                    print(e)
    conn.commit()
    conn.close()
    print(f"WikipediaAbstracts loaded: {total}")


# --- Load Dataset RDFs ---
def load_dataset():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # get the OriginID for "Dataset"
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")
    total = 0
    skipped_movie = 0
    # DATASET_CSV: "subject","relationship","object" -> MovieURI, RelationshipURI, ObjectURI
    with open(DATASET_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            object_uri = row["object"].strip()
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                # guard: if no MovieID exists for the given MovieURI, skip the row
                skipped_movie += 1
                continue
            # insert each URI into its own table and retrieve the corresponding IDs
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)
            # skip the triple if it is already in RDFs
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Dataset RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")


# --- Load Reverse RDFs ---
def load_reverse():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # get the OriginID for "Reverse"
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")
    total = 0
    skipped_movie = 0
    # REVERSE_CSV: "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
    with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            subject_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            movie_uri = row["object"].strip()
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                # guard: if no MovieID exists for the given MovieURI, skip the row
                skipped_movie += 1
                continue
            # insert each URI into its own table and retrieve the corresponding IDs
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)
            # skip the triple if it is already in RDFs
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Reverse RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")


# --- Execution order ---
load_movies()
load_origins()
load_wikipageids()
load_abstracts()
load_dataset()
load_reverse()

# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
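

# --- Optional sanity check (illustrative sketch, not part of the original pipeline) ---
# A quick way to verify a load: count the rows in each table the script fills.
# The table names are taken from the INSERT statements above; the function name and the
# idea of running it after the loaders are assumptions, not part of the original script.
def print_row_counts():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    for table in ["Movies", "Origins", "WikiPageIDs", "WikipediaAbstracts",
                  "Subjects", "Relationships", "Objects", "RDFs"]:
        # table names come from the fixed list above, so string formatting is safe here
        cur.execute(f"SELECT COUNT(*) FROM {table}")
        print(f"{table}: {cur.fetchone()[0]} rows")
    conn.close()

# Uncomment to run the check after loading:
# print_row_counts()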