# Builds the Tmp/dataset.db SQLite database from the 1-hop CSV exports.
import sqlite3
import csv
# --- Global configuration ---
|
|
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
|
|
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
|
|
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
|
|
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
|
|
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
|
|
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
|
|
|
|
|
|
# --- Helper: idempotent insert-or-select ---
|
|
def get_or_create(cursor, table, column, value, origin_id=None):
|
|
# tries to put new values in db, then get the id (regardless of the check)
|
|
# Subjects and Objects need origin_id. Relationships do not
|
|
# try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
|
|
|
|
try:
|
|
if origin_id is not None:
|
|
cursor.execute(
|
|
f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
|
|
(value, int(origin_id)),
|
|
)
|
|
else:
|
|
cursor.execute(
|
|
f"INSERT INTO {table} ({column}) VALUES (?)",
|
|
(value,),
|
|
)
|
|
except sqlite3.IntegrityError:
|
|
# Row already exists, do nothing
|
|
pass
|
|
|
|
# Always fetch the ID (whether new or existing)
|
|
# {table[:-1]}ID ->
|
|
# Subjects -> SubjectID
|
|
# Objects -> ObjectID
|
|
# Relationships -> RelationshipID
|
|
# kinda hardcoded
|
|
cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
|
|
return cursor.fetchone()[0] # fetchone returns a list with one element
|
|
|
|
|
|
|
|
# --- Load Movies ---
|
|
def load_movies():
|
|
# Creates Movies: MovieID [PK] / Movie URI
|
|
# MovieID is managed by sql
|
|
conn = sqlite3.connect(DB_NAME)
|
|
cur = conn.cursor()
|
|
total = 0
|
|
# MOVIES_CSV: "subject" [it has only this column]
|
|
with open(MOVIES_CSV, newline="", encoding="utf-8") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
movie_uri = row["subject"].strip()
|
|
# try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
|
|
try:
|
|
cur.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (movie_uri,))
|
|
total += 1 # count only if a new row was added
|
|
except sqlite3.IntegrityError:
|
|
# already exists, skip
|
|
pass
|
|
conn.commit() # suggested by dr
|
|
conn.close()
|
|
print(f"Movies loaded: {total}")
|
|
|
|
|
|
# --- Load Origins ---
|
|
def load_origins():
|
|
# Creates Origins: OriginID [PK]/ Origin Name
|
|
# ["Dataset", "Reverse"]
|
|
conn = sqlite3.connect(DB_NAME)
|
|
cur = conn.cursor()
|
|
for origin in ["Dataset", "Reverse"]:
|
|
try:
|
|
cur.execute("INSERT INTO Origins (OriginName) VALUES (?)", (origin,))
|
|
except sqlite3.IntegrityError:
|
|
pass
|
|
conn.commit()
|
|
conn.close()
|
|
print("Origins loaded.")
|
|
|
|
|
|
# --- Load WikiPageIDs ---
|
|
def load_wikipageids():
|
|
# Creates WikiPageIDs: MovieID [PK, FK]/ PageId [IDX] (wiki)
|
|
conn = sqlite3.connect(DB_NAME)
|
|
cur = conn.cursor()
|
|
total = 0
|
|
# PAGEID_CSV: "subject","object" -> MovieURI, WikiPageId
|
|
with open(PAGEID_CSV, newline="", encoding="utf-8") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
movie_uri = row["subject"].strip()
|
|
page_id = int(row["object"])
|
|
cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
|
|
movie = cur.fetchone()
|
|
if movie:
|
|
try:
|
|
# it can become INSERT OR IGNORE instead of try catch
|
|
cur.execute(
|
|
"INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
|
|
(movie[0], page_id),
|
|
)
|
|
total += 1
|
|
except sqlite3.IntegrityError:
|
|
pass
|
|
conn.commit()
|
|
conn.close()
|
|
print(f"WikiPageIDs loaded: {total}")
|
|
|
|
|
|
# --- Load Wikipedia Abstracts ---
|
|
def load_abstracts():
|
|
# Cretes WikipediaAbstracts: MovieID [PK, FK]/ abstract
|
|
conn = sqlite3.connect(DB_NAME)
|
|
cur = conn.cursor()
|
|
total = 0
|
|
# SUMMARY_CSV: subject,text -> WikiPageID / abstract
|
|
with open(SUMMARY_CSV, newline="", encoding="utf-8") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
page_id = int(row["subject"])
|
|
abstract = row["text"].strip()
|
|
# WikiPageIDs: MovieID [PK, FK]/ PageId [IDX] (wiki)
|
|
cur.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page_id,))
|
|
movie = cur.fetchone() # which is MovieID
|
|
if movie:
|
|
try:
|
|
# it can become INSERT OR IGNORE instead of try catch
|
|
cur.execute(
|
|
"INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
|
|
(movie[0], abstract),
|
|
)
|
|
total += 1
|
|
except sqlite3.IntegrityError as e:
|
|
print(e)
|
|
pass
|
|
conn.commit()
|
|
conn.close()
|
|
print(f"WikipediaAbstracts loaded: {total}")
|
|
|
|
|
|
# --- Load Dataset RDFs ---
|
|
def load_dataset():
|
|
conn = sqlite3.connect(DB_NAME)
|
|
cur = conn.cursor()
|
|
# get oridin_id from datset
|
|
cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
|
|
origin_id = int(cur.fetchone()[0])
|
|
print(f"Origin_id is: {origin_id}")
|
|
####
|
|
total = 0
|
|
skipped_movie = 0
|
|
# DATASET_CSV: "subject","relationshi","object" -> MovieUri, RelationshipUri, ObjectUri
|
|
with open(DATASET_CSV, newline="", encoding="utf-8") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
movie_uri = row["subject"].strip()
|
|
relationship_uri = row["relationship"].strip()
|
|
object_uri = row["object"].strip()
|
|
|
|
cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
|
|
movie = cur.fetchone()
|
|
if not movie:
|
|
skipped_movie += skipped_movie
|
|
continue
|
|
# it is a guard check if it doest exist a MovieID from the given MovieURI, then skip
|
|
|
|
# now put each URI into their SCHEMA and retrieves IDs instead
|
|
subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
|
|
relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
|
|
object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)
|
|
|
|
# check if the triple is already in the RDF
|
|
cur.execute(
|
|
"SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
|
|
(subject_id, relationship_id, object_id),
|
|
)
|
|
if not cur.fetchone():
|
|
cur.execute(
|
|
"INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
|
|
(movie[0], subject_id, relationship_id, object_id),
|
|
)
|
|
total += 1
|
|
conn.commit()
|
|
conn.close()
|
|
print(f"Dataset RDFs loaded: {total}")
|
|
print(f"Skipped Movies: {skipped_movie}")
|
|
|
|
|
|
# --- Load Reverse RDFs ---
|
|
def load_reverse():
|
|
conn = sqlite3.connect(DB_NAME)
|
|
cur = conn.cursor()
|
|
# get oridin_id from datset
|
|
cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
|
|
origin_id = int(cur.fetchone()[0])
|
|
print(f"Origin_id is: {origin_id}")
|
|
###
|
|
total = 0
|
|
skipped_movie = 0
|
|
# REVERSE_CSV : "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
|
|
with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
|
|
reader = csv.DictReader(f)
|
|
for row in reader:
|
|
subject_uri = row["subject"].strip()
|
|
relationship_uri = row["relationship"].strip()
|
|
movie_uri = row["object"].strip()
|
|
|
|
cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
|
|
movie = cur.fetchone()
|
|
if not movie:
|
|
skipped_movie += skipped_movie
|
|
continue
|
|
# it is a guard check if it doest exist a MovieID from the given MovieURI, then skip
|
|
|
|
# now put each URI into their SCHEMA and retrieves IDs instead
|
|
subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
|
|
relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
|
|
object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)
|
|
|
|
# check if the triple is already in the RDF
|
|
cur.execute(
|
|
"SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
|
|
(subject_id, relationship_id, object_id),
|
|
)
|
|
if not cur.fetchone():
|
|
cur.execute(
|
|
"INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
|
|
(movie[0], subject_id, relationship_id, object_id),
|
|
)
|
|
total += 1
|
|
conn.commit()
|
|
conn.close()
|
|
print(f"Reverse RDFs loaded: {total}")
|
|
print(f"Skipped Movies: {skipped_movie}")
|
|
|
|
|
|
|
|
# --- Execution order ---
|
|
load_movies()
|
|
load_origins()
|
|
load_wikipageids()
|
|
load_abstracts()
|
|
load_dataset()
|
|
load_reverse()
|
|
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql |