Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything
and instead builds the dataset
@@ -1,28 +0,0 @@
"""
What we have now:

Wikipedia-summary : PageId / abstract
Movies : Movie URI
Dataset : Movie URI / Relationship / Object [RDF]
Movies-PageId : Movie URI / PageId (wiki)
Reverse : Subject / Relationship / Movie URI

What we want:
(we will generate MovieID)
Movies : MovieID [PK] / Movie URI
WikiPageIDs : MovieID [PK, FK] / PageId [IDX] (wiki) (not important for now)
Abstracts : MovieID [PK, FK] / abstract
Subjects : SubjectID [PK] / RDF Subject (from either Dataset.csv or Reverse.csv) / OriginID [FK]
Relationships : RelationshipID [PK] / RDF Relationship (not the actual relationship but the value)
Objects : ObjectID [PK] / RDF Object / OriginID [FK]
Origins : OriginID [PK] / Origin Name
RDFs : RDF_ID [PK] / MovieID [FK] / SubjectID [FK] / RelationshipID [FK] / ObjectID [FK]

What we will build for the model:

We need the RDF list for each movie together with its abstract:

MovieID / RDF_set / abstract

"""
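As a rough sketch of the "MovieID / RDF_set / abstract" rows described above (not part of the original scripts; the function name and the GROUP_CONCAT formatting are only illustrative), the tables defined in db_creation.sql below could be joined like this once populated:

import sqlite3

def fetch_model_rows(db_path="./Assets/Dataset/Tmp/dataset.db"):
    # One row per movie: its ID, a concatenated set of its RDF triples, and its abstract.
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute(
        """
        SELECT m.MovieID,
               GROUP_CONCAT(s.SubjectURI || ' ' || r.RelationshipURI || ' ' || o.ObjectURI, ' | ') AS RDF_set,
               a.Abstract
        FROM Movies m
        JOIN WikipediaAbstracts a ON a.MovieID = m.MovieID
        JOIN RDFs t ON t.MovieID = m.MovieID
        JOIN Subjects s ON s.SubjectID = t.SubjectID
        JOIN Relationships r ON r.RelationshipID = t.RelationshipID
        JOIN Objects o ON o.ObjectID = t.ObjectID
        GROUP BY m.MovieID
        """
    )
    rows = cur.fetchall()
    conn.close()
    return rows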
@@ -1,249 +0,0 @@
import sqlite3
import csv

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"


# --- Helper: idempotent insert-or-select ---
def get_or_create(cursor, table, column, value, origin_id=None):
    # Try to insert the value, then fetch its ID regardless of whether the insert succeeded.
    # Subjects and Objects need origin_id; Relationships do not.
    # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
    try:
        if origin_id is not None:
            cursor.execute(
                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
                (value, int(origin_id)),
            )
        else:
            cursor.execute(
                f"INSERT INTO {table} ({column}) VALUES (?)",
                (value,),
            )
    except sqlite3.IntegrityError:
        # Row already exists, do nothing
        pass

    # Always fetch the ID (whether new or existing)
    # {table[:-1]}ID ->
    #   Subjects -> SubjectID
    #   Objects -> ObjectID
    #   Relationships -> RelationshipID
    # (naming convention is somewhat hardcoded)
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    return cursor.fetchone()[0]  # fetchone returns a single-row tuple; take its only column

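# A possible INSERT OR IGNORE variant of get_or_create, hinted at by the
# "it can become INSERT OR IGNORE instead of try catch" comments in the
# loaders below. This is only a sketch of that alternative, not part of the
# original script; the ID column name is derived the same way as above.
def get_or_create_ignore(cursor, table, column, value, origin_id=None):
    if origin_id is not None:
        cursor.execute(
            f"INSERT OR IGNORE INTO {table} ({column}, OriginID) VALUES (?, ?)",
            (value, int(origin_id)),
        )
    else:
        cursor.execute(f"INSERT OR IGNORE INTO {table} ({column}) VALUES (?)", (value,))
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    return cursor.fetchone()[0]
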
# --- Load Movies ---
def load_movies():
    # Populates Movies: MovieID [PK] / MovieURI
    # MovieID is assigned by SQLite
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # MOVIES_CSV has a single column: "subject"
    with open(MOVIES_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
            try:
                cur.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (movie_uri,))
                total += 1  # count only if a new row was added
            except sqlite3.IntegrityError:
                # already exists, skip
                pass
    conn.commit()  # suggested by dr
    conn.close()
    print(f"Movies loaded: {total}")

# --- Load Origins ---
def load_origins():
    # Creates Origins: OriginID [PK] / Origin Name
    # with the fixed values ["Dataset", "Reverse"]
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    for origin in ["Dataset", "Reverse"]:
        try:
            cur.execute("INSERT INTO Origins (OriginName) VALUES (?)", (origin,))
        except sqlite3.IntegrityError:
            pass
    conn.commit()
    conn.close()
    print("Origins loaded.")

# --- Load WikiPageIDs ---
def load_wikipageids():
    # Creates WikiPageIDs: MovieID [PK, FK] / PageID [IDX] (wiki)
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # PAGEID_CSV: "subject","object" -> MovieURI, WikiPageID
    with open(PAGEID_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            page_id = int(row["object"])
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if movie:
                try:
                    # it can become INSERT OR IGNORE instead of try/except
                    cur.execute(
                        "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
                        (movie[0], page_id),
                    )
                    total += 1
                except sqlite3.IntegrityError:
                    pass
    conn.commit()
    conn.close()
    print(f"WikiPageIDs loaded: {total}")

# --- Load Wikipedia Abstracts ---
def load_abstracts():
    # Creates WikipediaAbstracts: MovieID [PK, FK] / Abstract
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # SUMMARY_CSV: "subject","text" -> WikiPageID / abstract
    with open(SUMMARY_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            page_id = int(row["subject"])
            abstract = row["text"].strip()
            # WikiPageIDs: MovieID [PK, FK] / PageID [IDX] (wiki)
            cur.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page_id,))
            movie = cur.fetchone()  # the matching MovieID, if any
            if movie:
                try:
                    # it can become INSERT OR IGNORE instead of try/except
                    cur.execute(
                        "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
                        (movie[0], abstract),
                    )
                    total += 1
                except sqlite3.IntegrityError as e:
                    print(e)
    conn.commit()
    conn.close()
    print(f"WikipediaAbstracts loaded: {total}")

# --- Load Dataset RDFs ---
def load_dataset():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # get origin_id for the "Dataset" origin
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")

    total = 0
    skipped_movie = 0
    # DATASET_CSV: "subject","relationship","object" -> MovieURI, RelationshipURI, ObjectURI
    with open(DATASET_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            object_uri = row["object"].strip()

            # guard check: if there is no MovieID for the given MovieURI, skip the row
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                skipped_movie += 1
                continue

            # now put each URI into its own table and retrieve the IDs instead
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)

            # check if the triple is already in RDFs
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Dataset RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")

# --- Load Reverse RDFs ---
def load_reverse():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # get origin_id for the "Reverse" origin
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")

    total = 0
    skipped_movie = 0
    # REVERSE_CSV: "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
    with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            subject_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            movie_uri = row["object"].strip()

            # guard check: if there is no MovieID for the given MovieURI, skip the row
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                skipped_movie += 1
                continue

            # now put each URI into its own table and retrieve the IDs instead
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)

            # check if the triple is already in RDFs
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Reverse RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")


# --- Execution order ---
load_movies()
load_origins()
load_wikipageids()
load_abstracts()
load_dataset()
load_reverse()
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
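# --- Schema creation (sketch) ---
# The commented sqlite3 shell command above applies db_creation.sql (below) to
# the database. Purely as an illustration, and not part of the original script,
# the same schema file could be applied from Python before the loaders run:
def create_schema(sql_path="./Scripts/DataCleaning/SQL_Queries/db_creation.sql"):
    with open(sql_path, encoding="utf-8") as f:
        ddl = f.read()
    conn = sqlite3.connect(DB_NAME)
    conn.executescript(ddl)  # runs every CREATE TABLE / CREATE INDEX statement
    conn.commit()
    conn.close()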
@@ -1,65 +0,0 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);


CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);


CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);


CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);