Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything

and instead builds the dataset
This commit is contained in:
GassiGiuseppe
2025-09-22 17:11:49 +02:00
parent edd01a2c83
commit ac1ed42c49
4 changed files with 45 additions and 28 deletions

View File

@@ -0,0 +1,45 @@
"""
What we have now: Saved AS:
Wikipeda-summary : PageId / abstract subject,text
Movies : Movie URI "subject"
Dataset : Movie URI / Relationship / Object [RDF] subject,relationship,object
Movies-PageId : Movie URI / PageId (wiki) "subject", "object"
Reverse : Subject / Relationship / Movie URI "subject","relationship","object"
What we want:
( we will generate MovieID)
Movies : MovieID [PK] / Movie URI
WikiPageIDs : MovieID [PK, FK]/ PageId [IDX] (wiki) (Not important for now)
Abstracts : MovieID [PK, FK]/ abstract
Subjects : SubjectID [PK] / RDF Subject ( both from either Dataset.csv or Reverse.csv) / OriginID [FK]
Relationships : RelationshipID [PK]/ RDF Relationship (not the actual relationshi but the value)
Objects : ObjectID [PK]/ RDF Object / OriginID [FK]
Origins : OriginID [PK]/ Origin Name
RDFs : RDF_ID[PK] / MovieID [FK] / SubjectID [FK]/ RelationshipID [FK]/ ObjectID [FK]
What we will build for the model
we need RDF list for each movie together with abstract
: MovieID / RDF_set / abstrct
"""
import sqlite3
# Create a SQL connection to our SQLite database
con = sqlite3.connect("data/portal_mammals.sqlite")
cur = con.cursor()
# Return all results of query
cur.execute('SELECT plot_id FROM plots WHERE plot_type="Control"')
cur.fetchall()
# Return first result of query
cur.execute('SELECT species FROM species WHERE taxa="Bird"')
cur.fetchone()
# Be sure to close the connection
con.close()

View File

@@ -0,0 +1,249 @@
import sqlite3
import csv
# --- Global configuration ---
# Target SQLite database; its schema must be created beforehand (db_creation.sql).
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
# Input CSVs (1-hop crawl); column layouts are documented in each loader below.
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"        # "subject" = movie URI
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"  # "subject","object" = movie URI, wiki page id
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"  # "subject","text" = page id, abstract
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"      # movie-as-subject triples
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"      # movie-as-object triples
# --- Helper: idempotent insert-or-select ---
def get_or_create(cursor, table, column, value, origin_id=None):
    """Insert *value* into *table* if absent, then return its integer ID.

    Subjects and Objects rows also carry an OriginID, Relationships do not,
    so ``origin_id`` is optional.  The INSERT is attempted first and a
    UNIQUE-constraint violation is swallowed; catching the IntegrityError
    (rather than letting a failed INSERT through) keeps AUTOINCREMENT IDs
    contiguous.

    The ID column name is derived by stripping the trailing "s" from the
    table name: Subjects -> SubjectID, Objects -> ObjectID,
    Relationships -> RelationshipID (somewhat hardcoded).

    NOTE: ``table`` and ``column`` are interpolated into the SQL text, so
    they must only ever come from trusted code, never from external input.

    Raises:
        LookupError: if the row cannot be found even after the insert
            attempt (previously this surfaced as an opaque TypeError from
            subscripting None).
    """
    try:
        if origin_id is not None:
            cursor.execute(
                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
                (value, int(origin_id)),
            )
        else:
            cursor.execute(
                f"INSERT INTO {table} ({column}) VALUES (?)",
                (value,),
            )
    except sqlite3.IntegrityError:
        # Row already exists; fall through to the SELECT below.
        pass
    # Always fetch the ID, whether the row is new or pre-existing.
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    row = cursor.fetchone()  # a one-element tuple, or None if nothing matched
    if row is None:
        raise LookupError(f"{table}.{column}={value!r} not found after insert attempt")
    return row[0]
# --- Load Movies ---
def load_movies():
    """Populate Movies (MovieID [PK] / MovieURI).

    MovieID is assigned by SQLite; duplicate URIs are silently skipped so
    AUTOINCREMENT IDs stay contiguous.
    """
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    inserted = 0
    # MOVIES_CSV has a single column: "subject" (the movie URI).
    with open(MOVIES_CSV, newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            uri = record["subject"].strip()
            try:
                cursor.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (uri,))
            except sqlite3.IntegrityError:
                continue  # URI already present: nothing inserted, nothing counted
            inserted += 1  # count only genuinely new rows
    connection.commit()  # suggested by dr
    connection.close()
    print(f"Movies loaded: {inserted}")
# --- Load Origins ---
def load_origins():
    """Populate Origins (OriginID [PK] / OriginName) with the two fixed sources."""
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    for name in ("Dataset", "Reverse"):
        try:
            cursor.execute("INSERT INTO Origins (OriginName) VALUES (?)", (name,))
        except sqlite3.IntegrityError:
            pass  # already present; keep the existing row and its ID
    connection.commit()
    connection.close()
    print("Origins loaded.")
# --- Load WikiPageIDs ---
def load_wikipageids():
    """Populate WikiPageIDs (MovieID [PK, FK] / PageID [IDX]) from the mapping CSV."""
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    inserted = 0
    # PAGEID_CSV columns: "subject" = movie URI, "object" = Wikipedia page id.
    with open(PAGEID_CSV, newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            uri = record["subject"].strip()
            page = int(record["object"])
            cursor.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (uri,))
            found = cursor.fetchone()
            if not found:
                continue  # unknown movie URI: drop the mapping
            # Could be INSERT OR IGNORE instead of try/except.
            try:
                cursor.execute(
                    "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
                    (found[0], page),
                )
            except sqlite3.IntegrityError:
                continue  # mapping (or page id) already present
            inserted += 1
    connection.commit()
    connection.close()
    print(f"WikiPageIDs loaded: {inserted}")
# --- Load Wikipedia Abstracts ---
def load_abstracts():
    """Populate WikipediaAbstracts (MovieID [PK, FK] / Abstract).

    Each summary row is keyed by Wikipedia page id, which is resolved to a
    MovieID through the WikiPageIDs mapping table.
    """
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    inserted = 0
    # SUMMARY_CSV columns: "subject" = Wikipedia page id, "text" = abstract.
    with open(SUMMARY_CSV, newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            page = int(record["subject"])
            summary = record["text"].strip()
            # WikiPageIDs: MovieID [PK, FK] / PageId [IDX] (wiki)
            cursor.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page,))
            found = cursor.fetchone()  # (MovieID,) or None
            if not found:
                continue  # no movie mapped to this page id
            # Could be INSERT OR IGNORE instead of try/except.
            try:
                cursor.execute(
                    "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
                    (found[0], summary),
                )
            except sqlite3.IntegrityError as err:
                print(err)  # duplicate abstract for a movie: report and keep going
            else:
                inserted += 1
    connection.commit()
    connection.close()
    print(f"WikipediaAbstracts loaded: {inserted}")
# --- Load Dataset RDFs ---
def load_dataset():
    """Load Dataset.csv triples into Subjects/Relationships/Objects/RDFs.

    Each row is (movie URI, relationship URI, object URI) with the movie as
    the RDF subject.  Rows whose movie URI is not in Movies are skipped and
    counted in the "Skipped Movies" total.
    """
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # Resolve the OriginID tag for rows coming from Dataset.csv.
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")
    ####
    total = 0
    skipped_movie = 0
    # DATASET_CSV: "subject","relationship","object" -> MovieURI, RelationshipURI, ObjectURI
    with open(DATASET_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            object_uri = row["object"].strip()
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                # Guard: no MovieID exists for this MovieURI, skip the triple.
                # BUGFIX: was `skipped_movie += skipped_movie`, which kept the
                # counter stuck at 0 forever.
                skipped_movie += 1
                continue
            # Map each URI to its surrogate ID, inserting on first sight.
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)
            # Insert the triple only if it is not already recorded.
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Dataset RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")
# --- Load Reverse RDFs ---
def load_reverse():
    """Load Reverse.csv triples into Subjects/Relationships/Objects/RDFs.

    Mirror of :func:`load_dataset` for rows where the movie is the RDF
    *object*: each row is (subject URI, relationship URI, movie URI).
    Rows whose movie URI is not in Movies are skipped and counted.
    """
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # Resolve the OriginID tag for rows coming from Reverse.csv.
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")
    ###
    total = 0
    skipped_movie = 0
    # REVERSE_CSV : "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
    with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            subject_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            movie_uri = row["object"].strip()
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                # Guard: no MovieID exists for this MovieURI, skip the triple.
                # BUGFIX: was `skipped_movie += skipped_movie`, which kept the
                # counter stuck at 0 forever.
                skipped_movie += 1
                continue
            # Map each URI to its surrogate ID, inserting on first sight.
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)
            # Insert the triple only if it is not already recorded.
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Reverse RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")
# --- Execution order ---
# Guarded so importing this module (e.g. for get_or_create) does not run the
# whole pipeline.  Order matters: Movies and Origins must exist before the
# tables that reference them, and WikiPageIDs before the abstracts mapping.
if __name__ == "__main__":
    load_movies()
    load_origins()
    load_wikipageids()
    load_abstracts()
    load_dataset()
    load_reverse()
# The schema must be created first, e.g.:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

View File

@@ -0,0 +1,65 @@
-- Schema for the merged movie/RDF dataset.
-- Run once before loading:
--   sqlite3 ./Assets/Dataset/Tmp/dataset.db < db_creation.sql

-- One row per movie URI; MovieID is the surrogate key used everywhere else.
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);

-- Movie -> Wikipedia page id mapping (1:1 on both sides).
CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

-- One abstract per movie, resolved through WikiPageIDs.
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

-- Provenance of a subject/object URI: "Dataset" or "Reverse".
CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    -- FIX: was BIGINT; declared INTEGER to match Origins.OriginID
    -- (same storage affinity in SQLite, so existing data is unaffected).
    OriginID INTEGER NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    -- FIX: was BIGINT; declared INTEGER to match Origins.OriginID.
    OriginID INTEGER NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

-- One row per (subject, relationship, object) triple, tagged with its movie.
-- NOTE(review): the UNIQUE constraint assumes a given triple belongs to
-- exactly one movie -- confirm this holds across Dataset.csv and Reverse.csv.
CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

-- Lookup indexes for the per-movie and per-component RDF queries.
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);