Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything

and instead builds the dataset
This commit is contained in:
GassiGiuseppe
2025-09-22 17:11:49 +02:00
parent edd01a2c83
commit ac1ed42c49
4 changed files with 45 additions and 28 deletions

View File

@@ -0,0 +1,45 @@
"""
What we have now: Saved AS:
Wikipeda-summary : PageId / abstract subject,text
Movies : Movie URI "subject"
Dataset : Movie URI / Relationship / Object [RDF] subject,relationship,object
Movies-PageId : Movie URI / PageId (wiki) "subject", "object"
Reverse : Subject / Relationship / Movie URI "subject","relationship","object"
What we want:
( we will generate MovieID)
Movies : MovieID [PK] / Movie URI
WikiPageIDs : MovieID [PK, FK]/ PageId [IDX] (wiki) (Not important for now)
Abstracts : MovieID [PK, FK]/ abstract
Subjects : SubjectID [PK] / RDF Subject ( both from either Dataset.csv or Reverse.csv) / OriginID [FK]
Relationships : RelationshipID [PK]/ RDF Relationship (not the actual relationshi but the value)
Objects : ObjectID [PK]/ RDF Object / OriginID [FK]
Origins : OriginID [PK]/ Origin Name
RDFs : RDF_ID[PK] / MovieID [FK] / SubjectID [FK]/ RelationshipID [FK]/ ObjectID [FK]
What we will build for the model
we need RDF list for each movie together with abstract
: MovieID / RDF_set / abstrct
"""
import sqlite3
# Create a SQL connection to our SQLite database
con = sqlite3.connect("data/portal_mammals.sqlite")
cur = con.cursor()
# Return all results of query
cur.execute('SELECT plot_id FROM plots WHERE plot_type="Control"')
cur.fetchall()
# Return first result of query
cur.execute('SELECT species FROM species WHERE taxa="Bird"')
cur.fetchone()
# Be sure to close the connection
con.close()

View File

@@ -0,0 +1,249 @@
import sqlite3
import csv
# --- Global configuration ---
# Target SQLite database; its schema must be created beforehand (db_creation.sql).
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
# Input CSVs (1-hop crawl); column layouts are documented in each loader below.
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"        # "subject" = movie URI
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"  # "subject","object" = movie URI, wiki page id
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"  # "subject","text" = page id, abstract
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"      # movie-as-subject triples
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"      # movie-as-object triples
# --- Helper: idempotent insert-or-select ---
def get_or_create(cursor, table, column, value, origin_id=None):
    """Insert *value* into *table* if absent, then return its integer ID.

    Subjects and Objects rows also carry an OriginID, Relationships do not,
    so ``origin_id`` is optional.  The INSERT is attempted first and a
    UNIQUE-constraint violation is swallowed; catching the IntegrityError
    (rather than letting a failed INSERT through) keeps AUTOINCREMENT IDs
    contiguous.

    The ID column name is derived by stripping the trailing "s" from the
    table name: Subjects -> SubjectID, Objects -> ObjectID,
    Relationships -> RelationshipID (somewhat hardcoded).

    NOTE: ``table`` and ``column`` are interpolated into the SQL text, so
    they must only ever come from trusted code, never from external input.

    Raises:
        LookupError: if the row cannot be found even after the insert
            attempt (previously this surfaced as an opaque TypeError from
            subscripting None).
    """
    try:
        if origin_id is not None:
            cursor.execute(
                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
                (value, int(origin_id)),
            )
        else:
            cursor.execute(
                f"INSERT INTO {table} ({column}) VALUES (?)",
                (value,),
            )
    except sqlite3.IntegrityError:
        # Row already exists; fall through to the SELECT below.
        pass
    # Always fetch the ID, whether the row is new or pre-existing.
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    row = cursor.fetchone()  # a one-element tuple, or None if nothing matched
    if row is None:
        raise LookupError(f"{table}.{column}={value!r} not found after insert attempt")
    return row[0]
# --- Load Movies ---
def load_movies():
    """Populate Movies (MovieID [PK] / MovieURI).

    MovieID is assigned by SQLite; duplicate URIs are silently skipped so
    AUTOINCREMENT IDs stay contiguous.
    """
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    inserted = 0
    # MOVIES_CSV has a single column: "subject" (the movie URI).
    with open(MOVIES_CSV, newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            uri = record["subject"].strip()
            try:
                cursor.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (uri,))
            except sqlite3.IntegrityError:
                continue  # URI already present: nothing inserted, nothing counted
            inserted += 1  # count only genuinely new rows
    connection.commit()  # suggested by dr
    connection.close()
    print(f"Movies loaded: {inserted}")
# --- Load Origins ---
def load_origins():
    """Populate Origins (OriginID [PK] / OriginName) with the two fixed sources."""
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    for name in ("Dataset", "Reverse"):
        try:
            cursor.execute("INSERT INTO Origins (OriginName) VALUES (?)", (name,))
        except sqlite3.IntegrityError:
            pass  # already present; keep the existing row and its ID
    connection.commit()
    connection.close()
    print("Origins loaded.")
# --- Load WikiPageIDs ---
def load_wikipageids():
    """Populate WikiPageIDs (MovieID [PK, FK] / PageID [IDX]) from the mapping CSV."""
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    inserted = 0
    # PAGEID_CSV columns: "subject" = movie URI, "object" = Wikipedia page id.
    with open(PAGEID_CSV, newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            uri = record["subject"].strip()
            page = int(record["object"])
            cursor.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (uri,))
            found = cursor.fetchone()
            if not found:
                continue  # unknown movie URI: drop the mapping
            # Could be INSERT OR IGNORE instead of try/except.
            try:
                cursor.execute(
                    "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
                    (found[0], page),
                )
            except sqlite3.IntegrityError:
                continue  # mapping (or page id) already present
            inserted += 1
    connection.commit()
    connection.close()
    print(f"WikiPageIDs loaded: {inserted}")
# --- Load Wikipedia Abstracts ---
def load_abstracts():
    """Populate WikipediaAbstracts (MovieID [PK, FK] / Abstract).

    Each summary row is keyed by Wikipedia page id, which is resolved to a
    MovieID through the WikiPageIDs mapping table.
    """
    connection = sqlite3.connect(DB_NAME)
    cursor = connection.cursor()
    inserted = 0
    # SUMMARY_CSV columns: "subject" = Wikipedia page id, "text" = abstract.
    with open(SUMMARY_CSV, newline="", encoding="utf-8") as csv_file:
        for record in csv.DictReader(csv_file):
            page = int(record["subject"])
            summary = record["text"].strip()
            # WikiPageIDs: MovieID [PK, FK] / PageId [IDX] (wiki)
            cursor.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page,))
            found = cursor.fetchone()  # (MovieID,) or None
            if not found:
                continue  # no movie mapped to this page id
            # Could be INSERT OR IGNORE instead of try/except.
            try:
                cursor.execute(
                    "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
                    (found[0], summary),
                )
            except sqlite3.IntegrityError as err:
                print(err)  # duplicate abstract for a movie: report and keep going
            else:
                inserted += 1
    connection.commit()
    connection.close()
    print(f"WikipediaAbstracts loaded: {inserted}")
# --- Load Dataset RDFs ---
def load_dataset():
    """Load Dataset.csv triples into Subjects/Relationships/Objects/RDFs.

    Each row is (movie URI, relationship URI, object URI) with the movie as
    the RDF subject.  Rows whose movie URI is not in Movies are skipped and
    counted in the "Skipped Movies" total.
    """
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # Resolve the OriginID tag for rows coming from Dataset.csv.
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")
    ####
    total = 0
    skipped_movie = 0
    # DATASET_CSV: "subject","relationship","object" -> MovieURI, RelationshipURI, ObjectURI
    with open(DATASET_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            object_uri = row["object"].strip()
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                # Guard: no MovieID exists for this MovieURI, skip the triple.
                # BUGFIX: was `skipped_movie += skipped_movie`, which kept the
                # counter stuck at 0 forever.
                skipped_movie += 1
                continue
            # Map each URI to its surrogate ID, inserting on first sight.
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)
            # Insert the triple only if it is not already recorded.
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Dataset RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")
# --- Load Reverse RDFs ---
def load_reverse():
    """Load Reverse.csv triples into Subjects/Relationships/Objects/RDFs.

    Mirror of :func:`load_dataset` for rows where the movie is the RDF
    *object*: each row is (subject URI, relationship URI, movie URI).
    Rows whose movie URI is not in Movies are skipped and counted.
    """
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # Resolve the OriginID tag for rows coming from Reverse.csv.
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")
    ###
    total = 0
    skipped_movie = 0
    # REVERSE_CSV : "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
    with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            subject_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            movie_uri = row["object"].strip()
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                # Guard: no MovieID exists for this MovieURI, skip the triple.
                # BUGFIX: was `skipped_movie += skipped_movie`, which kept the
                # counter stuck at 0 forever.
                skipped_movie += 1
                continue
            # Map each URI to its surrogate ID, inserting on first sight.
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)
            # Insert the triple only if it is not already recorded.
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Reverse RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")
# --- Execution order ---
# Guarded so importing this module (e.g. for get_or_create) does not run the
# whole pipeline.  Order matters: Movies and Origins must exist before the
# tables that reference them, and WikiPageIDs before the abstracts mapping.
if __name__ == "__main__":
    load_movies()
    load_origins()
    load_wikipageids()
    load_abstracts()
    load_dataset()
    load_reverse()
# The schema must be created first, e.g.:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

View File

@@ -0,0 +1,65 @@
-- Schema for the merged movie/RDF dataset.
-- Run once before loading:
--   sqlite3 ./Assets/Dataset/Tmp/dataset.db < db_creation.sql

-- One row per movie URI; MovieID is the surrogate key used everywhere else.
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);

-- Movie -> Wikipedia page id mapping (1:1 on both sides).
CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

-- One abstract per movie, resolved through WikiPageIDs.
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

-- Provenance of a subject/object URI: "Dataset" or "Reverse".
CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    -- FIX: was BIGINT; declared INTEGER to match Origins.OriginID
    -- (same storage affinity in SQLite, so existing data is unaffected).
    OriginID INTEGER NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    -- FIX: was BIGINT; declared INTEGER to match Origins.OriginID.
    OriginID INTEGER NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

-- One row per (subject, relationship, object) triple, tagged with its movie.
-- NOTE(review): the UNIQUE constraint assumes a given triple belongs to
-- exactly one movie -- confirm this holds across Dataset.csv and Reverse.csv.
CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

-- Lookup indexes for the per-movie and per-component RDF queries.
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);