Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything
and instead builds the dataset
@@ -1,28 +0,0 @@
"""
What we have now:

Wikipedia-summary : PageId / abstract
Movies : Movie URI
Dataset : Movie URI / Relationship / Object [RDF]
Movies-PageId : Movie URI / PageId (wiki)
Reverse : Subject / Relationship / Movie URI

What we want:
(we will generate MovieID)
Movies : MovieID [PK] / Movie URI
WikiPageIDs : MovieID [PK, FK] / PageId [IDX] (wiki) (not important for now)
Abstracts : MovieID [PK, FK] / abstract
Subjects : SubjectID [PK] / RDF Subject (from either Dataset.csv or Reverse.csv) / OriginID [FK]
Relationships : RelationshipID [PK] / RDF Relationship (not the actual relationship but the value)
Objects : ObjectID [PK] / RDF Object / OriginID [FK]
Origins : OriginID [PK] / Origin Name
RDFs : RDF_ID [PK] / MovieID [FK] / SubjectID [FK] / RelationshipID [FK] / ObjectID [FK]

What we will build for the model:

We need the RDF list for each movie together with its abstract:

MovieID / RDF_set / abstract

"""
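As a rough sketch of the "MovieID / RDF_set / abstract" rows described above (not part of the original scripts; the function name and the GROUP_CONCAT formatting are only illustrative), the tables defined in db_creation.sql below could be joined like this once populated:

import sqlite3

def fetch_model_rows(db_path="./Assets/Dataset/Tmp/dataset.db"):
    # One row per movie: its ID, a concatenated set of its RDF triples, and its abstract.
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute(
        """
        SELECT m.MovieID,
               GROUP_CONCAT(s.SubjectURI || ' ' || r.RelationshipURI || ' ' || o.ObjectURI, ' | ') AS RDF_set,
               a.Abstract
        FROM Movies m
        JOIN WikipediaAbstracts a ON a.MovieID = m.MovieID
        JOIN RDFs t ON t.MovieID = m.MovieID
        JOIN Subjects s ON s.SubjectID = t.SubjectID
        JOIN Relationships r ON r.RelationshipID = t.RelationshipID
        JOIN Objects o ON o.ObjectID = t.ObjectID
        GROUP BY m.MovieID
        """
    )
    rows = cur.fetchall()
    conn.close()
    return rows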
@@ -1,249 +0,0 @@
import sqlite3
import csv

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"


# --- Helper: idempotent insert-or-select ---
def get_or_create(cursor, table, column, value, origin_id=None):
    # Try to insert the value, then fetch its ID regardless of whether the insert succeeded.
    # Subjects and Objects need origin_id; Relationships do not.
    # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
    try:
        if origin_id is not None:
            cursor.execute(
                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
                (value, int(origin_id)),
            )
        else:
            cursor.execute(
                f"INSERT INTO {table} ({column}) VALUES (?)",
                (value,),
            )
    except sqlite3.IntegrityError:
        # Row already exists, do nothing
        pass

    # Always fetch the ID (whether new or existing)
    # {table[:-1]}ID ->
    #   Subjects -> SubjectID
    #   Objects -> ObjectID
    #   Relationships -> RelationshipID
    # (naming convention is somewhat hardcoded)
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    return cursor.fetchone()[0]  # fetchone returns a single-row tuple; take its only column

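# A possible INSERT OR IGNORE variant of get_or_create, hinted at by the
# "it can become INSERT OR IGNORE instead of try catch" comments in the
# loaders below. This is only a sketch of that alternative, not part of the
# original script; the ID column name is derived the same way as above.
def get_or_create_ignore(cursor, table, column, value, origin_id=None):
    if origin_id is not None:
        cursor.execute(
            f"INSERT OR IGNORE INTO {table} ({column}, OriginID) VALUES (?, ?)",
            (value, int(origin_id)),
        )
    else:
        cursor.execute(f"INSERT OR IGNORE INTO {table} ({column}) VALUES (?)", (value,))
    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
    return cursor.fetchone()[0]
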
# --- Load Movies ---
def load_movies():
    # Populates Movies: MovieID [PK] / MovieURI
    # MovieID is assigned by SQLite
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # MOVIES_CSV has a single column: "subject"
    with open(MOVIES_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
            try:
                cur.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (movie_uri,))
                total += 1  # count only if a new row was added
            except sqlite3.IntegrityError:
                # already exists, skip
                pass
    conn.commit()  # suggested by dr
    conn.close()
    print(f"Movies loaded: {total}")

# --- Load Origins ---
def load_origins():
    # Creates Origins: OriginID [PK] / Origin Name
    # with the fixed values ["Dataset", "Reverse"]
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    for origin in ["Dataset", "Reverse"]:
        try:
            cur.execute("INSERT INTO Origins (OriginName) VALUES (?)", (origin,))
        except sqlite3.IntegrityError:
            pass
    conn.commit()
    conn.close()
    print("Origins loaded.")

# --- Load WikiPageIDs ---
def load_wikipageids():
    # Creates WikiPageIDs: MovieID [PK, FK] / PageID [IDX] (wiki)
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # PAGEID_CSV: "subject","object" -> MovieURI, WikiPageID
    with open(PAGEID_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            page_id = int(row["object"])
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if movie:
                try:
                    # it can become INSERT OR IGNORE instead of try/except
                    cur.execute(
                        "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
                        (movie[0], page_id),
                    )
                    total += 1
                except sqlite3.IntegrityError:
                    pass
    conn.commit()
    conn.close()
    print(f"WikiPageIDs loaded: {total}")

# --- Load Wikipedia Abstracts ---
def load_abstracts():
    # Creates WikipediaAbstracts: MovieID [PK, FK] / Abstract
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    total = 0
    # SUMMARY_CSV: "subject","text" -> WikiPageID / abstract
    with open(SUMMARY_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            page_id = int(row["subject"])
            abstract = row["text"].strip()
            # WikiPageIDs: MovieID [PK, FK] / PageID [IDX] (wiki)
            cur.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page_id,))
            movie = cur.fetchone()  # the matching MovieID, if any
            if movie:
                try:
                    # it can become INSERT OR IGNORE instead of try/except
                    cur.execute(
                        "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
                        (movie[0], abstract),
                    )
                    total += 1
                except sqlite3.IntegrityError as e:
                    print(e)
    conn.commit()
    conn.close()
    print(f"WikipediaAbstracts loaded: {total}")

# --- Load Dataset RDFs ---
def load_dataset():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # get origin_id for the "Dataset" origin
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")

    total = 0
    skipped_movie = 0
    # DATASET_CSV: "subject","relationship","object" -> MovieURI, RelationshipURI, ObjectURI
    with open(DATASET_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            movie_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            object_uri = row["object"].strip()

            # guard check: if there is no MovieID for the given MovieURI, skip the row
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                skipped_movie += 1
                continue

            # now put each URI into its own table and retrieve the IDs instead
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)

            # check if the triple is already in RDFs
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Dataset RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")

# --- Load Reverse RDFs ---
def load_reverse():
    conn = sqlite3.connect(DB_NAME)
    cur = conn.cursor()
    # get origin_id for the "Reverse" origin
    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
    origin_id = int(cur.fetchone()[0])
    print(f"Origin_id is: {origin_id}")

    total = 0
    skipped_movie = 0
    # REVERSE_CSV: "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
    with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            subject_uri = row["subject"].strip()
            relationship_uri = row["relationship"].strip()
            movie_uri = row["object"].strip()

            # guard check: if there is no MovieID for the given MovieURI, skip the row
            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
            movie = cur.fetchone()
            if not movie:
                skipped_movie += 1
                continue

            # now put each URI into its own table and retrieve the IDs instead
            subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
            object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)

            # check if the triple is already in RDFs
            cur.execute(
                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                (subject_id, relationship_id, object_id),
            )
            if not cur.fetchone():
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                total += 1
    conn.commit()
    conn.close()
    print(f"Reverse RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")


# --- Execution order ---
load_movies()
load_origins()
load_wikipageids()
load_abstracts()
load_dataset()
load_reverse()
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
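# --- Schema creation (sketch) ---
# The commented sqlite3 shell command above applies db_creation.sql (below) to
# the database. Purely as an illustration, and not part of the original script,
# the same schema file could be applied from Python before the loaders run:
def create_schema(sql_path="./Scripts/DataCleaning/SQL_Queries/db_creation.sql"):
    with open(sql_path, encoding="utf-8") as f:
        ddl = f.read()
    conn = sqlite3.connect(DB_NAME)
    conn.executescript(ddl)  # runs every CREATE TABLE / CREATE INDEX statement
    conn.commit()
    conn.close()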
@@ -1,65 +0,0 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);


CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);


CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);


CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);