DataRetrivial deleted since it does the same thing as datawarehouse.py

2025-09-22 17:51:35 +02:00
parent 3a6dca0681
commit 51114af853
1 changed files with 0 additions and 249 deletions
--- a/Scripts/DatasetMerging/DataRetrivial.py
+++ b/Scripts/DatasetMerging/DataRetrivial.py
@@ -1,249 +0,0 @@
-import sqlite3
-import csv
-
-# --- Global configuration ---
-DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
-MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
-PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
-SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
-DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
-REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
-
-
-# --- Helper: idempotent insert-or-select ---
-def get_or_create(cursor, table, column, value, origin_id=None):
-    # tries to put new values in db, then get the id (regardless of the check)
-    # Subjects and Objects need origin_id. Relationships do not
-    # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
-
-    try:
-        if origin_id is not None:
-            cursor.execute(
-                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
-                (value, int(origin_id)),
-            )
-        else:
-            cursor.execute(
-                f"INSERT INTO {table} ({column}) VALUES (?)",
-                (value,),
-            )
-    except sqlite3.IntegrityError:
-        # Row already exists, do nothing
-        pass
-
-    # Always fetch the ID (whether new or existing)
-    # {table[:-1]}ID ->
-    # Subjects -> SubjectID
-    # Objects -> ObjectID
-    # Relationships -> RelationshipID
-    # kinda hardcoded
-    cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))
-    return cursor.fetchone()[0]  # fetchone returns a list with one element
-
-
-
-# --- Load Movies ---
-def load_movies():
-    # Creates Movies: MovieID [PK] / Movie URI
-    # MovieID is managed by sql
-    conn = sqlite3.connect(DB_NAME)
-    cur = conn.cursor()
-    total = 0
-    # MOVIES_CSV: "subject" [it has only this column]
-    with open(MOVIES_CSV, newline="", encoding="utf-8") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            movie_uri = row["subject"].strip()
-            # try/except on INSERT keeps IDs contiguous (no AUTOINCREMENT jumps)
-            try:
-                cur.execute("INSERT INTO Movies (MovieURI) VALUES (?)", (movie_uri,))
-                total += 1  # count only if a new row was added
-            except sqlite3.IntegrityError:
-                # already exists, skip
-                pass
-    conn.commit() # suggested by dr
-    conn.close()
-    print(f"Movies loaded: {total}")
-
-
-# --- Load Origins ---
-def load_origins():
-    # Creates Origins: OriginID [PK]/ Origin Name
-    # ["Dataset", "Reverse"]
-    conn = sqlite3.connect(DB_NAME)
-    cur = conn.cursor()
-    for origin in ["Dataset", "Reverse"]:
-        try:
-            cur.execute("INSERT INTO Origins (OriginName) VALUES (?)", (origin,))
-        except sqlite3.IntegrityError:
-            pass
-    conn.commit()
-    conn.close()
-    print("Origins loaded.")
-
-
-# --- Load WikiPageIDs ---
-def load_wikipageids():
-    # Creates WikiPageIDs: MovieID [PK, FK]/ PageId [IDX] (wiki)
-    conn = sqlite3.connect(DB_NAME)
-    cur = conn.cursor()
-    total = 0
-    # PAGEID_CSV: "subject","object" -> MovieURI, WikiPageId
-    with open(PAGEID_CSV, newline="", encoding="utf-8") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            movie_uri = row["subject"].strip()
-            page_id = int(row["object"])
-            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
-            movie = cur.fetchone()
-            if movie:
-                try:
-                    # it can become INSERT OR IGNORE instead of try catch
-                    cur.execute(
-                        "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
-                        (movie[0], page_id),
-                    )
-                    total += 1
-                except sqlite3.IntegrityError:
-                    pass
-    conn.commit()
-    conn.close()
-    print(f"WikiPageIDs loaded: {total}")
-
-
-# --- Load Wikipedia Abstracts ---
-def load_abstracts():
-    # Cretes WikipediaAbstracts: MovieID [PK, FK]/ abstract
-    conn = sqlite3.connect(DB_NAME)
-    cur = conn.cursor()
-    total = 0
-    # SUMMARY_CSV: subject,text -> WikiPageID / abstract 
-    with open(SUMMARY_CSV, newline="", encoding="utf-8") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            page_id = int(row["subject"])
-            abstract = row["text"].strip()
-            # WikiPageIDs: MovieID [PK, FK]/ PageId [IDX] (wiki)
-            cur.execute("SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page_id,))
-            movie = cur.fetchone() # which is MovieID
-            if movie:
-                try:
-                    # it can become INSERT OR IGNORE instead of try catch
-                    cur.execute(
-                        "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
-                        (movie[0], abstract),
-                    )
-                    total += 1
-                except sqlite3.IntegrityError as e:
-                    print(e)
-                    pass
-    conn.commit()
-    conn.close()
-    print(f"WikipediaAbstracts loaded: {total}")
-
-
-# --- Load Dataset RDFs ---
-def load_dataset():
-    conn = sqlite3.connect(DB_NAME)
-    cur = conn.cursor()
-    # get oridin_id from datset
-    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Dataset'")
-    origin_id = int(cur.fetchone()[0])
-    print(f"Origin_id is: {origin_id}")
-    ####
-    total = 0
-    skipped_movie = 0
-    # DATASET_CSV: "subject","relationshi","object" -> MovieUri, RelationshipUri, ObjectUri
-    with open(DATASET_CSV, newline="", encoding="utf-8") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            movie_uri = row["subject"].strip()
-            relationship_uri = row["relationship"].strip()
-            object_uri = row["object"].strip()
-
-            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
-            movie = cur.fetchone()
-            if not movie:
-                skipped_movie += skipped_movie
-                continue
-            # it is a guard check if it doest exist a MovieID from the given MovieURI, then skip
-
-            # now put each URI into their SCHEMA and retrieves IDs instead
-            subject_id = get_or_create(cur, "Subjects", "SubjectURI", movie_uri, origin_id)
-            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
-            object_id = get_or_create(cur, "Objects", "ObjectURI", object_uri, origin_id)
-
-            # check if the triple is already in the RDF
-            cur.execute(
-                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
-                (subject_id, relationship_id, object_id),
-            )
-            if not cur.fetchone():
-                cur.execute(
-                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
-                    (movie[0], subject_id, relationship_id, object_id),
-                )
-                total += 1
-    conn.commit()
-    conn.close()
-    print(f"Dataset RDFs loaded: {total}")
-    print(f"Skipped Movies: {skipped_movie}")
-
-
-# --- Load Reverse RDFs ---
-def load_reverse():
-    conn = sqlite3.connect(DB_NAME)
-    cur = conn.cursor()
-    # get oridin_id from datset
-    cur.execute("SELECT OriginID FROM Origins WHERE OriginName='Reverse'")
-    origin_id = int(cur.fetchone()[0])
-    print(f"Origin_id is: {origin_id}")
-    ###
-    total = 0
-    skipped_movie = 0
-    # REVERSE_CSV : "subject","relationship","object" -> SubjectURI, RelationshipURI, MovieURI
-    with open(REVERSE_CSV, newline="", encoding="utf-8") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            subject_uri = row["subject"].strip()
-            relationship_uri = row["relationship"].strip()
-            movie_uri = row["object"].strip()
-
-            cur.execute("SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,))
-            movie = cur.fetchone()
-            if not movie:
-                skipped_movie += skipped_movie
-                continue
-            # it is a guard check if it doest exist a MovieID from the given MovieURI, then skip
-
-            # now put each URI into their SCHEMA and retrieves IDs instead
-            subject_id = get_or_create(cur, "Subjects", "SubjectURI", subject_uri, origin_id)
-            relationship_id = get_or_create(cur, "Relationships", "RelationshipURI", relationship_uri)
-            object_id = get_or_create(cur, "Objects", "ObjectURI", movie_uri, origin_id)
-
-            # check if the triple is already in the RDF
-            cur.execute(
-                "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
-                (subject_id, relationship_id, object_id),
-            )
-            if not cur.fetchone():
-                cur.execute(
-                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
-                    (movie[0], subject_id, relationship_id, object_id),
-                )
-                total += 1
-    conn.commit()
-    conn.close()
-    print(f"Reverse RDFs loaded: {total}")
-    print(f"Skipped Movies: {skipped_movie}")
-
-
-
-# --- Execution order ---
-load_movies()
-load_origins()
-load_wikipageids()
-load_abstracts()
-load_dataset()
-load_reverse()
-# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql