"""Load the 1-hop movie CSV exports into the SQLite dataset database.

Provenance: this script is the content of Scripts/DatasetMerging/DataRetrivial.py
as removed by commit 51114af ("DataRetrivial deleted since it does the same
thing as datawarehouse.py", GassiGiuseppe, 2025-09-22).

The script only INSERTs and SELECTs, so the tables must already exist.
The original file ended with this command as a hint for creating them:

    sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

Loaders must run in the order used by ``main()``: later loaders look up IDs
written by earlier ones (Movies -> WikiPageIDs -> WikipediaAbstracts, and
Movies + Origins -> RDFs).
"""

import csv
import sqlite3
from contextlib import closing

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/Tmp/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"


# --- Helper: idempotent insert-or-select ---
def get_or_create(cursor, table, column, value, origin_id=None):
    """Insert ``value`` into ``table.column`` if absent and return the row's ID.

    Args:
        cursor: an open ``sqlite3`` cursor.
        table: table name; must be a plural ending in "s" (see below).
        column: name of the column holding ``value``.
        value: the value to insert or look up.
        origin_id: when given, also stored in the row's ``OriginID`` column.
            The loaders pass it for Subjects and Objects, not Relationships.

    Returns:
        The ID of the new or already-existing row.

    Raises:
        TypeError: if no row matches after the insert attempt (``fetchone()``
            returned ``None``).

    ``table`` and ``column`` are interpolated into the SQL text, so they must
    be trusted identifiers and never user input. Only ``value`` and
    ``origin_id`` are bound as parameters.
    """
    # NOTE(review): the original author states that try/except on a plain
    # INSERT (instead of INSERT OR IGNORE) keeps IDs contiguous. Not verified.
    try:
        if origin_id is not None:
            cursor.execute(
                f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
                (value, int(origin_id)),
            )
        else:
            cursor.execute(
                f"INSERT INTO {table} ({column}) VALUES (?)",
                (value,),
            )
    except sqlite3.IntegrityError:
        # Row already exists: fall through and fetch its ID.
        pass

    # The ID column is derived by dropping the trailing "s" of the table name:
    #   Subjects -> SubjectID, Objects -> ObjectID,
    #   Relationships -> RelationshipID
    id_column = f"{table[:-1]}ID"
    cursor.execute(f"SELECT {id_column} FROM {table} WHERE {column}=?", (value,))
    return cursor.fetchone()[0]  # fetchone() returns a one-element tuple


# --- Load Movies ---
def load_movies(db_name=None, csv_path=None):
    """Insert every new movie URI from the movies CSV into ``Movies``.

    The CSV has a single "subject" column holding the movie URI.

    Args:
        db_name: SQLite database path; defaults to ``DB_NAME``.
        csv_path: CSV path; defaults to ``MOVIES_CSV``.

    Returns:
        Number of rows actually inserted (existing URIs are skipped).
    """
    db_name = DB_NAME if db_name is None else db_name
    csv_path = MOVIES_CSV if csv_path is None else csv_path
    total = 0
    # closing() releases the connection even if the CSV is missing or a row
    # fails. The commit only happens when the whole file was processed.
    with closing(sqlite3.connect(db_name)) as conn:
        cur = conn.cursor()
        with open(csv_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                movie_uri = row["subject"].strip()
                try:
                    cur.execute(
                        "INSERT INTO Movies (MovieURI) VALUES (?)", (movie_uri,)
                    )
                except sqlite3.IntegrityError:
                    continue  # already present
                total += 1  # count only rows that were really added
        conn.commit()
    print(f"Movies loaded: {total}")
    return total


# --- Load Origins ---
def load_origins(db_name=None):
    """Ensure the two origin names, "Dataset" and "Reverse", exist in ``Origins``.

    Args:
        db_name: SQLite database path; defaults to ``DB_NAME``.
    """
    db_name = DB_NAME if db_name is None else db_name
    with closing(sqlite3.connect(db_name)) as conn:
        cur = conn.cursor()
        for origin in ("Dataset", "Reverse"):
            try:
                cur.execute("INSERT INTO Origins (OriginName) VALUES (?)", (origin,))
            except sqlite3.IntegrityError:
                pass  # already present
        conn.commit()
    print("Origins loaded.")


# --- Load WikiPageIDs ---
def load_wikipageids(db_name=None, csv_path=None):
    """Map known movies to their Wikipedia page IDs in ``WikiPageIDs``.

    The CSV columns are "subject" (movie URI) and "object" (integer page ID).
    Rows whose movie URI is not in ``Movies`` are ignored.

    Args:
        db_name: SQLite database path; defaults to ``DB_NAME``.
        csv_path: CSV path; defaults to ``PAGEID_CSV``.

    Returns:
        Number of rows inserted.

    Raises:
        ValueError: if an "object" cell is not an integer.
    """
    db_name = DB_NAME if db_name is None else db_name
    csv_path = PAGEID_CSV if csv_path is None else csv_path
    total = 0
    with closing(sqlite3.connect(db_name)) as conn:
        cur = conn.cursor()
        with open(csv_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                movie_uri = row["subject"].strip()
                page_id = int(row["object"])
                cur.execute(
                    "SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,)
                )
                movie = cur.fetchone()
                if not movie:
                    continue
                try:
                    cur.execute(
                        "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?, ?)",
                        (movie[0], page_id),
                    )
                except sqlite3.IntegrityError:
                    continue  # this movie already has a page ID
                total += 1
        conn.commit()
    print(f"WikiPageIDs loaded: {total}")
    return total


# --- Load Wikipedia Abstracts ---
def load_abstracts(db_name=None, csv_path=None):
    """Store Wikipedia abstracts for movies with a known page ID.

    The CSV columns are "subject" (integer Wikipedia page ID) and "text"
    (the abstract). Rows whose page ID is not in ``WikiPageIDs`` are ignored.

    Args:
        db_name: SQLite database path; defaults to ``DB_NAME``.
        csv_path: CSV path; defaults to ``SUMMARY_CSV``.

    Returns:
        Number of rows inserted.

    Raises:
        ValueError: if a "subject" cell is not an integer.
    """
    db_name = DB_NAME if db_name is None else db_name
    csv_path = SUMMARY_CSV if csv_path is None else csv_path
    total = 0
    with closing(sqlite3.connect(db_name)) as conn:
        cur = conn.cursor()
        with open(csv_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                page_id = int(row["subject"])
                abstract = row["text"].strip()
                cur.execute(
                    "SELECT MovieID FROM WikiPageIDs WHERE PageID=?", (page_id,)
                )
                movie = cur.fetchone()
                if not movie:
                    continue
                try:
                    cur.execute(
                        "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?, ?)",
                        (movie[0], abstract),
                    )
                except sqlite3.IntegrityError as e:
                    # Duplicates are reported but do not abort the load.
                    print(e)
                    continue
                total += 1
        conn.commit()
    print(f"WikipediaAbstracts loaded: {total}")
    return total


def _load_triples(db_name, csv_path, origin_name, movie_column):
    """Load (subject, relationship, object) rows into ``RDFs``.

    Shared implementation of ``load_dataset`` and ``load_reverse``; the two
    differ only in the origin name and in which CSV column holds the movie URI.

    Args:
        db_name: SQLite database path.
        csv_path: CSV with "subject", "relationship" and "object" columns.
        origin_name: ``Origins.OriginName`` to tag new Subjects/Objects with.
        movie_column: "subject" or "object", the column holding the movie URI.

    Returns:
        ``(inserted, skipped)``: triples inserted, and rows skipped because
        their movie URI is not in ``Movies``.

    Raises:
        LookupError: if ``origin_name`` is not in ``Origins``.
    """
    inserted = 0
    skipped = 0
    with closing(sqlite3.connect(db_name)) as conn:
        cur = conn.cursor()
        cur.execute(
            "SELECT OriginID FROM Origins WHERE OriginName=?", (origin_name,)
        )
        origin_row = cur.fetchone()
        if origin_row is None:
            raise LookupError(
                f"Origin {origin_name!r} not found in Origins; "
                "run load_origins() first"
            )
        origin_id = int(origin_row[0])
        print(f"Origin_id is: {origin_id}")

        with open(csv_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                subject_uri = row["subject"].strip()
                relationship_uri = row["relationship"].strip()
                object_uri = row["object"].strip()
                movie_uri = subject_uri if movie_column == "subject" else object_uri

                # Guard: only triples attached to a known movie are kept.
                cur.execute(
                    "SELECT MovieID FROM Movies WHERE MovieURI=?", (movie_uri,)
                )
                movie = cur.fetchone()
                if not movie:
                    skipped += 1
                    continue

                # Replace each URI with its ID, creating rows on first sight.
                subject_id = get_or_create(
                    cur, "Subjects", "SubjectURI", subject_uri, origin_id
                )
                relationship_id = get_or_create(
                    cur, "Relationships", "RelationshipURI", relationship_uri
                )
                object_id = get_or_create(
                    cur, "Objects", "ObjectURI", object_uri, origin_id
                )

                # Insert the triple only if it is not already stored.
                cur.execute(
                    "SELECT RDF_ID FROM RDFs "
                    "WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                    (subject_id, relationship_id, object_id),
                )
                if cur.fetchone():
                    continue
                cur.execute(
                    "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) "
                    "VALUES (?, ?, ?, ?)",
                    (movie[0], subject_id, relationship_id, object_id),
                )
                inserted += 1
        conn.commit()
    return inserted, skipped


# --- Load Dataset RDFs ---
def load_dataset(db_name=None, csv_path=None):
    """Load forward triples (movie URI in the "subject" column) into ``RDFs``.

    Args:
        db_name: SQLite database path; defaults to ``DB_NAME``.
        csv_path: CSV path; defaults to ``DATASET_CSV``.

    Returns:
        ``(inserted, skipped)`` as described in ``_load_triples``.
    """
    total, skipped_movie = _load_triples(
        DB_NAME if db_name is None else db_name,
        DATASET_CSV if csv_path is None else csv_path,
        "Dataset",
        "subject",
    )
    print(f"Dataset RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")
    return total, skipped_movie


# --- Load Reverse RDFs ---
def load_reverse(db_name=None, csv_path=None):
    """Load reverse triples (movie URI in the "object" column) into ``RDFs``.

    Args:
        db_name: SQLite database path; defaults to ``DB_NAME``.
        csv_path: CSV path; defaults to ``REVERSE_CSV``.

    Returns:
        ``(inserted, skipped)`` as described in ``_load_triples``.
    """
    total, skipped_movie = _load_triples(
        DB_NAME if db_name is None else db_name,
        REVERSE_CSV if csv_path is None else csv_path,
        "Reverse",
        "object",
    )
    print(f"Reverse RDFs loaded: {total}")
    print(f"Skipped Movies: {skipped_movie}")
    return total, skipped_movie


# --- Execution order ---
def main():
    """Run every loader in dependency order against the configured paths."""
    load_movies()
    load_origins()
    load_wikipageids()
    load_abstracts()
    load_dataset()
    load_reverse()


if __name__ == "__main__":
    main()