NanoSocrates/Scripts/DataCleaning/DataRetrivial.py

import sqlite3
import pandas as pd

# --- Global configuration ---
# All paths are relative, so they resolve against the current working
# directory of the process running this script.
# SQLite database file opened by every loader in this module.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
# Input for load_movies (column "subject" is used as the movie URI).
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
# Input for load_wikipageids (columns "subject" and "object").
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
# Input for load_abstracts (columns "subject" and "text").
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
# Input for load_dataset (columns "subject", "relationship", "object").
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
# Input for load_reverse (columns "subject", "relationship", "object").
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
# Number of CSV rows pandas reads per chunk in every loader.
CHUNK_SIZE = 50000  # adjust based on memory

# --- Load Movies ---
def load_movies():
    """Append every row of MOVIES_CSV to the ``Movies`` table.

    The CSV is streamed in chunks of CHUNK_SIZE rows. In each chunk the
    ``subject`` column is renamed to ``MovieURI``, cast to ``str`` and
    stripped of surrounding whitespace before the chunk is appended.
    Prints the total number of rows written once all chunks are processed.
    """
    conn = sqlite3.connect(DB_NAME)
    total_inserted = 0
    try:
        for chunk in pd.read_csv(MOVIES_CSV, chunksize=CHUNK_SIZE):
            chunk = chunk.rename(columns={"subject": "MovieURI"})
            chunk["MovieURI"] = chunk["MovieURI"].astype(str).str.strip()
            chunk.to_sql("Movies", conn, if_exists="append", index=False)
            total_inserted += len(chunk)
    finally:
        # Release the database handle even when reading or inserting fails.
        conn.close()
    print(f"Movies loaded: {total_inserted} rows inserted.")

# --- Load Origins ---
def load_origins():
    """Append the origin labels ``Dataset`` and ``Reverse`` to ``Origins``.

    The two rows are appended on every call (``if_exists="append"``); no
    check for already-present labels is made here.
    """
    origins = pd.DataFrame({"OriginName": ["Dataset", "Reverse"]})
    conn = sqlite3.connect(DB_NAME)
    try:
        origins.to_sql("Origins", conn, if_exists="append", index=False)
    finally:
        # Release the database handle even when the insert fails.
        conn.close()
    print("Origins loaded.")

# --- Load WikiPageIDs ---
def load_wikipageids():
    """Append (MovieID, PageID) pairs from PAGEID_CSV to ``WikiPageIDs``.

    The CSV's ``subject`` column is treated as the movie URI and its
    ``object`` column as the page ID (cast to ``int``). Each chunk is
    inner-joined with the ``Movies`` table on the stripped URI, so rows
    whose URI is not present in ``Movies`` are dropped.
    Prints the total number of rows written.
    """
    conn = sqlite3.connect(DB_NAME)
    total_inserted = 0
    try:
        movies_df = pd.read_sql_query("SELECT MovieID, MovieURI FROM Movies", conn)
        movies_df["MovieURI"] = movies_df["MovieURI"].astype(str).str.strip()

        for chunk in pd.read_csv(PAGEID_CSV, chunksize=CHUNK_SIZE):
            chunk = chunk.rename(columns={"subject": "MovieURI", "object": "PageID"})
            chunk["MovieURI"] = chunk["MovieURI"].astype(str).str.strip()
            chunk["PageID"] = chunk["PageID"].astype(int)
            merged = chunk.merge(movies_df, on="MovieURI", how="inner")
            if merged.empty:
                continue
            merged[["MovieID", "PageID"]].to_sql(
                "WikiPageIDs", conn, if_exists="append", index=False
            )
            total_inserted += len(merged)
    finally:
        # Release the database handle even when reading or inserting fails.
        conn.close()
    print(f"WikiPageIDs loaded: {total_inserted} rows inserted.")

# --- Load Wikipedia Abstracts ---
def load_abstracts():
    """Append (MovieID, Abstract) pairs from SUMMARY_CSV to ``WikipediaAbstracts``.

    The CSV's ``subject`` column is treated as the page ID (cast to ``int``)
    and its ``text`` column as the abstract. Each chunk is inner-joined with
    the ``WikiPageIDs`` table on ``PageID`` to obtain the ``MovieID``; rows
    with an unknown page ID are dropped.
    Prints either the number of rows written or a notice that none were.
    """
    conn = sqlite3.connect(DB_NAME)
    total_inserted = 0
    try:
        # PageID -> MovieID mapping previously stored in WikiPageIDs.
        pageid_df = pd.read_sql_query("SELECT MovieID, PageID FROM WikiPageIDs", conn)
        pageid_df["PageID"] = pageid_df["PageID"].astype(int)

        for chunk in pd.read_csv(SUMMARY_CSV, chunksize=CHUNK_SIZE):
            chunk = chunk.rename(columns={"subject": "PageID", "text": "Abstract"})
            chunk["PageID"] = chunk["PageID"].astype(int)
            merged = chunk.merge(pageid_df, on="PageID", how="inner")
            if merged.empty:
                continue
            merged[["MovieID", "Abstract"]].to_sql(
                "WikipediaAbstracts", conn, if_exists="append", index=False
            )
            total_inserted += len(merged)
    finally:
        # Release the database handle even when reading or inserting fails.
        conn.close()

    if total_inserted == 0:
        print("No abstracts inserted — table WikipediaAbstracts is empty.")
    else:
        print(f"WikipediaAbstracts loaded: {total_inserted} rows inserted.")

# --- Helper function to get or create an entry and return its ID ---
def get_or_create(cursor, table, column, value, origin_id=None):
    """Return the ID of the row in *table* where *column* equals *value*.

    If no such row exists, one is inserted (together with ``OriginID`` when
    *origin_id* is given) and the new row's ID is returned. Calling this
    again with the same *value* returns the existing ID without inserting.

    Args:
        cursor: an open sqlite3 cursor; the caller is responsible for commit.
        table: table name. The ID column is derived by dropping the last
            character and appending ``ID`` (``"Subjects"`` -> ``SubjectID``).
        column: name of the column holding *value*.
        value: the value to look up or insert.
        origin_id: optional value for the ``OriginID`` column of a new row.
            Ignored when the row already exists.

    Returns:
        The existing row's ID, or ``cursor.lastrowid`` after an insert
        (assumes the ID column is the table's rowid alias — TODO confirm
        against the schema).

    Note:
        *table* and *column* are interpolated into the SQL text, so they must
        be trusted identifiers; only *value* and *origin_id* are bound as
        parameters.
    """
    id_column = f"{table[:-1]}ID"
    cursor.execute(f"SELECT {id_column} FROM {table} WHERE {column}=?", (value,))
    existing = cursor.fetchone()
    if existing is not None:
        return existing[0]

    if origin_id is None:
        cursor.execute(f"INSERT INTO {table} ({column}) VALUES (?)", (value,))
    else:
        # Callers obtain origin_id from pandas, which yields a NumPy integer
        # scalar. sqlite3 only binds the built-in int as INTEGER, so convert
        # any integer-like object first.
        if hasattr(origin_id, "__index__"):
            origin_id = int(origin_id)
        cursor.execute(
            f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)",
            (value, origin_id),
        )
    return cursor.lastrowid

# --- Load Dataset RDFs ---
def load_dataset():
    """Load forward triples (movie, relationship, object) from DATASET_CSV.

    For each CSV row whose ``subject`` URI matches a ``MovieURI`` in the
    ``Movies`` table, the function makes sure the movie exists in
    ``Subjects``, the relationship in ``Relationships`` and the object in
    ``Objects`` (via ``get_or_create``), then inserts the triple into
    ``RDFs`` unless an identical (SubjectID, RelationshipID, ObjectID) row is
    already there. New subjects and objects are tagged with the ``Dataset``
    origin. Work is committed once per CSV chunk.

    Raises:
        IndexError: if ``Origins`` has no row named ``Dataset``.
    """
    conn = sqlite3.connect(DB_NAME)
    total_rdfs = 0
    try:
        cursor = conn.cursor()

        # MovieURI -> MovieID mapping used to resolve each CSV row.
        movies_df = pd.read_sql_query("SELECT MovieID, MovieURI FROM Movies", conn)
        movies_df["MovieURI"] = movies_df["MovieURI"].astype(str).str.strip()

        # pandas returns a NumPy integer scalar here; convert it to a built-in
        # int so sqlite3 binds it as an INTEGER parameter.
        origin_id = int(
            pd.read_sql_query(
                "SELECT OriginID FROM Origins WHERE OriginName='Dataset'", conn
            )["OriginID"].iloc[0]
        )

        for chunk in pd.read_csv(DATASET_CSV, chunksize=CHUNK_SIZE):
            chunk = chunk.rename(
                columns={
                    "subject": "MovieURI",
                    "relationship": "Relationship",
                    "object": "ObjectURI",
                }
            )
            for name in ("MovieURI", "ObjectURI", "Relationship"):
                chunk[name] = chunk[name].astype(str).str.strip()

            # Resulting columns: MovieURI (the subject), Relationship,
            # ObjectURI, MovieID. Rows without a known movie are dropped.
            merged = chunk.merge(movies_df, on="MovieURI", how="inner")
            if merged.empty:
                continue

            # Each row is processed on its own.
            for _, row in merged.iterrows():
                # The movie itself is the subject of a forward triple.
                subject_id = get_or_create(
                    cursor, "Subjects", "SubjectURI", row["MovieURI"], origin_id
                )
                relationship_id = get_or_create(
                    cursor, "Relationships", "RelationshipURI", row["Relationship"]
                )
                object_id = get_or_create(
                    cursor, "Objects", "ObjectURI", row["ObjectURI"], origin_id
                )

                # Only insert the triple if it is not stored yet.
                cursor.execute(
                    "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                    (subject_id, relationship_id, object_id)
                )
                if cursor.fetchone() is None:
                    cursor.execute(
                        "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                        (row["MovieID"], subject_id, relationship_id, object_id)
                    )
                    total_rdfs += 1

            conn.commit()
    finally:
        # Release the database handle even when a chunk fails; changes of the
        # chunk in progress are not committed in that case.
        conn.close()

    print(f"Dataset RDFs loaded: {total_rdfs} rows inserted.")

# --- Load Reverse RDFs ---
def load_reverse():
    """Load reverse triples (subject, relationship, movie) from REVERSE_CSV.

    Here the movie is the *object* of the triple: the CSV's ``object`` column
    is matched against ``MovieURI`` in the ``Movies`` table, and rows without
    a match are dropped. For each remaining row the subject, relationship and
    movie-as-object are looked up or created (via ``get_or_create``), and the
    triple is inserted into ``RDFs`` unless an identical
    (SubjectID, RelationshipID, ObjectID) row already exists. New subjects and
    objects are tagged with the ``Reverse`` origin. Work is committed once per
    CSV chunk.

    Raises:
        IndexError: if ``Origins`` has no row named ``Reverse``.
    """
    conn = sqlite3.connect(DB_NAME)
    total_rdfs = 0
    try:
        cursor = conn.cursor()

        # MovieURI -> MovieID mapping used to resolve each CSV row.
        movies_df = pd.read_sql_query("SELECT MovieID, MovieURI FROM Movies", conn)
        movies_df["MovieURI"] = movies_df["MovieURI"].astype(str).str.strip()

        # pandas returns a NumPy integer scalar here; convert it to a built-in
        # int so sqlite3 binds it as an INTEGER parameter.
        origin_id = int(
            pd.read_sql_query(
                "SELECT OriginID FROM Origins WHERE OriginName='Reverse'", conn
            )["OriginID"].iloc[0]
        )

        for chunk in pd.read_csv(REVERSE_CSV, chunksize=CHUNK_SIZE):
            chunk = chunk.rename(
                columns={
                    "subject": "SubjectURI",
                    "relationship": "Relationship",
                    "object": "MovieURI",
                }
            )
            for name in ("MovieURI", "SubjectURI", "Relationship"):
                chunk[name] = chunk[name].astype(str).str.strip()

            # Attach MovieID; rows whose object is not a known movie are dropped.
            merged = chunk.merge(movies_df, on="MovieURI", how="inner")
            if merged.empty:
                continue

            for _, row in merged.iterrows():
                # The subject comes straight from the reverse CSV.
                subject_id = get_or_create(
                    cursor, "Subjects", "SubjectURI", row["SubjectURI"], origin_id
                )
                relationship_id = get_or_create(
                    cursor, "Relationships", "RelationshipURI", row["Relationship"]
                )
                # The movie itself is the object of a reverse triple.
                object_id = get_or_create(
                    cursor, "Objects", "ObjectURI", row["MovieURI"], origin_id
                )

                # Only insert the triple if it is not stored yet.
                cursor.execute(
                    "SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",
                    (subject_id, relationship_id, object_id)
                )
                if cursor.fetchone() is None:
                    cursor.execute(
                        "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",
                        (row["MovieID"], subject_id, relationship_id, object_id)
                    )
                    total_rdfs += 1

            conn.commit()
    finally:
        # Release the database handle even when a chunk fails; changes of the
        # chunk in progress are not committed in that case.
        conn.close()

    print(f"Reverse RDFs loaded: {total_rdfs} rows inserted.")


# --- Pipeline steps ---
# A step is enabled by uncommenting its call; only load_reverse is active.
# Later steps read tables that earlier ones fill:
#   - load_wikipageids, load_dataset and load_reverse query Movies
#   - load_abstracts queries WikiPageIDs
#   - load_dataset and load_reverse query Origins
# NOTE(review): the active call sits at module level, so it also runs when
# this file is imported, not only when it is executed as a script.
# load_movies()
# load_origins()
# load_wikipageids()
# load_abstracts()
# load_dataset()
load_reverse()
DataRetrivial populate the db from csv 2025-09-20 19:56:24 +02:00			`import sqlite3`
			`import pandas as pd`

			`# --- Global configuration ---`
			`DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"`
			`MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"`
			`PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"`
			`SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"`
			`DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"`
			`REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"`
			`CHUNK_SIZE = 50000 # adjust based on memory`

			`# --- Load Movies ---`
			`def load_movies():`
			`conn = sqlite3.connect(DB_NAME)`
			`total_inserted = 0`
			`for chunk in pd.read_csv(MOVIES_CSV, chunksize=CHUNK_SIZE):`
			`chunk.rename(columns={"subject": "MovieURI"}, inplace=True)`
			`chunk["MovieURI"] = chunk["MovieURI"].astype(str).str.strip()`
			`chunk.to_sql("Movies", conn, if_exists="append", index=False)`
			`total_inserted += len(chunk)`
			`print(f"Movies loaded: {total_inserted} rows inserted.")`
			`conn.close()`

			`# --- Load Origins ---`
			`def load_origins():`
			`conn = sqlite3.connect(DB_NAME)`
			`origins = pd.DataFrame({"OriginName": ["Dataset", "Reverse"]})`
			`origins.to_sql("Origins", conn, if_exists="append", index=False)`
			`conn.close()`
			`print("Origins loaded.")`

			`# --- Load WikiPageIDs ---`
			`def load_wikipageids():`
			`conn = sqlite3.connect(DB_NAME)`
			`movies_df = pd.read_sql_query("SELECT MovieID, MovieURI FROM Movies", conn)`
			`movies_df["MovieURI"] = movies_df["MovieURI"].astype(str).str.strip()`

			`total_inserted = 0`
			`for chunk in pd.read_csv(PAGEID_CSV, chunksize=CHUNK_SIZE):`
			`chunk.rename(columns={"subject": "MovieURI", "object": "PageID"}, inplace=True)`
			`chunk["MovieURI"] = chunk["MovieURI"].astype(str).str.strip()`
			`chunk["PageID"] = chunk["PageID"].astype(int)`
			`merged = chunk.merge(movies_df, on="MovieURI", how="inner")`
			`if not merged.empty:`
			`merged[["MovieID", "PageID"]].to_sql("WikiPageIDs", conn, if_exists="append", index=False)`
			`total_inserted += len(merged)`
			`print(f"WikiPageIDs loaded: {total_inserted} rows inserted.")`
			`conn.close()`

			`# --- Load Wikipedia Abstracts ---`
			`def load_abstracts():`
			`conn = sqlite3.connect(DB_NAME)`
			`# Get MovieID mapping from WikiPageIDs`
			`pageid_df = pd.read_sql_query("SELECT MovieID, PageID FROM WikiPageIDs", conn)`
			`pageid_df["PageID"] = pageid_df["PageID"].astype(int)`

			`total_inserted = 0`
			`for chunk in pd.read_csv(SUMMARY_CSV, chunksize=CHUNK_SIZE):`
			`chunk.rename(columns={"subject": "PageID", "text": "Abstract"}, inplace=True)`
			`chunk["PageID"] = chunk["PageID"].astype(int)`
			`merged = chunk.merge(pageid_df, on="PageID", how="inner")`
			`if not merged.empty:`
			`merged[["MovieID", "Abstract"]].to_sql("WikipediaAbstracts", conn, if_exists="append", index=False)`
			`total_inserted += len(merged)`
			`if total_inserted == 0:`
			`print("No abstracts inserted — table WikipediaAbstracts is empty.")`
			`else:`
			`print(f"WikipediaAbstracts loaded: {total_inserted} rows inserted.")`
			`conn.close()`

			`# --- Load Dataset RDFs ---`
			`# --- Helper function to get or create an entry and return its ID ---`
			`def get_or_create(cursor, table, column, value, origin_id=None):`
			`# is idempotent!`
			`# Check existence only on the value itself (because the column is UNIQUE)`
			`cursor.execute(f"SELECT {table[:-1]}ID FROM {table} WHERE {column}=?", (value,))`
			`result = cursor.fetchone()`
			`if result:`
			`return result[0] # IDEMPOTENT: if the object already exists, there isn't another insert`
			`else:`
			`if origin_id is not None:`
			`cursor.execute(f"INSERT INTO {table} ({column}, OriginID) VALUES (?, ?)", (value, origin_id))`
			`else:`
			`cursor.execute(f"INSERT INTO {table} ({column}) VALUES (?)", (value,))`
			`return cursor.lastrowid`

			`# --- Load Dataset RDFs ---`
			`def load_dataset():`
			`conn = sqlite3.connect(DB_NAME)`
			`cursor = conn.cursor()`

			`# --- Load Movies mapping ---`
			`movies_df = pd.read_sql_query("SELECT MovieID, MovieURI FROM Movies", conn)`
			`movies_df["MovieURI"] = movies_df["MovieURI"].astype(str).str.strip()`

			`# --- Get Dataset OriginID ---`
			`origin_id = pd.read_sql_query("SELECT OriginID FROM Origins WHERE OriginName='Dataset'", conn)["OriginID"].iloc[0]`

			`total_rdfs = 0`

			`for chunk in pd.read_csv(DATASET_CSV, chunksize=CHUNK_SIZE):`
			`chunk.rename(columns={"subject": "MovieURI", "relationship": "Relationship", "object": "ObjectURI"}, inplace=True)`
			`chunk["MovieURI"] = chunk["MovieURI"].astype(str).str.strip()`
			`chunk["ObjectURI"] = chunk["ObjectURI"].astype(str).str.strip()`
			`chunk["Relationship"] = chunk["Relationship"].astype(str).str.strip()`

			`# --- Merge to get MovieID ---`
			`merged = chunk.merge(movies_df, on="MovieURI", how="inner") # movideId / Subject (MovieUri) / Rel / Obj`
			`if merged.empty:`
			`continue`

			`for _, row in merged.iterrows(): # HERE, EACH ROW IS ELABORATED ALONE`
			`# Subjects: the Movie itself as SubjectURI <---- Remember Subject is renamed as MovieURI`
			`subject_id = get_or_create(cursor, "Subjects", "SubjectURI", row["MovieURI"], origin_id)`

			`# Relationships`
			`relationship_id = get_or_create(cursor, "Relationships", "RelationshipURI", row["Relationship"])`

			`# Objects`
			`object_id = get_or_create(cursor, "Objects", "ObjectURI", row["ObjectURI"], origin_id)`

			`# RDFs: only insert if the triplet doesn't exist`
			`cursor.execute(`
			`"SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",`
			`(subject_id, relationship_id, object_id)`
			`)`
			`if not cursor.fetchone():`
			`cursor.execute(`
			`"INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",`
			`(row["MovieID"], subject_id, relationship_id, object_id)`
			`)`
			`total_rdfs += 1`

			`conn.commit()`

			`print(f"Dataset RDFs loaded: {total_rdfs} rows inserted.")`
			`conn.close()`

			`# --- Load Reverse RDFs ---`
			`# --- Load Reverse RDFs ---`
			`def load_reverse():`
			`conn = sqlite3.connect(DB_NAME)`
			`cursor = conn.cursor()`

			`# --- Load Movies mapping ---`
			`movies_df = pd.read_sql_query("SELECT MovieID, MovieURI FROM Movies", conn)`
			`movies_df["MovieURI"] = movies_df["MovieURI"].astype(str).str.strip()`

			`# --- Get Reverse OriginID ---`
			`origin_id = pd.read_sql_query("SELECT OriginID FROM Origins WHERE OriginName='Reverse'", conn)["OriginID"].iloc[0]`

			`total_rdfs = 0`

			`for chunk in pd.read_csv(REVERSE_CSV, chunksize=CHUNK_SIZE):`
			`chunk.rename(columns={"subject": "SubjectURI", "relationship": "Relationship", "object": "MovieURI"}, inplace=True)`
			`chunk["MovieURI"] = chunk["MovieURI"].astype(str).str.strip()`
			`chunk["SubjectURI"] = chunk["SubjectURI"].astype(str).str.strip()`
			`chunk["Relationship"] = chunk["Relationship"].astype(str).str.strip()`

			`# --- Merge to get MovieID ---`
			`merged = chunk.merge(movies_df, on="MovieURI", how="inner")`
			`if merged.empty:`
			`continue`

			`for _, row in merged.iterrows():`
			`# Subjects: from Reverse CSV`
			`subject_id = get_or_create(cursor, "Subjects", "SubjectURI", row["SubjectURI"], origin_id)`

			`# Relationships`
			`relationship_id = get_or_create(cursor, "Relationships", "RelationshipURI", row["Relationship"])`

			`# Objects: the Movie itself as ObjectURI`
			`object_id = get_or_create(cursor, "Objects", "ObjectURI", row["MovieURI"], origin_id)`

			`# RDFs: only insert if the triplet doesn't exist`
			`cursor.execute(`
			`"SELECT RDF_ID FROM RDFs WHERE SubjectID=? AND RelationshipID=? AND ObjectID=?",`
			`(subject_id, relationship_id, object_id)`
			`)`
			`if not cursor.fetchone():`
			`cursor.execute(`
			`"INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?, ?, ?, ?)",`
			`(row["MovieID"], subject_id, relationship_id, object_id)`
			`)`
			`total_rdfs += 1`

			`conn.commit()`

			`print(f"Reverse RDFs loaded: {total_rdfs} rows inserted.")`
			`conn.close()`


			`# --- Append the calls ---`
			`# load_movies()`
			`# load_origins()`
			`# load_wikipageids()`
			`# load_abstracts()`
			`# load_dataset()`
			`load_reverse()`