Compare commits

No commits in common. "dev.splitter" and "main" have entirely different histories.

dev.splitter ... main
.gitattributes (vendored): 1 changed line

@@ -1,3 +1,2 @@
 Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
 Assets/** filter=lfs diff=lfs merge=lfs -text
-Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text
.gitignore (vendored): 6 changed lines

@@ -189,8 +189,7 @@ ipython_config.py
 .LSOverride

 # Icon must end with two \r
 Icon


 # Thumbnails
 ._*

@@ -252,6 +251,3 @@ $RECYCLE.BIN/
 # .nfs files are created when an open file is removed but is still being accessed
 .nfs*

-# ---> Custom
-**/Tmp/**
-!**/.gitkeep
.vscode/extensions.json (vendored): 14 lines removed

@@ -1,14 +0,0 @@
{
    "recommendations": [
        "bierner.github-markdown-preview",
        "bierner.markdown-checkbox",
        "bierner.markdown-emoji",
        "bierner.markdown-footnotes",
        "bierner.markdown-mermaid",
        "bierner.markdown-preview-github-styles",
        "bierner.markdown-yaml-preamble",
        "davidanson.vscode-markdownlint",
        "kejun.markdown-alert",
        "yzhang.markdown-all-in-one"
    ]
}
BIN  Assets/Dataset/1-hop/dataset.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/movie-pageid.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/movies.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/reverse.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/wikipedia-movie.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/wikipedia-summary.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/DatawareHouse/dataset.db (stored with Git LFS): binary file not shown
README.md: 27 changed lines

@@ -1,28 +1,3 @@
 # NanoSocrates

 This is the work project for the Deep Learning exam of 16th September 2025
-
-## Index
-
-- [Resources](./docs/RESOURCES.md)
-
-## Setup
-
-Create and activate your Conda environment with:
-
-    conda env create -f environment.yaml
-    conda activate deep_learning
-
-Now install the dependencies with pip:
-
-    pip install -r requirements.txt
-
-## Troubleshooting
-
-Sometimes, when uploading a really large batch of data, Git LFS can abort the upload because of its timeout.
-The solution is to raise the timeouts in the local Git configuration:
-
-    git config lfs.dialtimeout 3600
-    git config lfs.activitytimeout 3600
-
-For more details see: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory
@@ -1,139 +0,0 @@
import argparse
import csv
import sys
from typing import Self


class ProgramArgs:

    def __init__(self, file: str, output: str, treshold: int):
        self.file = file
        self.output = output
        self.treshold = treshold


class Node:

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        return len(self.children) == 0

    def append_child(self, child: list[str]):

        # print(child)
        KEY = child[0]

        if not self.children.get(KEY):
            self.children[KEY] = Node(KEY, 0)

        CHILD = self.children[KEY]
        self.quantity += 1

        if len(child) == 1:
            return

        new_children = child[1:]

        CHILD.append_child(new_children)

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--treshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)

    # print(parsed_args.input_file)

    return ProgramArgs(parsed_args.input_file, parsed_args.output_file, parsed_args.treshold)  # type ignore


def get_debug_args() -> ProgramArgs:

    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
        TRESHOLD
    )


def tree_like(file: str, out: str):

    INDENTATION = " "

    properties: dict[str, Node] = {}

    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)

    FILE = open(file, "r", encoding="utf-8")

    for row in FILE:

        sections = row.split("/")
        sections = list(filter(lambda item: item != "", sections))

        # print(sections)

        if sections[0] != "http:" and sections[0] != "https:":
            properties["pure"].append_child(sections)
            continue

        properties["URI"].append_child(sections)

    FILE.close()

    stack: list[tuple[Node, int]] = []

    for _, item in properties.items():
        stack.append((item, 0))

    OUT = open(out, mode="w", encoding="utf-8")

    while len(stack) > 0:

        LAST_ITEM = stack.pop()

        NODE: Node = LAST_ITEM[0]
        DEPTH: int = LAST_ITEM[1]

        INDENT: str = INDENTATION * DEPTH

        if NODE.quantity < ARGS.treshold:
            continue

        OUT.write(f"{INDENT}- {NODE}\n")

        if NODE.is_leaf:
            continue

        CHILDREN = []

        for _, child in NODE.children.items():
            CHILDREN.append((child, DEPTH + 1))

        stack.extend(CHILDREN)

    OUT.close()


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.output)
@@ -1,53 +0,0 @@
import argparse
import sys
import pandas as pd


class ProgramArgs:

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        self.input_file = input_file
        self.column = column
        self.output_file = output_file
        self.count = count


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--column", "--col", required=True, type=str)
    PARSER.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.column,
        parsed_args.output_file,
        parsed_args.count,
    )  # type ignore


if __name__ == "__main__":
    ARGS = get_args(sys.argv)

    OUTPUT_FILE = open(ARGS.output_file, "w+", encoding="utf-8")

    # Load the CSV
    df = pd.read_csv(ARGS.input_file)

    # Count occurrences of each unique last part
    item_counts = df[ARGS.column].value_counts()

    # Print the counts
    for item, count in item_counts.items():

        if ARGS.count:
            OUTPUT_FILE.write(f"{item}: {count}\n")
        else:
            OUTPUT_FILE.write(f"{item}\n")
@@ -1,146 +0,0 @@
import argparse
from math import floor
import sys
from time import sleep
import SPARQLWrapper


class ProgramData:

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:

        self.local_url = local_url
        self.query_url = query_url
        self.sparql_url = sparql_url
        self.output_type = output_type
        self.initial_offset = initial_offset
        self.timeout = timeout
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        return self.limit

    @property
    def query(self):

        with open(self.query_url, "r") as file:
            return file.read()


DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)


def gather_cli_args(args: list[str]) -> ProgramData:

    # TODO: Add argument for type
    PARSER = argparse.ArgumentParser("sparql data fetcher")
    PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
    PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
    PARSER.add_argument("--limit", type=int, default=LIMIT)
    PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
    PARSER.add_argument("--verbose", "-v", action="count", default=0)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        SPARQLWrapper.CSV,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose
    )
    # type: ignore


def fetch_data(DATA: ProgramData):

    # Take correction of page into account
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    exit = False

    while not exit:

        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)

        sparql.setReturnFormat(TYPE)

        CURRENT_PAGE_QUERY = "\n".join([
            DATA.query,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

            if type(res) == bytes:

                initial_offset = 0

                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:

                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(
                    text
                )

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")

        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


if __name__ == "__main__":
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)
@@ -1,154 +0,0 @@
from pathlib import Path
import pandas as pd

import csv
import time
import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"


sess = requests.Session()

CHUNK = 20


# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):

    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0"
        ""
        " (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")

    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts


def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")


def reconcile() -> int:

    start = time.time()
    input_file = open(input_csv, "r", newline="", encoding="utf-8")
    output_file = open(output_csv, "r", newline="", encoding="utf-8")

    next(input_file)
    LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
    current_check = input_file.readline().split(",")[1]

    index = 1

    while current_check != LAST_CHECKED:
        current_check = input_file.readline().split(",")[1].replace("\n", "")
        index += 1

    input_file.close()
    output_file.close()
    end = time.time()

    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index


if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


SKIP = reconcile()


# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        if index == 0:
            continue

        # Save movies in map
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush movies
            flush(movie_ids)
            movie_ids = []
@@ -1,65 +0,0 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
@@ -1,35 +0,0 @@
-- Insert MovieURI into Movies ; MovieID is auto incremental
INSERT INTO Movies (MovieURI) VALUES (?);

-- Get MovieID where MovieURI equals the given value
SELECT MovieID FROM Movies WHERE MovieURI = ?;

-- SetPageId
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);

-- Get MovieId by PageID ... ( to create WikipediaAbstract)
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;

-- SetAbstract ...
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);

-- SetOrigin
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");

-- GetOrigin
SELECT OriginID FROM Origins WHERE OriginName = ?;

-- Subject, Relationship, Object, RDF
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
INSERT INTO Relationships (RelationshipURI) VALUES (?);
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);

SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;

INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
@@ -1,26 +0,0 @@
# HOW THE DATASET IS BUILT AND POPULATED

Note: the data are taken from the CSV files in 1-hop

## CSV files composition

| CSV files         | Original structure                      | Saved AS                             |
|-------------------|-----------------------------------------|--------------------------------------|
| Wikipedia-summary | PageId / abstract                       | subject, text                        |
| Movies            | Movie URI                               | "subject"                            |
| Dataset           | Movie URI / Relationship / Object [RDF] | subject, relationship, object        |
| Movies-PageId     | Movie URI / PageId (wiki)               | "subject", "object"                  |
| Reverse           | Subject / Relationship / Movie URI      | "subject", "relationship", "object"  |

## Wanted tables schema

| Table         | Columns                                                                        |
|---------------|--------------------------------------------------------------------------------|
| Movies        | MovieID [PK], Movie URI                                                        |
| WikiPageIDs   | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)*                |
| Abstracts     | MovieID [PK, FK], abstract                                                     |
| Subjects      | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK]   |
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation)    |
| Objects       | ObjectID [PK], RDF Object, OriginID [FK]                                       |
| Origins       | OriginID [PK], Origin Name                                                     |
| RDFs          | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK]  |
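To make the star schema above concrete, here is a small illustrative sketch (not a file from the repository) of how the populated SQLite warehouse could be queried to rebuild full URI triples for one movie. The joins follow the tables listed above; the example movie URI is made up.

```python
import sqlite3

DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"

# Rebuild (subject URI, relationship URI, object URI) triples for one movie
# by resolving every ID column in RDFs back to its URI table.
TRIPLES_FOR_MOVIE = """
SELECT s.SubjectURI, r.RelationshipURI, o.ObjectURI
FROM RDFs AS rdf
JOIN Movies        AS m ON m.MovieID        = rdf.MovieID
JOIN Subjects      AS s ON s.SubjectID      = rdf.SubjectID
JOIN Relationships AS r ON r.RelationshipID = rdf.RelationshipID
JOIN Objects       AS o ON o.ObjectID       = rdf.ObjectID
WHERE m.MovieURI = ?;
"""

conn = sqlite3.connect(DB_NAME)
curs = conn.cursor()

# Example URI only; any MovieURI stored in the Movies table works here.
for subject, relationship, obj in curs.execute(
    TRIPLES_FOR_MOVIE, ["http://dbpedia.org/resource/Example_Film"]
):
    print(subject, relationship, obj)

conn.close()
```

This is exactly the normalisation that the population script below performs in reverse: URIs go in once, and the RDFs table only stores the integer IDs.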
@@ -1,375 +0,0 @@
import sqlite3
import csv

#####################################################################
# This file builds DatawareHouse/dataset.db from the 1-hop CSV files
# Its schema is in ./SQL_Queries/db_creation.sql
# The SQL queries used to populate it are in ./SQL_Queries/query.sql
#####################################################################

# Sometimes you may need to build a new db file; here is a little snippet for you:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"

MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline='', encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline='', encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline='', encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline='', encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline='', encoding="utf-8")

CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()

# MARK: SQL Definitions
# Insert MovieURI

def insertOrigin(curs: sqlite3.Cursor) -> bool:
    QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(QUERY)
        return True
    except sqlite3.IntegrityError:
        return False

def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
    curs.execute(QUERY, [originName])
    originId = curs.fetchone()
    if not originId:
        return None
    # in this case the real id is the first element of the tuple
    return originId[0]

def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(QUERY, [movieUri])
        return True
    except sqlite3.IntegrityError:
        return False

def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
    curs.execute(QUERY, [movieUri])
    movieId = curs.fetchone()
    if not movieId:
        return None
    # in this case the real id is the first element of the tuple
    return movieId[0]

def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, pageId])
        return True
    except sqlite3.IntegrityError:
        return False

def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
    curs.execute(QUERY, [pageId])
    movieId = curs.fetchone()
    if not movieId:
        return None
    # in this case the real id is the first element of the tuple
    return movieId[0]

def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, abstract])
        return True
    except sqlite3.IntegrityError:
        return False

def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [subjectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False

def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(QUERY, [relationshipURI])
        return True
    except sqlite3.IntegrityError:
        return False

def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False

def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
    curs.execute(QUERY, [subjectURI])
    subjectId = curs.fetchone()
    if not subjectId:
        return None
    # in this case the real id is the first element of the tuple
    return subjectId[0]

def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
    curs.execute(QUERY, [relationshipURI])
    relationshipId = curs.fetchone()
    if not relationshipId:
        return None
    # in this case the real id is the first element of the tuple
    return relationshipId[0]

def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
    curs.execute(QUERY, [objectURI])
    objectId = curs.fetchone()
    if not objectId:
        return None
    # in this case the real id is the first element of the tuple
    return objectId[0]

def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int
) -> bool:
    QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
        curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
        return True
    except sqlite3.IntegrityError:
        return False

# MARK: Parsing
def parseMovies():
    CSV_READER = csv.reader(MOVIES_CSV_HANDLER)
    next(CSV_READER)
    for row in CSV_READER:
        MOVIE = row[0]
        insertMovie(CURS, MOVIE)

def parseWikiPageId():
    CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER)
    for row in CSV_READER:
        MOVIE_URI = row["subject"]
        WIKI_PAGE_ID = int(row["object"])
        MOVIE_ID = selectMovieId(CURS, MOVIE_URI)

        if MOVIE_ID is None:
            print(f"The MovieUri: {MOVIE_URI} has not a MovieId ")
            continue

        insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID)

def parseAbstract():
    CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
    for row in CSV_READER:
        WIKI_PAGE_ID = int(row["subject"])
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)

        if MOVIE_ID is None:
            print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
            continue

        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)

def parseRDF_Reverse():
    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
    total = 0

    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, OBJECT)

        skip = False

        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if MOVIE_ID is None:
            print(f"No MovieId for {OBJECT}")
            skip = True

        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1

    print(total)

def parseRDF_Dataset():
    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')

    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]

        rdf_idx += 1

        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

        insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, SUBJECT)

        skip = False

        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if MOVIE_ID is None:
            print(f"No MovieId for {SUBJECT}")
            skip = True

        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1

    print(total)

# MARK: Actual Code
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseRDF_Reverse()
# parseRDF_Dataset()

CONN.commit()
CONN.close()

MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()

"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""

"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""
docs/DBPEDIA.md: 215 lines removed

@@ -1,215 +0,0 @@
# DBPedia

## GraphIRI

This is the graph identifier (URI):

`http://dbpedia.org`

## History of queries

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    {
        SELECT ?object
        WHERE {
            ?m rdf:type dbo:Film .
            ?object ?r ?m
        }
    }
}
```

### 2 Hops

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
    {
        SELECT ?object
        WHERE {
            ?m rdf:type dbo:Film .
            ?object ?r ?m
            FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
        }
    }
}
LIMIT 1000000
```

### 1 Hop

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
LIMIT 1000000
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject
WHERE {
    ?subject rdf:type dbo:Film .
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject
WHERE {
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

#### Wikipedia-movie

a.k.a. the file with the Wikipedia abstract

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject , ?object
WHERE {
    ?subject foaf:primaryTopic ?object .
    ?object rdf:type dbo:Film
}
```

#### Reverse

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    ?a foaf:primaryTopic ?object
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    ?a foaf:primaryTopic ?object
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

#### Film \ wiki page ID

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?subject ?pageID
WHERE {
    ?subject rdf:type dbo:Film .
    ?subject dbo:wikiPageID ?pageID .
    ?subject rdfs:label ?label .
    FILTER (lang(?label) = "en")
}
```
@@ -1,3 +0,0 @@
# Development

## Data Gathering
@@ -1,108 +0,0 @@
# Resources

## Byte-Pair Encoding (BPE)

### Overview

Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.

---

### Key Idea

BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.

---

### Algorithm Steps

1. **Initialization**
   - Treat each character of the input text as a token.

2. **Find Frequent Pairs**
   - Count all adjacent token pairs in the sequence.

3. **Merge Most Frequent Pair**
   - Replace the most frequent pair with a new symbol not used in the text.

4. **Repeat**
   - Continue until no frequent pairs remain or a desired vocabulary size is reached.

---
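The four steps above are short enough to sketch directly. The following is a minimal, self-contained illustration of the merge loop; it is not the tokenizer used in this project, and the function names are invented for the example.

```python
from collections import Counter

def most_frequent_pair(tokens: list[str]) -> tuple[str, str] | None:
    # Step 2: count all adjacent token pairs.
    pairs = Counter(zip(tokens, tokens[1:]))
    if not pairs:
        return None
    pair, count = pairs.most_common(1)[0]
    return pair if count > 1 else None

def bpe_merge(text: str, max_merges: int = 10) -> tuple[list[str], list[tuple[str, str]]]:
    # Step 1: start from single characters.
    tokens = list(text)
    merges: list[tuple[str, str]] = []
    for _ in range(max_merges):
        pair = most_frequent_pair(tokens)
        if pair is None:          # Step 4: stop when no pair repeats.
            break
        merges.append(pair)
        merged, i = [], 0
        while i < len(tokens):    # Step 3: fuse every occurrence of the pair.
            if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
                merged.append(tokens[i] + tokens[i + 1])
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged
    return tokens, merges

print(bpe_merge("aaabdaaabac"))
# (['aaab', 'd', 'aaab', 'a', 'c'], [('a', 'a'), ('aa', 'a'), ('aaa', 'b')])
```

Run on the string used in the worked example below, this yields the segmentation `['aaab', 'd', 'aaab', 'a', 'c']`, which matches the final `XdXac` result (with `X = aaab`), even though the intermediate merges are expressed as fused strings instead of fresh symbols.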
### Example

Suppose the data to be encoded is:

```text
aaabdaaabac
```

#### Step 1: Merge `"aa"`

Most frequent pair: `"aa"` → replace with `"Z"`

```text
ZabdZabac
Z = aa
```

---

#### Step 2: Merge `"ab"`

Most frequent pair: `"ab"` → replace with `"Y"`

```text
ZYdZYac
Y = ab
Z = aa
```

---

#### Step 3: Merge `"ZY"`

Most frequent pair: `"ZY"` → replace with `"X"`

```text
XdXac
X = ZY
Y = ab
Z = aa
```

---

At this point, no pairs occur more than once, so the process stops.

---

### Decompression

To recover the original data, replacements are applied in **reverse order**:

```text
XdXac
→ ZYdZYac
→ ZabdZabac
→ aaabdaaabac
```

---

### Advantages

- **Efficient vocabulary building**: reduces the need for massive word lists.
- **Handles rare words**: breaks them into meaningful subword units.
- **Balances character- and word-level tokenization**.

---

### Limitations

- Does not consider linguistic meaning; merges are frequency-based.
- May create tokens that are not linguistically natural.
- Vocabulary is fixed after training.
@@ -1,67 +0,0 @@
# SparQL

> [!NOTE]
> Resources taken from [this website](https://sparql.dev/)

## SQL Queries

### SELECT

```SQL
SELECT ?var1, ?var2, ...
```

### WHERE

```SQL
WHERE {
    pattern1 .
    pattern2 .
    ...
}
```

### FILTER

It is used to restrict [`WHERE`](#where) clauses

```SQL
WHERE {
    ?person <http://example.com/hasCar> ?car .
    FILTER (?car = <http://example.com/Car1>)
}
```

### OPTIONAL

It is used to fetch additional content if it exists

```SQL
SELECT ?person ?car
WHERE {
    ?person <http://example.com/hasCar> ?car .
    OPTIONAL {
        ?car <http://example.com/hasColor> ?color .
    }
}
```

### LIMIT

Limits the number of results

```SQL
LIMIT 10 -- Take only 10 results
```

## SparQL functions

### COUNT

```SQL
SELECT (COUNT(?person) AS ?count)
WHERE {
    ?person <http://example.com/hasCar> ?car .
}
```
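The project's data-gathering script pages through DBpedia with the SPARQLWrapper package (listed in requirements.txt). As a rough usage sketch, one of the patterns above can be run against the public endpoint like this; the query text and result handling are illustrative only, not the exact ones used in the scripts.

```python
import SPARQLWrapper

ENDPOINT = "https://dbpedia.org/sparql"

# Illustrative query: count DBpedia films (same shape as the COUNT pattern above).
QUERY = """
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT (COUNT(?subject) AS ?count)
WHERE {
    ?subject rdf:type dbo:Film .
}
"""

sparql = SPARQLWrapper.SPARQLWrapper(ENDPOINT)
sparql.setQuery(QUERY)
sparql.setReturnFormat(SPARQLWrapper.JSON)

# queryAndConvert returns the standard SPARQL JSON results as a dict.
result = sparql.queryAndConvert()
for row in result["results"]["bindings"]:
    print(row["count"]["value"])
```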
BIN  environment.yaml: binary file not shown
@@ -1,17 +0,0 @@
certifi==2025.8.3
charset-normalizer==3.4.3
idna==3.10
numpy==2.3.3
pandas==2.3.2
pyparsing==3.2.4
python-dateutil==2.9.0.post0
pytz==2025.2
rdflib==7.1.4
requests==2.32.5
setuptools==78.1.1
six==1.17.0
SPARQLWrapper==2.0.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
Wikipedia-API==0.8.1