Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl

Fixed creation query to be unique even with movieID in RDFs
CSV support added to path_splitter_tree
2025-09-25 18:33:51 +02:00 · 2025-09-25 17:58:09 +02:00 · 2025-09-25 17:57:46 +02:00 · 2025-09-25 17:57:45 +02:00 · 2025-09-25 16:28:24 +02:00 · 2025-09-25 12:37:52 +02:00
36 changed files with 3002 additions and 2 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -1,2 +1,3 @@
 Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
 Assets/** filter=lfs diff=lfs merge=lfs -text
 Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text
--- a/.gitignore
+++ b/.gitignore
@ -189,7 +189,8 @@ ipython_config.py
 .LSOverride
 # Icon must end with two \r
-Icon
+Icon
 # Thumbnails
 ._*
@ -251,3 +252,6 @@ $RECYCLE.BIN/
 # .nfs files are created when an open file is removed but is still being accessed
 .nfs*
 # ---> Custom
 **/Tmp/**
 !**/.gitkeep
--- a/.vscode/extensions.json
+++ b/.vscode/extensions.json
@ -0,0 +1,14 @@
 {
    "recommendations": [
        "bierner.github-markdown-preview",
        "bierner.markdown-checkbox",
        "bierner.markdown-emoji",
        "bierner.markdown-footnotes",
        "bierner.markdown-mermaid",
        "bierner.markdown-preview-github-styles",
        "bierner.markdown-yaml-preamble",
        "davidanson.vscode-markdownlint",
        "kejun.markdown-alert",
        "yzhang.markdown-all-in-one"
    ]
 }
--- a/Assets/Dataset/1-hop/dataset.csv
+++ b/Assets/Dataset/1-hop/dataset.csv
--- a/Assets/Dataset/1-hop/movie-pageid.csv
+++ b/Assets/Dataset/1-hop/movie-pageid.csv
--- a/Assets/Dataset/1-hop/movies.csv
+++ b/Assets/Dataset/1-hop/movies.csv
--- a/Assets/Dataset/1-hop/reverse.csv
+++ b/Assets/Dataset/1-hop/reverse.csv
--- a/Assets/Dataset/1-hop/uri-abbreviations.csv
+++ b/Assets/Dataset/1-hop/uri-abbreviations.csv
--- a/Assets/Dataset/1-hop/wikipedia-movie.csv
+++ b/Assets/Dataset/1-hop/wikipedia-movie.csv
--- a/Assets/Dataset/1-hop/wikipedia-summary.csv
+++ b/Assets/Dataset/1-hop/wikipedia-summary.csv
--- a/Assets/Dataset/DatawareHouse/dataset.db
+++ b/Assets/Dataset/DatawareHouse/dataset.db
--- a/Assets/Dataset/Tmp/.gitkeep
+++ b/Assets/Dataset/Tmp/.gitkeep
--- a/README.md
+++ b/README.md
@ -1,3 +1,28 @@
 # NanoSocrates
-This is the work project for the DeepLearning exam of 16th September 2025
+This is the work project for the DeepLearning exam of 16th September 2025
 ## Index
 - [Resources](./docs/RESOURCES.md)
 ## Setup
 Create and activate your Conda environment with:
       conda env create -f environment.yaml
       conda activate deep_learning
 Now install the dependencies with pip:
        pip install -r requirements.txt
 ## TroubleShooting
 Sometimes, when uploading a really large batch of data, git can abort the upload because of a timeout.
 The solution is to locally change its settings:
       git config lfs.dialtimeout 3600
       git config lfs.activitytimeout 3600
 For further details, see: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory
--- a/Scripts/DataBaseQueries/dataset.sql
+++ b/Scripts/DataBaseQueries/dataset.sql
@ -0,0 +1,30 @@
 -- To pass to Pandas
 -- Flat triple table: every RDFs row joined with its Subjects, Relationships
 -- and Objects rows (all columns of the four tables are returned).
 SELECT *
 FROM RDFs
 INNER JOIN Subjects USING (SubjectID)
 INNER JOIN Relationships USING (RelationshipID)
 INNER JOIN Objects USING (ObjectID);
 -- To pass to Pandas for abstracts
 -- Every RDFs row paired with the abstract of its movie; rows whose MovieID
 -- has no WikipediaAbstracts entry are dropped by the INNER JOIN.
 SELECT *
 FROM RDFs
 INNER JOIN WikipediaAbstracts USING (MovieID);
 -- To pass to Pandas for abbreviations
 SELECT *
 FROM Abbreviations;
 -- More complex to have clean dataset
 -- One row per MovieID: all triples of the movie concatenated (no separator)
 -- into RDF_String as <SOT><SUB>s<REL>r<OBJ>o<EOT>, plus the movie Abstract.
 -- The Parsed* views supply the (possibly abbreviated) URIs; the *CountInRDFs
 -- views expose Sub_Count / Rel_Count / Obj_Count, presumably so the WHERE
 -- clause inserted below can filter on frequency -- confirm with the caller.
 SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
 FROM RDFs
 INNER JOIN SubjectsCountInRDFs USING (SubjectID)
 INNER JOIN RelationshipsCountInRDFs USING(RelationshipID)
 INNER JOIN ObjectsCountInRDFs USING (ObjectID)
 INNER JOIN ParsedSubjects USING (SubjectID)
 INNER JOIN ParsedRelationships USING (RelationshipID)
 INNER JOIN ParsedObjects USING (ObjectID)
 INNER JOIN WikipediaAbstracts USING (MovieID)
    -- insert WHERE here
 -- WHERE SubjectID = 134626
 GROUP BY MovieID;
--- a/Scripts/DataBaseQueries/db_creation.sql
+++ b/Scripts/DataBaseQueries/db_creation.sql
@ -0,0 +1,174 @@
 -- Table and index definitions (SQLite dialect). Every statement is guarded by
 -- IF NOT EXISTS, so the script can be re-run on an existing database.
 -- NOTE: SQLite enforces the FOREIGN KEY clauses below only on connections
 -- that have executed `PRAGMA foreign_keys = ON`.

 -- One row per movie, identified by its URI.
 CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
 );
 -- At most one Wikipedia page id per movie, and one movie per page id.
 CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
 );
 -- At most one abstract per movie.
 CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
 );
 -- Lookup table of origin names referenced by Subjects and Objects.
 CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
 );
 -- Distinct subject URIs. OriginID is declared INTEGER to match the
 -- Origins.OriginID key it references (it was BIGINT, which has the same
 -- INTEGER affinity in SQLite but was inconsistent with the referenced column).
 CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID INTEGER NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
 );
 -- Distinct relationship (predicate) URIs.
 CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
 );
 -- Distinct object URIs; OriginID typed like Origins.OriginID (see Subjects).
 CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID INTEGER NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
 );
 -- One row per (movie, subject, relationship, object) triple; the UNIQUE
 -- constraint rejects the same triple being stored twice for the same movie.
 CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
 );
 -- Single-column indexes for the joins on RDFs.
 -- NOTE(review): idx_rdf_movie_id covers the leading column of the UNIQUE
 -- constraint above, which SQLite already backs with an index; it may be
 -- redundant -- confirm with EXPLAIN QUERY PLAN before dropping it.
 CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
 CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
 CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
 CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
 -- URI prefix -> short abbreviation; both sides are unique.
 CREATE TABLE IF NOT EXISTS Abbreviations (
    AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
    URI TEXT UNIQUE NOT NULL,
    Abbreviation TEXT UNIQUE NOT NULL
 );
 -- Junction tables linking subjects / relationships / objects to abbreviations.
 CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
    SubjectID INTEGER NOT NULL,
    AbbreviationID INTEGER NOT NULL,
    PRIMARY KEY(SubjectID, AbbreviationID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
 );
 CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
    RelationshipID INTEGER NOT NULL,
    AbbreviationID INTEGER NOT NULL,
    PRIMARY KEY(RelationshipID, AbbreviationID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
 );
 CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
    ObjectID INTEGER NOT NULL,
    AbbreviationID INTEGER NOT NULL,
    PRIMARY KEY(ObjectID, AbbreviationID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
 );
 -- Indexes on both sides of each junction table.
 -- NOTE(review): the *_sub_id / *_rel_id / *_obj_id indexes cover the leading
 -- column of the composite primary keys above and may be redundant -- confirm
 -- with EXPLAIN QUERY PLAN before dropping them.
 CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
 CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
 CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
 CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
 CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
 CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
 -- Views
 -- Subjects: SubjectURI rewritten as '<Abbreviation>:<rest>' when the subject
 -- is linked to an abbreviation, otherwise the raw SubjectURI.
 CREATE VIEW IF NOT EXISTS ParsedSubjects AS
 SELECT
     SubjectID,
     CASE
         WHEN Abbreviation IS NOT NULL
             THEN Abbreviation || ':' || replace(SubjectURI, URI, '')
         ELSE SubjectURI
     END AS SubjectURI
 FROM Subjects
 LEFT JOIN Subjects_Abbreviations USING (SubjectID)
 LEFT JOIN Abbreviations USING (AbbreviationID);
 -- Relationships: RelationshipURI rewritten as '<Abbreviation>:<rest>' when the
 -- relationship is linked to an abbreviation, otherwise the raw RelationshipURI.
 CREATE VIEW IF NOT EXISTS ParsedRelationships AS
 SELECT
     RelationshipID,
     CASE
         WHEN Abbreviation IS NOT NULL
             THEN Abbreviation || ':' || replace(RelationshipURI, URI, '')
         ELSE RelationshipURI
     END AS RelationshipURI
 FROM Relationships
 LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
 LEFT JOIN Abbreviations USING (AbbreviationID);
 -- Objects: ObjectURI rewritten as '<Abbreviation>:<rest>' when the object is
 -- linked to an abbreviation, otherwise the raw ObjectURI.
 CREATE VIEW IF NOT EXISTS ParsedObjects AS
 SELECT
     ObjectID,
     CASE
         WHEN Abbreviation IS NOT NULL
             THEN Abbreviation || ':' || replace(ObjectURI, URI, '')
         ELSE ObjectURI
     END AS ObjectURI
 FROM Objects
 LEFT JOIN Objects_Abbreviations USING (ObjectID)
 LEFT JOIN Abbreviations USING (AbbreviationID);
 -- Subject Count: number of RDFs rows in which each SubjectID appears.
 CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs AS
 SELECT
     SubjectID,
     count(SubjectID) AS Sub_Count
 FROM RDFs
 GROUP BY SubjectID;
 -- Relationship Count: number of RDFs rows in which each RelationshipID appears.
 CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs AS
 SELECT
     RelationshipID,
     count(RelationshipID) AS Rel_Count
 FROM RDFs
 GROUP BY RelationshipID;
 -- Object Count: number of RDFs rows in which each ObjectID appears.
 CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs AS
 SELECT
     ObjectID,
     count(ObjectID) AS Obj_Count
 FROM RDFs
 GROUP BY ObjectID;
--- a/Scripts/DataBaseQueries/query.sql
+++ b/Scripts/DataBaseQueries/query.sql
@ -0,0 +1,55 @@
 -- Parameterised statements used while loading the database; every `?` is a
 -- positional bind parameter supplied by the caller.

 -- Insert MovieURI into Movies; MovieID is auto-incremented
 INSERT INTO Movies (MovieURI) VALUES (?);
 -- Get MovieID where MovieURI equals the given value
 SELECT MovieID FROM Movies WHERE MovieURI = ?;
 -- SetPageId
 INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);
 -- Get MovieID by PageID (needed to create the WikipediaAbstracts row)
 SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;
 -- SetAbstract
 INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);
 -- SetOrigin
 -- String literals are single-quoted: a double-quoted token is an identifier
 -- in SQL, and SQLite only falls back to treating it as a string through a
 -- legacy compatibility quirk that can be disabled.
 INSERT INTO Origins (OriginName) VALUES ('dataset.csv'), ('reverse.csv');
 -- GetOrigin
 SELECT OriginID FROM Origins WHERE OriginName = ?;
 -- Subject, Relationship, Object, RDF
 INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
 INSERT INTO Relationships (RelationshipURI) VALUES (?);
 INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);
 SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
 SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
 SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
 INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
 -- Prefixes
 INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
 INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
 INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
 INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
 -- Please be sure it is a URI before running this query
 --  and take at least until the domain and the first path part
 SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
 -- Query to retrieve data
 -- One row per MovieID: all triples of the movie concatenated (no separator)
 -- into RDF_String as <SOT><SUB>s<REL>r<OBJ>o<EOT>, using the abbreviated URIs
 -- from the Parsed* views, plus the movie Abstract. Movies without a
 -- WikipediaAbstracts row are dropped by the INNER JOIN.
 -- NOTE(review): no ordering is requested inside GROUP_CONCAT, so the order of
 -- the triples within RDF_String is up to the engine -- confirm this is fine.
 SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
 FROM RDFs
 INNER JOIN ParsedSubjects USING (SubjectID)
 INNER JOIN ParsedRelationships USING (RelationshipID)
 INNER JOIN ParsedObjects USING (ObjectID)
 INNER JOIN WikipediaAbstracts USING (MovieID)
    -- insert WHERE here
 GROUP BY MovieID;
--- a/Scripts/DataCleaning/clean_relationship.ipynb
+++ b/Scripts/DataCleaning/clean_relationship.ipynb
@ -0,0 +1,186 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "b9081b7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This file deletes in the pipeline the unwanted relationship by different rules\n",
    "import pandas as pd\n",
    "import sqlite3\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
    "\n",
    "def get_RDF() -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    QUERY = \"SELECT * FROM RDFs \" \\\n",
    "    \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
    "    \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
    "    \"INNER JOIN Objects USING (ObjectID);\"\n",
    "    RDF = pd.read_sql_query(QUERY, CONN)\n",
    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
    "    RDF = RDF.dropna()\n",
    "    \"\"\"\n",
    "    Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
    "    Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
    "    Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
    "    RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
    "\n",
    "    # drop '' values \n",
    "    Subjects = Subjects.replace('', np.nan)# .dropna()\n",
    "    Relationships = Relationships.replace('', np.nan)# .dropna()\n",
    "    Objects = Objects.replace('', np.nan)# .dropna()\n",
    "\n",
    "    # join RDF with its components\n",
    "    RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
    "    RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
    "    RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
    "    return RDF\n",
    "\n",
    "\n",
    "#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
    "\n",
    "def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
    "    return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
    "\n",
    "\n",
    "\n",
    "RDF = get_RDF()\n",
    "# RDF = RDF.dropna()\n",
    "# print(RDF)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "644690bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
    "    counts = RDF[\"RelationshipURI\"].value_counts() \n",
    "    RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
    "    RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
    "    # counts is a series as key: relationship, value: count\n",
    "    # counts = counts[counts > count_treshold]\n",
    "    # relationships = counts.index\n",
    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
    "    return RDF\n",
    "\n",
    "RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
    "# print(new_RDF)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34525be6",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                 SubjectURI  \\\n",
      "0             http://dbpedia.org/resource/Nights_of_Cabiria   \n",
      "1         http://dbpedia.org/resource/California_Science...   \n",
      "2                 http://dbpedia.org/resource/China_Captain   \n",
      "3         http://dbpedia.org/resource/Caravan_of_Courage...   \n",
      "4                http://dbpedia.org/resource/WHIH_Newsfront   \n",
      "...                                                     ...   \n",
      "12725500   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
      "12725501   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
      "12725502  http://dbpedia.org/resource/I_Witnessed_Genoci...   \n",
      "12725503  http://dbpedia.org/resource/I_Woke_Up_Early_th...   \n",
      "12725504           http://dbpedia.org/resource/I_Won't_Play   \n",
      "\n",
      "                                       RelationshipURI  \\\n",
      "0          http://www.w3.org/2002/07/owl#differentFrom   \n",
      "1          http://www.w3.org/2002/07/owl#differentFrom   \n",
      "2          http://www.w3.org/2002/07/owl#differentFrom   \n",
      "3          http://www.w3.org/2002/07/owl#differentFrom   \n",
      "4         http://www.w3.org/2000/01/rdf-schema#seeAlso   \n",
      "...                                                ...   \n",
      "12725500          http://dbpedia.org/ontology/producer   \n",
      "12725501          http://dbpedia.org/ontology/producer   \n",
      "12725502          http://dbpedia.org/ontology/producer   \n",
      "12725503          http://dbpedia.org/ontology/producer   \n",
      "12725504          http://dbpedia.org/ontology/producer   \n",
      "\n",
      "                                                  ObjectURI  MovieID  \\\n",
      "0                       http://dbpedia.org/resource/Cabiria       26   \n",
      "1         http://dbpedia.org/resource/California_Academy...      185   \n",
      "2                 http://dbpedia.org/resource/Captain_China      614   \n",
      "3         http://dbpedia.org/resource/Caravan_of_Courage...      740   \n",
      "4         http://dbpedia.org/resource/Captain_America:_C...      594   \n",
      "...                                                     ...      ...   \n",
      "12725500           http://dbpedia.org/resource/Ava_DuVernay   145854   \n",
      "12725501           http://dbpedia.org/resource/Molly_Mayeux   145854   \n",
      "12725502        http://dbpedia.org/resource/Headlines_Today   145861   \n",
      "12725503             http://dbpedia.org/resource/Billy_Zane   145862   \n",
      "12725504    http://dbpedia.org/resource/Gordon_Hollingshead   145864   \n",
      "\n",
      "          RelationshipFreq  MovieFreq  \n",
      "0                     2132        216  \n",
      "1                     2132        264  \n",
      "2                     2132         66  \n",
      "3                     2132        131  \n",
      "4                     1653        133  \n",
      "...                    ...        ...  \n",
      "12725500             80077         95  \n",
      "12725501             80077         95  \n",
      "12725502             80077         41  \n",
      "12725503             80077         98  \n",
      "12725504             80077         91  \n",
      "\n",
      "[12725505 rows x 6 columns]\n"
     ]
    }
   ],
   "source": [
    "def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
    "    counts = RDF[\"MovieID\"].value_counts() \n",
    "    RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
    "    RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
    "    RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
    "    # counts is a series as key: relationship, value: count\n",
    "    # counts = counts[counts > count_treshold]\n",
    "    # relationships = counts.index\n",
    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
    "    return RDF\n",
    "\n",
    "RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
    "print(RDF)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deep_learning",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
--- a/Scripts/DataCleaning/dbpedia-uri.py
+++ b/Scripts/DataCleaning/dbpedia-uri.py
@ -0,0 +1,77 @@
 import argparse
 import sys
 class ProgramArgs:
    """Value holder for the command-line options accepted by this script."""

    def __init__(self, file: str, output: str, treshold: int):
        # file: input path, output: destination path, treshold: --treshold value.
        self.file, self.output, self.treshold = file, output, treshold
 def get_args(args: list[str]) -> ProgramArgs:
    """Parse ``args`` into a ProgramArgs.

    Recognised options are ``--input-file/-i`` and ``--output-file/-o`` (both
    required) and ``--treshold/-t`` (int, default 1). Any other entry of
    ``args`` is ignored because ``parse_known_args`` is used.
    """
    option_table = (
        (("--input-file", "-i"), {"required": True, "type": str}),
        (("--output-file", "-o"), {"required": True, "type": str}),
        (("--treshold", "-t"), {"type": int, "default": 1}),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    namespace, _ignored = parser.parse_known_args(args)
    return ProgramArgs(namespace.input_file, namespace.output_file, namespace.treshold)
 def print_dbpedia(file: str, out: str):
    """Write one abbreviation row per distinct DBpedia URI prefix found in ``file``.

    Each input line is split on "/" (empty sections removed). The prefix is
    ``<scheme>//<host>/<first path section>``; it is kept only when "dbpedia"
    is one of the dot-separated labels of the host. For every new prefix a
    line ``"<prefix>/", "<sub>-db<t>"`` is written to ``out``, where ``<sub>``
    is the first host label cut to three characters and ``<t>`` is the first
    character of the first path section.

    Args:
        file: path of the UTF-8 text file to read, one URI per line.
        out: path of the UTF-8 file to (over)write.
    """
    DOMAIN_PART = "dbpedia"
    already_parsed: set[str] = set()
    # Context managers close both files even when a row raises midway.
    with open(file, "r", encoding="utf-8") as source, \
            open(out, mode="w", encoding="utf-8") as sink:
        for row in source:
            # Strip the line terminator first: otherwise it ends up inside the
            # last section and corrupts the prefix of three-section rows.
            sections = [part for part in row.strip().split("/") if part != ""]
            if len(sections) < 3:
                continue
            uri = f"{sections[0]}//{sections[1]}/{sections[2]}"
            if uri in already_parsed:
                continue
            subdomains = sections[1].split(".")
            if DOMAIN_PART not in subdomains:
                continue
            already_parsed.add(uri)
            # At most the first three characters of the first host label.
            sub_id = subdomains[0][:3]
            kind = sections[2][0]
            sink.write(f"\"{uri}/\", \"{sub_id}-db{kind}\"\n")
 if __name__ == "__main__":
    # Script entry point: parse the command line and run the extraction.
    # sys.argv is passed whole; get_args uses parse_known_args, so entries it
    # does not recognise are ignored.
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    # NOTE: ARGS.treshold is parsed but not passed on; print_dbpedia only
    # receives the input and output paths.
    print_dbpedia(ARGS.file, ARGS.output)
--- a/Scripts/DataCleaning/path_splitter_tree.py
+++ b/Scripts/DataCleaning/path_splitter_tree.py
@ -0,0 +1,162 @@
 import argparse
 import csv
 import sys
 from typing import Self
 class ProgramArgs:
    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
        """Store the command-line options of the script.

        Args:
            file (str): Path of the input CSV file.
            csv_uri_header (str): The name of the column of the csv file from which the program will get the URIs
            output (str): Path of the output file.
            treshold (int): Value of the --treshold option.
        """
        self.file, self.csv_uri_header = file, csv_uri_header
        self.output, self.treshold = output, treshold
 class Node:
    """One segment in a prefix tree built from lists of path sections.

    ``quantity`` counts how many inserted paths continued through this node
    into one of its children; the last segment of a path is therefore created
    with quantity 0 and only grows if longer paths pass through it later.
    """

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        # Child segments keyed by their name.
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        """True when no child segment has been recorded under this node."""
        return len(self.children) == 0

    def append_child(self, child: list[str]):
        """Insert the path ``child`` (a list of segments) below this node.

        Missing nodes are created on the way down, and every node that gains
        or reuses a child for this path has its ``quantity`` incremented by
        one. An empty list is a no-op (it used to raise IndexError).

        Args:
            child: the remaining path segments, first segment first.
        """
        # Iterative walk: same bookkeeping as the recursive form, without
        # slicing the list at every level.
        node = self
        for key in child:
            following = node.children.get(key)
            if following is None:
                # First time this branch is traversed: create its node.
                following = Node(key, 0)
                node.children[key] = following
            node.quantity += 1
            node = following

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"
 def get_args(args: list[str]) -> ProgramArgs:
    """Parse ``args`` into a ProgramArgs.

    Recognised options are ``--input-file/-i``, ``--header-name/-c`` (c stands
    for column) and ``--output-file/-o`` (all required), plus ``--treshold/-t``
    (int, default 1). Any other entry of ``args`` is ignored because
    ``parse_known_args`` is used.
    """
    option_table = (
        (("--input-file", "-i"), {"required": True, "type": str}),
        (("--header-name", "-c"), {"required": True, "type": str}),
        (("--output-file", "-o"), {"required": True, "type": str}),
        (("--treshold", "-t"), {"type": int, "default": 1}),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    namespace, _ignored = parser.parse_known_args(args)
    return ProgramArgs(
        namespace.input_file,
        namespace.header_name,
        namespace.output_file,
        namespace.treshold,
    )
 def get_debug_args() -> ProgramArgs:
    """Return the fixed arguments used for debugging runs.

    Equivalent command line:
    -i ./Assets/Dataset/1-hop/movies.csv  -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
    """
    return ProgramArgs(
        file="./Assets/Dataset/1-hop/movies.csv",
        csv_uri_header="subject",
        output="./Assets/Dataset/Tmp/prova.csv",
        treshold=1,
    )
 def tree_like(file: str, csv_uri_header: str, out: str, treshold: int = 1):
    """Write an indented tree of the URI path sections found in a CSV column.

    Every value of column ``csv_uri_header`` is split on "/" (empty sections
    removed) and inserted into one of two trees: "uri" when the first section
    is "http:" or "https:", "pure" otherwise. The trees are then written to
    ``out`` depth first, one ``- <name>/ - <quantity>`` line per node,
    indented four spaces per level.

    Args:
        file: path of the UTF-8 CSV file to read.
        csv_uri_header: name of the CSV column holding the URIs.
        out: path of the UTF-8 file to (over)write.
        treshold: nodes whose quantity is below this value are skipped together
            with their whole subtree. Leaf nodes have quantity 0, so they only
            appear when the threshold is 0. It used to be read from the module
            global ``ARGS``, which does not exist when this function is
            imported from another module.

    Raises:
        KeyError: if ``csv_uri_header`` is not a column of the CSV file.
    """
    INDENTATION = "    "
    properties: dict[str, Node] = {
        "pure": Node("pure", 0),
        "URI": Node("uri", 0),
    }

    # Context manager: the input is closed even if a row raises.
    with open(file, "r", encoding="utf-8") as source:
        for row in csv.DictReader(source):
            uri_element = row[csv_uri_header]
            if not uri_element:
                # Empty or missing cell: nothing to insert.
                continue
            sections = [part for part in uri_element.split("/") if part != ""]
            if not sections:
                # The cell contained only slashes; sections[0] would raise.
                continue
            branch = "URI" if sections[0] in ("http:", "https:") else "pure"
            properties[branch].append_child(sections)

    # Explicit stack for the depth-first walk; entries are (node, depth).
    stack: list[tuple[Node, int]] = [(item, 0) for item in properties.values()]

    with open(out, mode="w", encoding="utf-8") as sink:
        while stack:
            node, depth = stack.pop()
            if treshold > node.quantity:
                continue
            sink.write(f"{INDENTATION * depth}- {node}\n")
            # A leaf has no children, so this is a no-op for it.
            stack.extend((child, depth + 1) for child in node.children.values())
 if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.csv_uri_header, ARGS.output, ARGS.treshold)
--- a/Scripts/DataGathering/analysis.py
+++ b/Scripts/DataGathering/analysis.py
@ -0,0 +1,53 @@
 import argparse
 import sys
 import pandas as pd
 class ProgramArgs:
    """Value holder for the command-line options accepted by this script."""

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        # input_file: CSV to read, column: column to analyse,
        # output_file: destination path, count: whether counts are written too.
        self.input_file, self.column = input_file, column
        self.output_file, self.count = output_file, count
 def get_args(args: list[str]) -> ProgramArgs:
    """Parse ``args`` into a ProgramArgs.

    Recognised options are ``--input-file/--input/-i``,
    ``--output-file/--output/-o`` and ``--column/--col`` (all required), plus
    the ``--count/-c`` flag (False unless given). Any other entry of ``args``
    is ignored because ``parse_known_args`` is used.
    """
    option_table = (
        (("--input-file", "--input", "-i"), {"required": True, "type": str}),
        (("--output-file", "--output", "-o"), {"required": True, "type": str}),
        (("--column", "--col"), {"required": True, "type": str}),
        (("--count", "-c"), {"action": "store_const", "const": True, "default": False}),
    )
    parser = argparse.ArgumentParser()
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    namespace, _ignored = parser.parse_known_args(args)
    return ProgramArgs(
        input_file=namespace.input_file,
        column=namespace.column,
        output_file=namespace.output_file,
        count=namespace.count,
    )
 if __name__ == "__main__":
    # Script entry point: count the distinct values of one CSV column and
    # write them, most frequent first, one per line.
    ARGS = get_args(sys.argv)
    # Load the CSV and count before touching the output, so that a bad input
    # path or column name does not leave a truncated output file behind.
    df = pd.read_csv(ARGS.input_file)
    # Count occurrences of each unique value of the column
    item_counts = df[ARGS.column].value_counts()
    # The context manager flushes and closes the output file, which was
    # previously left open.
    with open(ARGS.output_file, "w+", encoding="utf-8") as OUTPUT_FILE:
        for item, count in item_counts.items():
            if ARGS.count:
                OUTPUT_FILE.write(f"{item}: {count}\n")
            else:
                OUTPUT_FILE.write(f"{item}\n")
--- a/Scripts/DataGathering/fetchdata.py
+++ b/Scripts/DataGathering/fetchdata.py
@ -0,0 +1,146 @@
 import argparse
 from math import floor
 import sys
 from time import sleep
 import SPARQLWrapper
 class ProgramData:
    """Run-time settings of the fetcher, plus two derived read-only values."""

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        # Where the data goes, where the query text lives, which endpoint to ask.
        self.local_url, self.query_url, self.sparql_url = local_url, query_url, sparql_url
        self.output_type = output_type
        # Paging and pacing settings.
        self.initial_offset, self.limit, self.max_pages = initial_offset, limit, max_pages
        self.timeout = timeout
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        """Same value as ``limit``."""
        return self.limit

    @property
    def query(self):
        """Text of the file at ``query_url``, read again on every access."""
        with open(self.query_url, "r") as handle:
            content = handle.read()
        return content
 # Module-level defaults for the fetcher.
 # Default value of the --url option.
 DBPEDIA_URL = "https://dbpedia.org/sparql"
 # Result format constant taken from SPARQLWrapper (CSV).
 TYPE = SPARQLWrapper.CSV
 # Default value of the --timeout option.
 TIMEOUT_SECONDS = 1.5
 # Default value of the --limit option (10000).
 LIMIT = int(1E4)
 # Default value of the --offset option.
 INITIAL_OFFSET = 0
 # Default value of the --max-pages option.
 MAX_PAGES = int(1E9)
 def gather_cli_args(args: list[str]) -> ProgramData:
    """Parse ``args`` into a ProgramData.

    ``--file-path/--file/--output/-o`` and ``--query-file/--query/-q`` are
    required; ``--url``, ``--limit``, ``--timeout``, ``--offset`` and
    ``--max-pages`` fall back to the module-level defaults; ``--verbose/-v``
    counts its occurrences. The output type is always ``SPARQLWrapper.CSV``.
    Any other entry of ``args`` is ignored because ``parse_known_args`` is used.
    """
    # TODO: Add argument for type
    option_table = (
        (("--file-path", "--file", "--output", "-o"), {"required": True, "type": str}),
        (("--query-file", "--query", "-q"), {"required": True, "type": str}),
        (("--url",), {"type": str, "default": DBPEDIA_URL}),
        (("--limit",), {"type": int, "default": LIMIT}),
        (("--timeout",), {"type": float, "default": TIMEOUT_SECONDS}),
        (("--offset",), {"type": int, "default": INITIAL_OFFSET}),
        (("--max-pages",), {"type": int, "default": MAX_PAGES}),
        (("--verbose", "-v"), {"action": "count", "default": 0}),
    )
    parser = argparse.ArgumentParser("sparql data fetcher")
    for flags, settings in option_table:
        parser.add_argument(*flags, **settings)
    options, _ignored = parser.parse_known_args(args)
    return ProgramData(
        local_url=options.file_path,
        query_url=options.query_file,
        sparql_url=options.url,
        output_type=SPARQLWrapper.CSV,
        initial_offset=options.offset,
        timeout=options.timeout,
        limit=options.limit,
        max_pages=options.max_pages,
        verbosity_level=options.verbose,
    )
 def fetch_data(DATA: ProgramData):
    """Download a SPARQL result page by page and append it to ``DATA.local_url``.

    The query text from ``DATA.query`` is extended with ``LIMIT``/``OFFSET``
    clauses and sent to ``DATA.sparql_url`` once per page. The loop stops when
    a page comes back empty or when the page counter reaches
    ``DATA.max_pages - 1``; between pages it sleeps ``DATA.timeout`` seconds.
    A page that raises is reported on stdout and skipped (best effort).

    All paging settings come from ``DATA``. The module-level defaults were
    previously used directly, so --limit, --timeout and --max-pages given on
    the command line were ignored for the LIMIT clause, the sleep and the stop
    condition.

    Args:
        DATA: run settings; ``limit`` must be non-zero.
    """
    # Take correction of page into account: DATA.offset is one full page, so
    # the counter starts one below the page that contains initial_offset and
    # the first computed offset is that page's start.
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    # The query file does not change between pages: read it once.
    base_query = DATA.query
    done = False
    while not done:
        print(f"Starting to get page {page}")
        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
        sparql.setReturnFormat(DATA.output_type)
        CURRENT_PAGE_QUERY = "\n".join([
            base_query,
            f"LIMIT {DATA.limit}",
            f"OFFSET {CURRENT_OFFSET}"
        ])
        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")
        sparql.setQuery(CURRENT_PAGE_QUERY)
        try:
            res = sparql.queryAndConvert()
            text = ""
            if isinstance(res, bytes):
                lines = res.decode("utf-8", "ignore").split("\n")
                # The first line of each page is skipped as a header, except
                # on the page at offset 0. The old test `page != 0` dropped it
                # on the first page (index -1) and kept it on the second.
                first_row = 0 if CURRENT_OFFSET == 0 else 1
                text = "\n".join(lines[first_row:])
            if text == "":
                done = True
                continue
            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(text)
        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")
        print(f"Sleeping for {DATA.timeout}")
        page += 1
        # >= rather than ==, so a run resumed beyond max_pages still stops.
        if page >= DATA.max_pages - 1:
            done = True
        sleep(DATA.timeout)
 if __name__ == "__main__":
    # Script entry point: build the run settings from the command line and
    # start fetching. sys.argv is passed whole; gather_cli_args uses
    # parse_known_args, so entries it does not recognise are ignored.
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)
--- a/Scripts/DataGathering/wikipedia_gathering.py
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@ -0,0 +1,154 @@
 from pathlib import Path
 import pandas as pd
 import csv
 import time
 import requests
 # CSV read by this script; the page id is taken from its second column.
 input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
 # CSV the fetched abstracts are appended to (columns: subject, text).
 output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"
 # One HTTP session shared by every request issued by get_clean_text.
 sess = requests.Session()
 # Number of page ids collected before each flush() call.
 CHUNK = 20
 # Function to get clean full text from Wikipedia PageID
 def get_clean_text(pageIDS: list[str]):
    """Fetch the plain-text intro extract of several Wikipedia pages in one call.

    Args:
        pageIDS: Wikipedia page ids (as strings) requested together; they are
            joined with ``|`` into a single ``pageids`` query parameter.

    Returns:
        A dict mapping every page id that came back with a non-empty extract
        to that extract, stripped and with newlines removed.  Ids that are
        absent from the response, or that have no extract, are left out and
        only counted in the log output.
    """
    # Total time spent cleaning extracts in this chunk (accumulated per page).
    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
    }
    ids = "|".join(pageIDS)
    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")
    data = res.json()
    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        pages = data["query"]["pages"]
        for pageID in pageIDS:
            if pageID in pages:
                extract = pages[pageID].get("extract")
                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    page_parse_time = end_parse - start_parse
                    # Accumulate instead of overwriting, so the "NO PARSE"
                    # figure below subtracts the parse time of every page.
                    parsing_time += page_parse_time
                    print(f"Time elapsed PARSE: {page_parse_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")
    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()
    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts
 def flush(movie_ids):
    """Fetch the abstracts for ``movie_ids`` and append them to the output CSV.

    One ``subject,text`` row is written per abstract returned by
    ``get_clean_text``; the time spent writing is printed afterwards.
    """
    fetched = get_clean_text(movie_ids)
    write_started = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as sink:
        rows = csv.DictWriter(sink, fieldnames=["subject", "text"])
        rows.writerows(
            {"subject": page_id, "text": abstract}
            for page_id, abstract in fetched.items()
        )
    write_finished = time.time()
    print(f"Time elapsed WRITE: {write_finished - write_started} seconds")
 def reconcile() -> int:
    """Find where a previous run stopped, so the input can be resumed.

    The page id stored in the last data row of the output CSV is searched in
    the second column of the input CSV.

    Returns:
        The 1-based position, among the input's data rows, of the last page id
        already written to the output; 0 when the output holds no data row yet
        (fresh run, nothing to skip).

    Raises:
        ValueError: if the last written page id is not present in the input.
    """
    start = time.time()
    # Last id already written.  csv.reader is used instead of splitting on
    # "," so quoted fields and "\r\n" line endings are handled correctly.
    last_row = None
    with open(output_csv, "r", newline="", encoding="utf-8") as output_file:
        for row in csv.reader(output_file):
            if row:
                last_row = row
    index = 0
    # An empty file or a header-only file means there is nothing to resume.
    if last_row is not None and last_row[0] != "subject":
        LAST_CHECKED = last_row[0]
        found = False
        with open(input_csv, "r", newline="", encoding="utf-8") as input_file:
            input_rows = csv.reader(input_file)
            next(input_rows, None)  # skip the header line
            for index, row in enumerate(input_rows, start=1):
                if len(row) > 1 and row[1] == LAST_CHECKED:
                    found = True
                    break
        if not found:
            raise ValueError(
                f"page id {LAST_CHECKED!r} from {output_csv} was not found in {input_csv}"
            )
    end = time.time()
    print(f"Time elapsed RECONCILE: {end - start} seconds")
    print(f"FOUND, we need to skip {index} lines")
    return index
 if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()
 SKIP = reconcile()
 # Stream the input CSV and fetch the abstracts CHUNK page ids at a time
 with open(input_csv, "r", newline="", encoding="utf-8") as input:
    # Skip already done
    for i in range(0, SKIP):
        next(input)
    reader = csv.reader(input)
    index = -1
    movie_ids = []
    for line in reader:
        index += 1
        # The first line read after the skip is never processed.
        if index == 0:
            continue
        # Collect the page id (second column) for the next request
        movie_ids.append(line[1])
        if index % CHUNK == 0:
            # Flush movies
            flush(movie_ids)
            movie_ids = []
    # Flush the last, partial chunk: without this the final (fewer than
    # CHUNK) page ids of the input would never be fetched.
    if movie_ids:
        flush(movie_ids)
--- a/Scripts/DatasetMerging/datasetInfo.md
+++ b/Scripts/DatasetMerging/datasetInfo.md
@ -0,0 +1,26 @@
 # HOW THE DATASET IS BUILT AND POPULATED
 Note: the data are taken from CSV files in 1-hop
 ## CSV files composition
 | CSV files          | Original structure                    | Saved AS                            |
 |--------------------|---------------------------------------|-------------------------------------|
 | Wikipedia-summary  | PageId / abstract                     | subject, text                       |
 | Movies             | Movie URI                             | "subject"                           |
 | Dataset            | Movie URI / Relationship / Object [RDF] | subject, relationship, object       |
 | Movies-PageId      | Movie URI / PageId (wiki)             | "subject", "object"                 |
 | Reverse            | Subject / Relationship / Movie URI    | "subject", "relationship", "object" |
 ## Wanted tables schema
 | Table         | Columns                                                                 |
 |---------------|-------------------------------------------------------------------------|
 | Movies        | MovieID [PK], Movie URI                                                 |
 | WikiPageIDs   | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)*         |
 | Abstracts     | MovieID [PK, FK], abstract                                              |
 | Subjects      | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
 | Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
 | Objects       | ObjectID [PK], RDF Object, OriginID [FK]                                |
 | Origins       | OriginID [PK], Origin Name                                              |
 | RDFs          | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |
--- a/Scripts/DatasetMerging/datawarehouse.py
+++ b/Scripts/DatasetMerging/datawarehouse.py
@ -0,0 +1,633 @@
 import sqlite3
 import csv
 #####################################################################
 #   This file builds DatawareHouse/dataset.db from 1-hop csv files  #
 #   Its Schema in . /SQL_Queries/db_creation.sql                    #
 #   The sql query used to populate ids in . /SQL_Queries/query.sql  #
 #####################################################################
 # sometimes you may need to build a new db file, here a little snippet for you
 # sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
 # --- Global configuration ---
 # SQLite database file the functions below read from and write to.
 DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
 # Source CSV files.
 MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
 PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
 SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
 DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
 REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
 URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
 # Every source CSV is opened as soon as the module runs, whichever parse step
 # is enabled, and is closed at the bottom of the script.
 # NOTE(review): each handle is shared by all parsers reading the same file
 # (e.g. parseRDF_Dataset and parseAbbr_Dataset), so it is consumed by
 # whichever of them runs first.
 MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
 PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
 SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
 DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
 REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
 URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")
 # Single connection and cursor used by the whole script.
 CONN = sqlite3.connect(DB_NAME)
 CURS = CONN.cursor()
 # MARK: SQL Definitions
 # Insert MovieURI
 def insertOrigin(curs: sqlite3.Cursor) -> bool:
    """Register the two CSV origins ('dataset.csv', 'reverse.csv') in Origins.

    Returns:
        True when the rows were inserted, False when SQLite raised an
        IntegrityError.
    """
    statement = "INSERT INTO  Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(statement)
    except sqlite3.IntegrityError:
        return False
    return True
 def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
    curs.execute(QUERY, [originName])
    originId = curs.fetchone()
    if not originId:
        return None
    # in this case the real id is the first element of the tuple
    return originId[0]
 def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    """Add one movie URI to the Movies table.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    try:
        curs.execute("INSERT INTO Movies (MovieURI) VALUES (?);", (movieUri,))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
 def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
    curs.execute(QUERY, [movieUri])
    movieId = curs.fetchone()
    if not movieId:
        return None
    # in this case the real id is the first element of the tuple
    return movieId[0]
 def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Store the Wikipedia page id of a movie in WikiPageIDs.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    values = (movieId, pageId)
    try:
        curs.execute("INSERT INTO  WikiPageIDs (MovieID, PageID) VALUES (?,?);", values)
    except sqlite3.IntegrityError:
        return False
    return True
 def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
    curs.execute(QUERY, [pageId])
    movieId = curs.fetchone()
    if not movieId:
        return None
    # in this case the real id is the first element of the tuple
    return movieId[0]
 def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store the Wikipedia abstract of a movie in WikipediaAbstracts.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    statement = "INSERT INTO  WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(statement, (movieId, abstract))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
 def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Add an RDF subject URI, tagged with its origin, to Subjects.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    values = (subjectURI, originID)
    try:
        curs.execute("INSERT INTO  Subjects (SubjectURI, OriginID) VALUES (?,?);", values)
    except sqlite3.IntegrityError:
        return False
    return True
 def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Add an RDF relationship URI to Relationships.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    try:
        curs.execute(
            "INSERT INTO  Relationships (RelationshipURI) VALUES (?);",
            (relationshipURI,),
        )
    except sqlite3.IntegrityError:
        return False
    return True
 def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Add an RDF object URI, tagged with its origin, to the objects table.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    statement = "INSERT INTO  objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(statement, (objectURI, originID))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
 def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
    curs.execute(QUERY, [subjectURI])
    subjectId = curs.fetchone()
    if not subjectId:
        return None
    # in this case the real id is the first element of the tuple
    return subjectId[0]
 def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
    curs.execute(QUERY, [relationshipURI])
    relationshipId = curs.fetchone()
    if not relationshipId:
        return None
    # in this case the real id is the first element of the tuple
    return relationshipId[0]
 def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
    curs.execute(QUERY, [objectURI])
    objectId = curs.fetchone()
    if not objectId:
        return None
    # in this case the real id is the first element of the tuple
    return objectId[0]
 def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
 ) -> bool:
    """Add one triple (subject, relationship, object) of a movie to RDFs.

    Returns:
        True when the row was inserted, False when SQLite raised an
        IntegrityError.
    """
    values = (movieId, subjectId, relationshipId, objectId)
    try:
        curs.execute(
            "INSERT INTO  RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);",
            values,
        )
    except sqlite3.IntegrityError:
        return False
    return True
 # UGLY: correct method to add cursor
 def insert_abbreviation(uri, abbreviation) -> bool:
    QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
    try:
        CURS.execute(QUERY, [uri, abbreviation])
        return True
    except sqlite3.IntegrityError:
        return False
 # UGLY: correct method to add cursor
 def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
    QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(QUERY, [object_id, abbreviation_id])
        return True
    except sqlite3.IntegrityError:
        return False
 # UGLY: correct method to add cursor
 def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
    QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(QUERY, [relationship_id, abbreviation_id])
        return True
    except sqlite3.IntegrityError:
        return False
 # UGLY: correct method to add cursor
 def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
    QUERY = (
        "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
    )
    try:
        CURS.execute(QUERY, [subject_id, abbreviation_id])
        return True
    except sqlite3.IntegrityError:
        return False
 # UGLY: correct method to add cursor
 def select_abbreviation_id(uri) -> int | None:
    QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
    CURS.execute(QUERY, [uri])
    abbreviation_id = CURS.fetchone()
    if not abbreviation_id:
        return None
    # in this case the real id is the first element of the tuple
    return abbreviation_id[0]
 # MARK: Parsing
 def parseMovies():
    """Insert the first column of every data row of movies.csv into Movies."""
    records = csv.reader(MOVIES_CSV_HANDLER)
    # the first line is the header
    next(records)
    for record in records:
        insertMovie(CURS, record[0])
 def parseWikiPageId():
    """Store the Wikipedia page id of every movie listed in movie-pageid.csv.

    Rows whose movie URI has no MovieID are reported and skipped.
    """
    for entry in csv.DictReader(PAGEID_CSV_HANDLER):
        movie_uri = entry["subject"]
        page_id = int(entry["object"])
        movie_id = selectMovieId(CURS, movie_uri)
        if movie_id is not None:
            insertWikiPageId(CURS, movie_id, page_id)
        else:
            print(f"The MovieUri: {movie_uri} has not a MovieId ")
 def parseAbstract():
    """Store the abstract of every page listed in wikipedia-summary.csv.

    Rows whose page id is not linked to a MovieID are reported and skipped.
    """
    for entry in csv.DictReader(SUMMARY_CSV_HANDLER):
        page_id = int(entry["subject"])
        abstract = entry["text"]
        movie_id = selectMovieIdFromWikiPageId(CURS, page_id)
        if movie_id is not None:
            insertWikiAbstract(CURS, movie_id, abstract)
        else:
            print(f"The WikiPageId: {page_id} has not a MovieId ")
 def parseAbbreviations():
    """Insert every (uri, abbreviation) pair of uri-abbreviations.csv."""
    for entry in csv.DictReader(URI_ABBR_CSV_HANDLER):
        prefix = entry["uri"]
        short_form = entry["abbreviation"]
        insert_abbreviation(prefix, short_form)
 def parseRDF_Reverse():
    """Load the triples of reverse.csv into the database.

    For every row the subject, relationship and object are inserted (inserts
    rejected with an integrity error are ignored), their ids are read back and
    one RDFs row is added for the movie whose URI equals the row's *object*.
    Rows for which any of the ids cannot be resolved are reported and skipped.
    The number of RDFs rows actually inserted is printed at the end.  Nothing
    is done when the 'reverse.csv' origin is not registered.
    """
    # The handle is shared with parseAbbr_Reverse: rewind it so this pass
    # sees every row even when the file was already read during this run.
    REVERSE_CSV_HANDLER.seek(0)
    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
    if REVERSE_ORIGIN_ID is None:
        return
    total = 0
    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        # Best-effort inserts: a False result is ignored, the ids are read
        # back right below either way.
        insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        # In reverse.csv the movie is the object of the triple.
        MOVIE_ID = selectMovieId(CURS, OBJECT)
        skip = False
        # guard: report every missing id before skipping the row
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True
        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True
        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True
        if MOVIE_ID is None:
            print(f"No MovieId for {OBJECT}")
            skip = True
        if skip:
            continue
        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
            total += 1
    print(total)
 def parseRDF_Dataset():
    """Load the triples of dataset.csv into the database.

    For every row the subject, relationship and object are inserted (inserts
    rejected with an integrity error are ignored), their ids are read back and
    one RDFs row is added for the movie whose URI equals the row's *subject*.
    Rows for which any of the ids cannot be resolved are reported and skipped.
    Progress is printed every 100000 rows and the number of RDFs rows actually
    inserted is printed at the end.  Nothing is done when the 'dataset.csv'
    origin is not registered.
    """
    # The handle is shared with parseAbbr_Dataset: rewind it so this pass
    # sees every row even when the file was already read during this run.
    DATASET_CSV_HANDLER.seek(0)
    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
    if DATASET_ORIGIN_ID is None:
        return
    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        rdf_idx += 1
        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        # Best-effort inserts: a False result is ignored, the ids are read
        # back right below either way.
        insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        # In dataset.csv the movie is the subject of the triple.
        MOVIE_ID = selectMovieId(CURS, SUBJECT)
        skip = False
        # guard: report every missing id before skipping the row
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True
        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True
        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True
        if MOVIE_ID is None:
            print(f"No MovieId for {SUBJECT}")
            skip = True
        if skip:
            continue
        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
            total += 1
    print(total)
 def _link_longest_abbreviation(sections, shortest, entity_id, insert_link) -> bool:
    """Link an entity to the abbreviation matching its longest URI prefix.

    Prefixes built from the first ``min(len(sections), 7)`` down to
    ``shortest`` "/"-separated sections are tried in turn (each followed by a
    ``%`` wildcard); the search stops at the first prefix for which an
    abbreviation exists.  URIs with four sections or fewer are not looked up.

    Returns:
        True only when a new link row was inserted by ``insert_link``.
    """
    if len(sections) <= 4:
        return False
    for size in range(min(len(sections), 7), shortest - 1, -1):
        abbreviation_id = select_abbreviation_id("/".join(sections[0:size]) + "%")
        if abbreviation_id is not None:
            return bool(insert_link(entity_id, abbreviation_id))
    return False


 def _link_row_abbreviations(row) -> int:
    """Link subject, relationship and object of one CSV row to abbreviations.

    Returns:
        The number of link rows inserted (0 to 3).  A row for which any of the
        three ids is unknown is reported and contributes 0.
    """
    SUBJECT = row["subject"]
    RELATIONSHIP = row["relationship"]
    OBJECT = row["object"]
    SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
    OBJECT_ID = selectObjectId(CURS, OBJECT)
    RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
    skip = False
    # guard: report every missing id before giving up on the row
    if SUBJECT_ID is None:
        print(f"No SubjectId for {SUBJECT}")
        skip = True
    if OBJECT_ID is None:
        print(f"No ObjectId for {OBJECT}")
        skip = True
    if RELATIONSHIP_ID is None:
        print(f"No RelationshipId for {RELATIONSHIP}")
        skip = True
    if skip:
        return 0
    linked = 0
    # Subjects and objects are matched down to 4 sections, relationships
    # down to 3.
    linked += int(_link_longest_abbreviation(
        SUBJECT.split("/"), 4, SUBJECT_ID, insert_subject_abbreviation))
    linked += int(_link_longest_abbreviation(
        RELATIONSHIP.split("/"), 3, RELATIONSHIP_ID, insert_relationship_abbreviation))
    linked += int(_link_longest_abbreviation(
        OBJECT.split("/"), 4, OBJECT_ID, insert_object_abbreviation))
    return linked


 def parseAbbr_Reverse():
    """Link every subject/relationship/object of reverse.csv to its abbreviation.

    Prints the number of link rows inserted.  Nothing is done when the
    'reverse.csv' origin is not registered.
    """
    # The handle is shared with parseRDF_Reverse: rewind it so this pass
    # sees every row even when the file was already read during this run.
    REVERSE_CSV_HANDLER.seek(0)
    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
    if REVERSE_ORIGIN_ID is None:
        return
    total = 0
    for row in REVERSE_CSV_READER:
        total += _link_row_abbreviations(row)
    print(total)


 def parseAbbr_Dataset():
    """Link every subject/relationship/object of dataset.csv to its abbreviation.

    Progress is printed every 100000 rows and the number of link rows
    inserted is printed at the end.  Nothing is done when the 'dataset.csv'
    origin is not registered.
    """
    # The handle is shared with parseRDF_Dataset: rewind it so this pass
    # sees every row even when the file was already read during this run.
    DATASET_CSV_HANDLER.seek(0)
    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
    if DATASET_ORIGIN_ID is None:
        return
    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:
        rdf_idx += 1
        if rdf_idx % 100000 == 0:
            SUBJECT = row["subject"]
            RELATIONSHIP = row["relationship"]
            OBJECT = row["object"]
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        total += _link_row_abbreviations(row)
    print(total)
 # MARK: Actual Code
 # Pipeline steps, listed in the order they depend on each other (each step
 # reads ids written by the previous ones).  Only the uncommented step runs;
 # presumably the steps are enabled one at a time by hand — confirm.
 # parseMovies()
 # parseWikiPageId()
 # parseAbstract()
 # insertOrigin(CURS)
 # parseAbbreviations()
 # parseRDF_Reverse()
 # parseRDF_Dataset()
 # parseAbbr_Reverse()
 try:
    parseAbbr_Dataset()
    # Commit only when the step completed without raising.
    CONN.commit()
 finally:
    # Release the connection and every CSV handle even if the step failed.
    CONN.close()
    MOVIES_CSV_HANDLER.close()
    PAGEID_CSV_HANDLER.close()
    SUMMARY_CSV_HANDLER.close()
    DATASET_CSV_HANDLER.close()
    REVERSE_CSV_HANDLER.close()
    URI_ABBR_CSV_HANDLER.close()
 """
 The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
 The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
 The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
 The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
 The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
 The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
 The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
 The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
 The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
 The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
 The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
 The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
 """
 """
 The WikiPageId: 10068850 has not a MovieId
 The WikiPageId: 55069615 has not a MovieId
 The WikiPageId: 49510056 has not a MovieId
 The WikiPageId: 4049786 has not a MovieId
 The WikiPageId: 55510238 has not a MovieId
 The WikiPageId: 31239628 has not a MovieId
 The WikiPageId: 34757217 has not a MovieId
 The WikiPageId: 64311757 has not a MovieId
 The WikiPageId: 8326198 has not a MovieId
 The WikiPageId: 42162164 has not a MovieId
 The WikiPageId: 18502369 has not a MovieId
 The WikiPageId: 58092358 has not a MovieId
 The WikiPageId: 40710250 has not a MovieId
 """
--- a/Scripts/Experiments/.gitkeep
+++ b/Scripts/Experiments/.gitkeep
--- a/Scripts/Experiments/Queries/.gitkeep
+++ b/Scripts/Experiments/Queries/.gitkeep
--- a/Scripts/Experiments/Tmp/.gitkeep
+++ b/Scripts/Experiments/Tmp/.gitkeep
--- a/Scripts/Libs/CleaningPipeline/.gitkeep
+++ b/Scripts/Libs/CleaningPipeline/.gitkeep
--- a/Scripts/Libs/Utils/.gitkeep
+++ b/Scripts/Libs/Utils/.gitkeep
--- a/Scripts/UML/CleaningPipeline/classes.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/classes.excalidraw.json
@ -0,0 +1,826 @@
 {
  "type": "excalidraw",
  "version": 2,
  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
  "elements": [
    {
      "type": "line",
      "version": 4622,
      "versionNonce": 1623045672,
      "isDeleted": false,
      "id": "twu_PiAvEuQ4l1YYtZLET",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 289.8504963515835,
      "y": 91.87474806402287,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 77.09201683999922,
      "height": 99.49948667804088,
      "seed": 1975340120,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "round",
      "boundElementIds": [],
      "startBinding": null,
      "endBinding": null,
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": null,
      "points": [
        [
          0,
          0
        ],
        [
          0.2542098813493443,
          75.20117273657175
        ],
        [
          0.011896425679918422,
          83.76249969444815
        ],
        [
          3.970409367559332,
          87.46174320643391
        ],
        [
          17.75573317066317,
          90.59250103325854
        ],
        [
          41.05683533152865,
          91.56737225214069
        ],
        [
          63.319497586673116,
          90.01084754868091
        ],
        [
          75.14781395923075,
          86.28844687220405
        ],
        [
          76.81603792670788,
          83.15042405259751
        ],
        [
          77.05033394391478,
          76.25776215104557
        ],
        [
          76.86643881413028,
          6.3089586511537865
        ],
        [
          76.45188016352971,
          -0.2999144698665015
        ],
        [
          71.50179495549581,
          -3.9936571317850627
        ],
        [
          61.077971898861186,
          -6.132877429442784
        ],
        [
          37.32348754161154,
          -7.932114425900202
        ],
        [
          18.278415656797975,
          -6.859225353587373
        ],
        [
          3.2995959613238286,
          -3.2201165291205287
        ],
        [
          -0.04168289608444441,
          -0.045185660461322996
        ],
        [
          0,
          0
        ]
      ],
      "index": "a1",
      "frameId": null,
      "roundness": {
        "type": 2
      },
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "line",
      "version": 2327,
      "versionNonce": 1593094440,
      "isDeleted": false,
      "id": "hmJk4dH9VpOsfkrCTkhvh",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 290.3744257898585,
      "y": 149.00103172175278,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 77.17198221193564,
      "height": 8.562348957853036,
      "seed": 637665624,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "round",
      "boundElementIds": [],
      "startBinding": null,
      "endBinding": null,
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": null,
      "points": [
        [
          0,
          0
        ],
        [
          2.033150371639873,
          3.413095389435587
        ],
        [
          10.801287372573954,
          6.276651055277943
        ],
        [
          22.468666942209353,
          8.010803051612635
        ],
        [
          40.747074201802775,
          8.168828515515864
        ],
        [
          62.077348233027564,
          7.0647721921469495
        ],
        [
          74.53446931782398,
          3.04824021069218
        ],
        [
          77.17198221193564,
          -0.3935204423371723
        ]
      ],
      "index": "a2",
      "frameId": null,
      "roundness": {
        "type": 2
      },
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "line",
      "version": 2413,
      "versionNonce": 311708712,
      "isDeleted": false,
      "id": "X1ldVIXm4DfBal5N2Pwn9",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 289.3425684673547,
      "y": 120.03697638652972,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 77.17198221193564,
      "height": 8.562348957853036,
      "seed": 904402520,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "round",
      "boundElementIds": [],
      "startBinding": null,
      "endBinding": null,
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": null,
      "points": [
        [
          0,
          0
        ],
        [
          2.033150371639873,
          3.413095389435587
        ],
        [
          10.801287372573954,
          6.276651055277943
        ],
        [
          22.468666942209353,
          8.010803051612635
        ],
        [
          40.747074201802775,
          8.168828515515864
        ],
        [
          62.077348233027564,
          7.0647721921469495
        ],
        [
          74.53446931782398,
          3.04824021069218
        ],
        [
          77.17198221193564,
          -0.3935204423371723
        ]
      ],
      "index": "a3",
      "frameId": null,
      "roundness": {
        "type": 2
      },
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 5410,
      "versionNonce": 92833576,
      "isDeleted": false,
      "id": "CFhp5ZxSVwHYzGUj4hEn1",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 288.28461948527263,
      "y": 84.74247943834126,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 76.59753601865496,
      "height": 15.49127539284798,
      "seed": 1782811480,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [
        "bxuMGTzXLn7H-uBCptINx"
      ],
      "index": "a4",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 820,
      "versionNonce": 608002600,
      "isDeleted": false,
      "id": "B43R7rWwK2_vdiRHBSSPk",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 324.77660659049513,
      "y": 109.21914711824485,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "width": 11.226103154161754,
      "height": 12.183758484455605,
      "seed": 1298686040,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "index": "a5",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 1108,
      "versionNonce": 1839127848,
      "isDeleted": false,
      "id": "CkKMb9wkJfVk04T217zSs",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 325.12774837442873,
      "y": 135.43576140530996,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "width": 11.226103154161754,
      "height": 12.183758484455605,
      "seed": 2133497176,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "index": "a6",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 991,
      "versionNonce": 588838952,
      "isDeleted": false,
      "id": "SHJdKeQPkfpvzSoNH--3o",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 6.239590202363168,
      "x": 325.77660659049513,
      "y": 164.20448797661635,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "width": 11.226103154161754,
      "height": 12.183758484455605,
      "seed": 81668696,
      "groupIds": [
        "9PT4BXPfQ6UoCaB-T-h9A",
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "index": "a7",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 489,
      "versionNonce": 2023207720,
      "isDeleted": false,
      "id": "vUSyMBPup0jZ71CYXKyGb",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": 280.1846389770508,
      "y": 185.79462957545917,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 95.63072204589844,
      "height": 23.595161071904883,
      "seed": 425140056,
      "groupIds": [
        "dp_TZJyYdyPIH1hOkAPlb"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "fontSize": 17.4778970902999,
      "fontFamily": 1,
      "text": "dataset.db",
      "baseline": 16.595161071904883,
      "textAlign": "center",
      "verticalAlign": "top",
      "index": "a8",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1758646548051,
      "link": null,
      "locked": false,
      "containerId": null,
      "originalText": "dataset.db",
      "autoResize": true,
      "lineHeight": 1.350000000000001
    },
    {
      "id": "R7pU0VP6CFKCAwuvt0xsr",
      "type": "text",
      "x": 295.5,
      "y": 342,
      "width": 374,
      "height": 225,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a9",
      "roundness": null,
      "seed": 705463336,
      "version": 1130,
      "versionNonce": 72522328,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758648226024,
      "link": null,
      "locked": false,
      "text": "class Extract(Action):\n    # Static\n    + type : ActionTypes = Extract\n    \n    # Properties\n    - db_connection: Path\n    - query: str\n    - query_parameters: [str]\n    - output_mapper: [str]",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Extract(Action):\n    # Static\n    + type : ActionTypes = Extract\n    \n    # Properties\n    - db_connection: Path\n    - query: str\n    - query_parameters: [str]\n    - output_mapper: [str]",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "G1xIRcJgm34_NMEWQFFlW",
      "type": "text",
      "x": 1419.5,
      "y": 110,
      "width": 253,
      "height": 75,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aA",
      "roundness": null,
      "seed": 651981400,
      "version": 256,
      "versionNonce": 138082856,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758646570344,
      "link": null,
      "locked": false,
      "text": "class Pipeline\n    - actions: [Action]\n    ",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Pipeline\n    - actions: [Action]\n    ",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "TBVy3JbJCkbA9kjVEJ8lv",
      "type": "text",
      "x": 694,
      "y": 100,
      "width": 495,
      "height": 150,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aB",
      "roundness": null,
      "seed": 680960040,
      "version": 560,
      "versionNonce": 85012520,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758649442239,
      "link": null,
      "locked": false,
      "text": "class Action\n    + type: ActionTypes\n    + name: str\n    + depends_on: [str]\n\n    + execute(mem) -> [Dict<str, any>] | Void",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Action\n    + type: ActionTypes\n    + name: str\n    + depends_on: [str]\n\n    + execute(mem) -> [Dict<str, any>] | Void",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "an7KRTzWpCytKNKgHftKC",
      "type": "text",
      "x": 1528.5,
      "y": 365.5,
      "width": 187,
      "height": 150,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aC",
      "roundness": null,
      "seed": 1974317656,
      "version": 306,
      "versionNonce": 1574962264,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758648154009,
      "link": null,
      "locked": false,
      "text": "enum ActionTypes:\n    + Extract\n    + Aggregate\n    + Filter\n    + Map\n    + Dump",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "enum ActionTypes:\n    + Extract\n    + Aggregate\n    + Filter\n    + Map\n    + Dump",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "2pQ5EULirrWs_QZPbClhh",
      "type": "text",
      "x": 785,
      "y": 332.5,
      "width": 418,
      "height": 375,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aH",
      "roundness": null,
      "seed": 1402251560,
      "version": 742,
      "versionNonce": 680432168,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758649532881,
      "link": null,
      "locked": false,
      "text": "class Aggregate(Action):\n    # Static\n    + type: ActionTypes = Aggregate\n\n    # Properties\n    - actionIDs: [str]\n    - associations: [Association]\n    - output_mapper: [str]\n\n    + execute(mem):\n        tables = mem.gather(actionIDs)\n\n        for join in association:\n            \n            ",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Aggregate(Action):\n    # Static\n    + type: ActionTypes = Aggregate\n\n    # Properties\n    - actionIDs: [str]\n    - associations: [Association]\n    - output_mapper: [str]\n\n    + execute(mem):\n        tables = mem.gather(actionIDs)\n\n        for join in association:\n            \n            ",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "O0fso8DJqFfwJEzmpUikM",
      "type": "text",
      "x": 1289,
      "y": 195,
      "width": 594,
      "height": 100,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aI",
      "roundness": null,
      "seed": 1582329944,
      "version": 459,
      "versionNonce": 1080077144,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758647067031,
      "link": null,
      "locked": false,
      "text": "input_mapper: \n    - key: ActionID (name) that produced such output\n    - value: list of strings that represent the values\n                to take",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "input_mapper: \n    - key: ActionID (name) that produced such output\n    - value: list of strings that represent the values\n                to take",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "v0kzO6vlBWOdJCV3yoG69",
      "type": "text",
      "x": 1379.5,
      "y": 718.5,
      "width": 286,
      "height": 175,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aL",
      "roundness": null,
      "seed": 1462407976,
      "version": 635,
      "versionNonce": 1012998696,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758649495598,
      "link": null,
      "locked": false,
      "text": "class Association:\n    - from_actionID: str\n    - from_key_name: str\n    - from_value_name: str\n    - to_actionID: str\n    - to_value_name: str\n    - type: Type",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Association:\n    - from_actionID: str\n    - from_key_name: str\n    - from_value_name: str\n    - to_actionID: str\n    - to_value_name: str\n    - type: Type",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "WK34n9xeVxntypCtrlK6p",
      "type": "text",
      "x": 256.5,
      "y": 787.5,
      "width": 517,
      "height": 175,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aM",
      "roundness": null,
      "seed": 1166526296,
      "version": 318,
      "versionNonce": 1042162520,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758649002604,
      "link": null,
      "locked": false,
      "text": "class Filter(Action):\n    # Static\n    + type: ActionTypes = Filter\n\n    # Properties\n    - compare: function(Dict<str, any>) -> bool\n    - output_mapper: [str]",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Filter(Action):\n    # Static\n    + type: ActionTypes = Filter\n\n    # Properties\n    - compare: function(Dict<str, any>) -> bool\n    - output_mapper: [str]",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "NY9jyUFLFFCNPE2sh00SX",
      "type": "text",
      "x": 1639,
      "y": 606.5,
      "width": 407,
      "height": 200,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aP",
      "roundness": null,
      "seed": 20345896,
      "version": 168,
      "versionNonce": 627282472,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758649426380,
      "link": null,
      "locked": false,
      "text": "class Map(Action):\n    # Static\n    + type: ActionTypes = Map\n\n    # Properties\n    - compare_mapper: [str]\n    - mapper: function(any...) -> any\n    - output_mapper: [str]",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Map(Action):\n    # Static\n    + type: ActionTypes = Map\n\n    # Properties\n    - compare_mapper: [str]\n    - mapper: function(any...) -> any\n    - output_mapper: [str]",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "SkhaoW-3TTKDZzEii3Lf6",
      "type": "text",
      "x": 1457.5,
      "y": 955.5,
      "width": 121,
      "height": 50,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "#228be6",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aQ",
      "roundness": null,
      "seed": 2071523672,
      "version": 37,
      "versionNonce": 105260376,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758648834435,
      "link": null,
      "locked": false,
      "text": "class Dump:\n    -",
      "fontSize": 20,
      "fontFamily": 8,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "class Dump:\n    -",
      "autoResize": true,
      "lineHeight": 1.25
    }
  ],
  "appState": {
    "gridSize": 20,
    "gridStep": 5,
    "gridModeEnabled": false,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
 }
--- a/docs/DBPEDIA.md
+++ b/docs/DBPEDIA.md
@ -0,0 +1,215 @@
 # DBPedia
 ## GraphIRI
 This is the graph identifier (URI):
 `http://dbpedia.org`
 ## History of queries
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  {
    SELECT ?object
    WHERE {
      ?m rdf:type dbo:Film .
      ?object ?r ?m
    }
  }
 }
 ```
 ### 2 Hops
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
  {
    SELECT ?object
    WHERE {
      ?m rdf:type dbo:Film .
      ?object ?r ?m
      FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
    }
  }
 }
 LIMIT 1000000
 ```
 ### 1 Hop
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  ?object rdf:type dbo:Film .
  FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
 }
 LIMIT 1000000
 ```
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  ?subject rdf:type dbo:Film .
 }
 ```
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  ?subject rdf:type dbo:Film .
  ?a foaf:primaryTopic ?subject
 }
 ```
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 SELECT ?subject
 WHERE {
  ?subject rdf:type dbo:Film .
 }
 ```
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT ?subject
 WHERE {
  ?subject rdf:type dbo:Film .
  ?a foaf:primaryTopic ?subject
 }
 ```
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  ?subject rdf:type dbo:Film .
  ?a foaf:primaryTopic ?subject
  FILTER (?relationship NOT IN (
    dbo:wikiPageRedirects,
    dbo:wikiPageExternalLink,
    dbo:wikiPageWikiLink,
    foaf:primaryTopic
  ))
 }
 ```
 #### Wikipedia-movie
 a.k.a. the file with the Wikipedia abstract
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT  ?subject , ?object
 WHERE {
  ?subject foaf:primaryTopic ?object .
  ?object rdf:type dbo:Film 
 }
 ```
 #### Reverse
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  ?object rdf:type dbo:Film .
  ?a foaf:primaryTopic ?object
  FILTER (?relationship NOT IN (
    dbo:wikiPageRedirects,
    dbo:wikiPageExternalLink,
    dbo:wikiPageWikiLink,
    foaf:primaryTopic
  ))
 }
 ```
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
  ?object rdf:type dbo:Film .
  ?a foaf:primaryTopic ?object
  FILTER (?relationship NOT IN (
    dbo:wikiPageRedirects,
    dbo:wikiPageExternalLink,
    dbo:wikiPageWikiLink,
    foaf:primaryTopic
   ))
 }
 ```
 #### Film \ wiki page ID
 ```SQL
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 SELECT ?subject ?pageID
 WHERE {
  ?subject rdf:type dbo:Film .
  ?subject dbo:wikiPageID ?pageID .
  ?subject rdfs:label ?label .
  FILTER (lang(?label) = "en")
 }
 ```
--- a/docs/DEVELOPMENT.md
+++ b/docs/DEVELOPMENT.md
@ -0,0 +1,3 @@
 # Development
 ## Data Gathering
--- a/docs/RESOURCES.md
+++ b/docs/RESOURCES.md
@ -0,0 +1,108 @@
 # Resources
 ## Byte-Pair Encoding (BPE)
 ### Overview
 Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
 Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and RoBERTa.
 ---
 ### Key Idea
 BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
 Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.
 ---
 ### Algorithm Steps
 1. **Initialization**
   - Treat each character of the input text as a token.
 2. **Find Frequent Pairs**
   - Count all adjacent token pairs in the sequence.
 3. **Merge Most Frequent Pair**
   - Replace the most frequent pair with a new symbol not used in the text.
 4. **Repeat**
   - Continue until no frequent pairs remain or a desired vocabulary size is reached.
 ---
 ### Example
 Suppose the data to be encoded is:
 ```text
 aaabdaaabac
 ```
 #### Step 1: Merge `"aa"`
 Most frequent pair: `"aa"` → replace with `"Z"`
 ```text
 ZabdZabac
 Z = aa
 ```
 ---
 #### Step 2: Merge `"ab"`
 Most frequent pair: `"ab"` → replace with `"Y"`
 ```text
 ZYdZYac
 Y = ab
 Z = aa
 ```
 ---
 #### Step 3: Merge `"ZY"`
 Most frequent pair: `"ZY"` → replace with `"X"`
 ```text
 XdXac
 X = ZY
 Y = ab
 Z = aa
 ```
 ---
 At this point, no pairs occur more than once, so the process stops.
 ---
 ### Decompression
 To recover the original data, replacements are applied in **reverse order**:
 ```text
 XdXac
 → ZYdZYac
 → ZabdZabac
 → aaabdaaabac
 ```
 ---
 ### Advantages
 - **Efficient vocabulary building**: reduces the need for massive word lists.
 - **Handles rare words**: breaks them into meaningful subword units.
 - **Balances character- and word-level tokenization**.
 ---
 ### Limitations
 - Does not consider linguistic meaning—merges are frequency-based.
 - May create tokens that are not linguistically natural.
 - Vocabulary is fixed after training.
--- a/docs/SPARQL.md
+++ b/docs/SPARQL.md
@ -0,0 +1,67 @@
 # SPARQL
 > [!NOTE]
 > Resources taken from [this website](https://sparql.dev/)
 ## SPARQL Queries
 ### SELECT
 ```SQL
 SELECT ?var1 ?var2 ...
 ```
 ### WHERE
 ```SQL
 WHERE {
    pattern1 .
    pattern2 .
    ...
 }
 ```
 ### FILTER
 It's used to restrict the results matched by a [`WHERE`](#where) clause
 ```SQL
 WHERE {
  ?person <http://example.com/hasCar> ?car .
  FILTER (?car = <http://example.com/Car1>)
 }
 ```
 ### OPTIONAL
 It's used to fetch additional content when it exists, without discarding results that lack it
 ```SQL
 SELECT ?person ?car
 WHERE {
  ?person <http://example.com/hasCar> ?car .
  OPTIONAL {
    ?car <http://example.com/hasColor> ?color .
  }
 }
 ```
 ### LIMIT
 Limits results
 ```SQL
 LIMIT 10 # Take only 10 results
 ```
 ## SPARQL functions
 ### COUNT
 ```SQL
 SELECT (COUNT(?person) AS ?count)
 WHERE {
  ?person <http://example.com/hasCar> ?car .
 }
 ```
--- a/environment.yaml
+++ b/environment.yaml
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,17 @@
 certifi==2025.8.3
 charset-normalizer==3.4.3
 idna==3.10
 numpy==2.3.3
 pandas==2.3.2
 pyparsing==3.2.4
 python-dateutil==2.9.0.post0
 pytz==2025.2
 rdflib==7.1.4
 requests==2.32.5
 setuptools==78.1.1
 six==1.17.0
 SPARQLWrapper==2.0.0
 tzdata==2025.2
 urllib3==2.5.0
 wheel==0.45.1
 Wikipedia-API==0.8.1
Author	SHA1	Message	Date
GassiGiuseppe	9440a562f2	Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl	2025-09-25 18:33:51 +02:00
Christian Risi	5eda131aac	Fixed creation query to be unique even with movieID in RDFs	2025-09-25 17:58:09 +02:00
GassiGiuseppe	57884eaf2e	CSV support added to path_splitter_tree Also resolved a minor bug to print also leaf nodes	2025-09-25 17:57:46 +02:00
Christian Risi	4548a683c2	Fixed DB	2025-09-25 17:57:45 +02:00
GassiGiuseppe	3eec49ffa5	WIP: added test file: clean_relationship.jupyter to create a first cleaning pipeline	2025-09-25 16:28:24 +02:00
Christian Risi	0bc7f4b227	Fixed Typos	2025-09-25 12:37:52 +02:00
Christian Risi	f28952b0a2	Added todo	2025-09-25 12:00:26 +02:00
Christian Risi	0b626a8e09	Modified query to take all data	2025-09-25 11:53:12 +02:00
Christian Risi	b254098532	Added views to count for subjects and objects	2025-09-25 11:40:44 +02:00
Christian Risi	ee88ffe4cf	Added View to filter over relationship counts	2025-09-25 11:32:03 +02:00
Christian Risi	70b4bd8645	Added Complex query	2025-09-25 11:31:34 +02:00
Christian Risi	6316d2bfc4	Added queries to take data from SQL for dataset	2025-09-25 11:27:19 +02:00
Christian Risi	87ca748f45	Updated DB to reflect new changes	2025-09-24 19:29:57 +02:00
Christian Risi	4315d70109	Merged abbreviation_datawarehouse into datawarehouse	2025-09-24 19:29:43 +02:00
Christian Risi	9a5d633b5e	Fixed Typos	2025-09-24 19:29:07 +02:00
Christian Risi	a6760cd52d	Updated SQL Queries to support parsing in DB	2025-09-24 19:28:55 +02:00
GassiGiuseppe	a7eb92227d	Moved all db queries file in their own folder	2025-09-24 16:44:55 +02:00
GassiGiuseppe	9f221e31cd	Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl	2025-09-24 16:32:52 +02:00
GassiGiuseppe	47197194d5	WIP abbrevietion_datawarehouse to creat an abbreviation system	2025-09-24 16:32:09 +02:00
Christian Risi	0cdbf6f624	Added query to retrieve a dirty dataset from SQLite DB	2025-09-24 16:15:47 +02:00
Christian Risi	3e30489f86	Updated Queries for DB	2025-09-24 14:44:53 +02:00
Christian Risi	8a22e453e4	Fixed csv	2025-09-24 14:44:25 +02:00
Christian Risi	7feb4eb857	Fixed URI generation	2025-09-24 14:44:07 +02:00
Christian Risi	70af19d356	Removed unused imports and added trailing slashes	2025-09-24 14:04:48 +02:00
Christian Risi	a4b44ab2ee	Fixed Typos	2025-09-24 14:04:27 +02:00
Christian Risi	74b6b609dd	Fixed typos	2025-09-24 13:59:19 +02:00
Christian Risi	59796c37cb	Added script to take dbpedia uris	2025-09-24 13:49:29 +02:00
Christian Risi	f696f5950b	Added uri-abbreviations	2025-09-24 13:48:53 +02:00
Christian Risi	605b496da7	Added barebone UML diagram for a Cleaning Pipeline	2025-09-23 19:49:01 +02:00
Christian Risi	7d693964dd	Added new directories to tree structure	2025-09-23 19:47:56 +02:00
Christian Risi	25f401b577	Fixed bug for parsing and added CLI functionalities	2025-09-23 17:58:08 +02:00
Christian Risi	14c5ade230	Added CLI functionalities	2025-09-23 17:57:38 +02:00
chris-admin	4c9c51f902	Added barebone to have a splitter	2025-09-23 15:34:53 +02:00
GassiGiuseppe	63c1a4a160	added little snippet to rebuild db from db_creation.sql	2025-09-22 17:52:23 +02:00
GassiGiuseppe	51114af853	DataRetrivial deleted since it does the same thing as datawarehouse.py	2025-09-22 17:51:35 +02:00
GassiGiuseppe	3a6dca0681	Infos about Dataset contruction from csv moved from python file to markdown	2025-09-22 17:39:44 +02:00
GassiGiuseppe	346098d2b7	Added query.sql , file with the query used to populate the Dataset	2025-09-22 17:21:32 +02:00
GassiGiuseppe	64f9b41378	Built datawarehouse.py which populate the dataset	2025-09-22 17:17:22 +02:00
GassiGiuseppe	ac1ed42c49	Folder DataCleaning renamed to DatasetMerging since it doesn't clean nothing and instead Build the dataset	2025-09-22 17:11:49 +02:00
GassiGiuseppe	edd01a2c83	Dataset updated, the new one is built with the new method ( 50 new rows found ... upon 13 milion )	2025-09-22 16:57:06 +02:00
GassiGiuseppe	5aa9e3fcf3	Added in DBPEDIA the query to get Film \ wiki page ID plus some editing	2025-09-22 15:42:57 +02:00
GassiGiuseppe	0970cabf92	reverse.csv grammar correction of the header it seemed to have missplaced the header also in the middle of the csv	2025-09-22 13:47:20 +02:00
GassiGiuseppe	a26d92750f	Update movie-pageid.csv : grammar correction of the header	2025-09-22 12:59:35 +02:00
GassiGiuseppe	34c4782232	Dataset.db update. it seems to be correct	2025-09-20 23:33:56 +02:00
GassiGiuseppe	c5439533e6	DataRetrivial update, without df	2025-09-20 23:32:08 +02:00
GassiGiuseppe	8819b8e87f	DataRetrivial populate the db from csv	2025-09-20 19:56:24 +02:00
Christian Risi	1076dc8aa6	Run /Scripts/DataCleaning/SQL_Queries/db_creation.sql	2025-09-20 16:39:16 +02:00
Christian Risi	3d15e03b09	Renamed file to fix spelling	2025-09-20 16:38:38 +02:00
Christian Risi	0ee2ec6fcd	Spelling corrections	2025-09-20 16:37:57 +02:00
Christian Risi	95cfa5486c	Added instructions to create databse schema	2025-09-20 16:30:08 +02:00
GassiGiuseppe	0d30e90ee0	Created file for the db DatawareHouse Also decided firsts schema models into DBMerger	2025-09-20 15:53:32 +02:00
GassiGiuseppe	faaba17a98	Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev	2025-09-20 14:34:25 +02:00
Christian Risi	854e5f1d98	Updated file to gather data from wikipedia	2025-09-20 14:32:30 +02:00
GassiGiuseppe	242d7f674f	wikipedia summary file uploaded Dataset composed of PageId and wikipedia Summary	2025-09-20 14:32:25 +02:00
Christian Risi	de8c2afceb	Added reconciliation	2025-09-19 22:22:09 +02:00
Christian Risi	f89dffff75	Created script to gather wikipedia abstracts	2025-09-19 19:01:38 +02:00
GassiGiuseppe	e39bad8348	Added Troubleshooting section to README where are corrected some potential issue with git and big files	2025-09-19 13:39:56 +02:00
GassiGiuseppe	7a1a221017	update of the database of movie-pageid which has subject has film uri and object wikipage id	2025-09-19 13:37:56 +02:00
Christian Risi	fafe6ae0f9	Modified tree structure with more TMP directories	2025-09-19 12:46:31 +02:00
Christian Risi	e32444df75	Updated fetchdata to be used in terminal Changes: - now you can use it as if it were a cli command Missing: - documentation	2025-09-19 12:35:15 +02:00
Christian Risi	b74b7ac4f0	Added new directories to make experiments and updated .gitignore Changes: - Added /Scripts/Experiments/Queries to keep track of important queries, once set - Added /Scripts/Experiments/Tmp to run quick experiments when still unsure while explorating datasets	2025-09-19 08:43:54 +02:00
Christian Risi	22134391d9	Added Scripts/Experiment directory This directory is to place files to make experiments	2025-09-19 08:41:46 +02:00
Christian Risi	82c9023849	Ignoring Scripts/Experiments files and always tracking .gitkeep files	2025-09-19 08:39:47 +02:00
Christian Risi	00b87e01ea	Moved fetchdata.py to reflect working tree old - ${Proj}/Scripts/fetchdata.py new - ${Proj}/Scripts/DataGathering/fetchdata.py	2025-09-19 08:37:04 +02:00
Christian Risi	ce3d4bf6c5	Renamed dir from Script to Scripts	2025-09-19 08:31:00 +02:00
GassiGiuseppe	c415b175a0	added reverse.csv with the reletion incoming to films	2025-09-18 20:26:51 +02:00
GassiGiuseppe	ec81ea7930	Added file to gather wikipedia abstract from url	2025-09-18 20:26:11 +02:00
GassiGiuseppe	4bb03f86b3	Added file to study the most frequent relationship into a csv triplet	2025-09-18 20:25:25 +02:00
GassiGiuseppe	e5f201f3db	DEVELOPMENT file makrdown created	2025-09-18 20:24:54 +02:00
GassiGiuseppe	1c715dc569	Typo correction in the markdown	2025-09-18 20:24:11 +02:00
GassiGiuseppe	6686b47328	Added SQL to obtain wikipedia url with movies	2025-09-18 20:23:10 +02:00
GassiGiuseppe	9a5a7d84fd	Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev	2025-09-18 19:20:26 +02:00
GassiGiuseppe	9678ece9c0	Requirements changed added Pandas and some other	2025-09-18 19:07:38 +02:00
Christian Risi	67bcd732b5	Updated movies	2025-09-18 18:36:52 +02:00
Christian Risi	1a4f900500	Updated git attributes	2025-09-18 18:36:42 +02:00
Christian Risi	ca8729b67c	Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev	2025-09-18 18:36:30 +02:00
GassiGiuseppe	9dbffc52ed	Added dataset of movies and their wikipedia's page link	2025-09-18 18:16:51 +02:00
Christian Risi	b7f504942a	Created Dataset	2025-09-18 17:24:08 +02:00
Christian Risi	7f0c5ce8d3	Updated File for fetching	2025-09-18 17:23:56 +02:00
Christian Risi	9838e287a4	Updated file	2025-09-18 12:03:09 +02:00
Christian Risi	ca6143ea3c	Updated Query histories	2025-09-18 11:46:32 +02:00
Christian Risi	16e7ab4d9f	Modified Datasets	2025-09-17 17:30:51 +02:00
Christian Risi	28723ab662	Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev	2025-09-17 17:06:16 +02:00
Christian Risi	3e59efcf33	Generated datasets	2025-09-17 17:06:14 +02:00
Christian Risi	7c04309cc1	Added script to fetch data from DBPedia	2025-09-17 17:05:27 +02:00
Christian Risi	db87295890	Added history of queries	2025-09-17 17:04:58 +02:00
GassiGiuseppe	61568200a8	README update with setup chapter where are scripted the command to manage conda and pip	2025-09-17 16:50:50 +02:00
Christian Risi	8df2736b97	Added environments	2025-09-17 16:16:58 +02:00
Christian Risi	eb5b7f629a	Conda env	2025-09-17 15:53:17 +02:00
Christian Risi	79232b391e	First SparQL query	2025-09-17 14:26:37 +02:00
Christian Risi	72eb937b47	Fixed Markdown violations	2025-09-17 12:51:14 +02:00
Christian Risi	cececa14ce	Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev	2025-09-17 12:48:34 +02:00
Christian Risi	2487d44abd	Added SparQL	2025-09-17 12:48:33 +02:00
GassiGiuseppe	553b86cac2	Resources file updated with Byte-Pair Encoding a technique we will use to tokenize the engress' words	2025-09-17 12:06:01 +02:00
Christian Risi	12bd781fd3	Added workspace recommendations	2025-09-17 11:38:23 +02:00
Christian Risi	463f4907b8	Added Resources documentation	2025-09-17 11:36:02 +02:00