Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl

Fixed creation query to be unique even with movieID in RDFs
CSV support added to path_splitter_tree
2025-09-25 18:33:51 +02:00 · 2025-09-25 17:58:09 +02:00 · 2025-09-25 17:57:46 +02:00 · 2025-09-25 17:57:45 +02:00 · 2025-09-25 16:28:24 +02:00 · 2025-09-25 12:37:52 +02:00
13 changed files with 1684 additions and 152 deletions
--- a/Assets/Dataset/1-hop/uri-abbreviations.csv
+++ b/Assets/Dataset/1-hop/uri-abbreviations.csv
--- a/Assets/Dataset/DatawareHouse/dataset.db
+++ b/Assets/Dataset/DatawareHouse/dataset.db
--- a/Scripts/DataBaseQueries/dataset.sql
+++ b/Scripts/DataBaseQueries/dataset.sql
@@ -0,0 +1,30 @@
+-- To pass to Pandas: every RDF row joined with the raw (non-abbreviated)
+-- URIs of its subject, relationship and object
+SELECT *
+FROM RDFs
+INNER JOIN Subjects USING (SubjectID)
+INNER JOIN Relationships USING (RelationshipID)
+INNER JOIN Objects USING (ObjectID);
+
+-- To pass to Pandas for abstracts: every RDF row paired with the abstract of
+-- its movie (rows whose movie has no abstract are dropped by the INNER JOIN)
+SELECT *
+FROM RDFs
+INNER JOIN WikipediaAbstracts USING (MovieID);
+
+-- To pass to Pandas for abbreviations: the whole URI / abbreviation table
+SELECT *
+FROM Abbreviations;
+
+-- Clean dataset: one row per movie. Every triple of the movie is rendered as
+-- <SOT><SUB>subject<REL>relationship<OBJ>object<EOT> (abbreviated URIs from the
+-- Parsed* views) and concatenated with an empty separator into RDF_String,
+-- next to the movie abstract.
+-- The *CountInRDFs views are not used in the SELECT list; presumably they are
+-- joined so that Sub_Count / Rel_Count / Obj_Count can be used in the WHERE.
+SELECT
+    MovieID,
+    GROUP_CONCAT(
+        '<SOT>' || '<SUB>' || SubjectURI
+            || '<REL>' || RelationshipURI
+            || '<OBJ>' || ObjectURI
+            || '<EOT>',
+        ''
+    ) AS RDF_String,
+    Abstract
+FROM RDFs
+    INNER JOIN SubjectsCountInRDFs USING (SubjectID)
+    INNER JOIN RelationshipsCountInRDFs USING (RelationshipID)
+    INNER JOIN ObjectsCountInRDFs USING (ObjectID)
+    INNER JOIN ParsedSubjects USING (SubjectID)
+    INNER JOIN ParsedRelationships USING (RelationshipID)
+    INNER JOIN ParsedObjects USING (ObjectID)
+    INNER JOIN WikipediaAbstracts USING (MovieID)
+    -- insert WHERE here
+    -- e.g. WHERE SubjectID = 134626
+GROUP BY MovieID;
--- a/Scripts/DataBaseQueries/db_creation.sql
+++ b/Scripts/DataBaseQueries/db_creation.sql
@@ -0,0 +1,174 @@
+-- Movies: one row per movie URI; MovieID is the surrogate key used everywhere else
+CREATE TABLE IF NOT EXISTS Movies (
+    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
+    MovieURI TEXT UNIQUE NOT NULL
+);
+
+
+-- WikiPageIDs: at most one page id per movie, and each page id belongs to a single movie
+CREATE TABLE IF NOT EXISTS WikiPageIDs (
+    MovieID INTEGER PRIMARY KEY,
+    PageID INTEGER UNIQUE NOT NULL,
+    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
+);
+
+
+-- WikipediaAbstracts: at most one abstract text per movie
+CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
+    MovieID INTEGER PRIMARY KEY,
+    Abstract TEXT NOT NULL,
+    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
+);
+
+
+-- Origins: named sources that Subjects.OriginID and Objects.OriginID refer to
+CREATE TABLE IF NOT EXISTS Origins (
+    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
+    OriginName TEXT UNIQUE NOT NULL
+);
+
+
+-- Subjects: distinct subject URIs, each attributed to the origin it came from.
+-- OriginID is declared INTEGER to match Origins.OriginID and the other keys
+-- (in SQLite BIGINT and INTEGER share the same INTEGER affinity).
+CREATE TABLE IF NOT EXISTS Subjects (
+    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
+    SubjectURI TEXT UNIQUE NOT NULL,
+    OriginID INTEGER NOT NULL,
+    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
+);
+
+
+-- Relationships: distinct relationship (predicate) URIs
+CREATE TABLE IF NOT EXISTS Relationships (
+    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
+    RelationshipURI TEXT UNIQUE NOT NULL
+);
+
+
+-- Objects: distinct object URIs, each attributed to the origin it came from.
+-- OriginID is declared INTEGER to match Origins.OriginID and the other keys.
+CREATE TABLE IF NOT EXISTS Objects (
+    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
+    ObjectURI TEXT UNIQUE NOT NULL,
+    OriginID INTEGER NOT NULL,
+    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
+);
+
+-- RDFs: one row per (movie, subject, relationship, object).
+-- MovieID is part of the UNIQUE constraint, so the same triple can be stored
+-- once for every movie it is attached to, but never twice for the same movie.
+CREATE TABLE IF NOT EXISTS RDFs (
+    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
+    MovieID INTEGER NOT NULL,
+    SubjectID INTEGER NOT NULL,
+    RelationshipID INTEGER NOT NULL,
+    ObjectID INTEGER NOT NULL,
+    UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
+    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
+    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
+    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
+    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
+);
+
+-- Single-column indexes on each foreign key of RDFs, for lookups and joins by one ID
+CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
+CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
+CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
+CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
+
+-- Abbreviations: maps a URI to its short abbreviation; both sides are unique
+CREATE TABLE IF NOT EXISTS Abbreviations (
+    AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
+    URI TEXT UNIQUE NOT NULL,
+    Abbreviation TEXT UNIQUE NOT NULL
+);
+
+-- Junction table: which abbreviation(s) apply to a subject.
+-- The composite primary key allows several abbreviations per subject.
+CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
+    SubjectID INTEGER NOT NULL,
+    AbbreviationID INTEGER NOT NULL,
+    PRIMARY KEY(SubjectID, AbbreviationID),
+    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
+    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
+);
+
+-- Junction table: which abbreviation(s) apply to a relationship
+CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
+    RelationshipID INTEGER NOT NULL,
+    AbbreviationID INTEGER NOT NULL,
+    PRIMARY KEY(RelationshipID, AbbreviationID),
+    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
+    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
+);
+
+-- Junction table: which abbreviation(s) apply to an object
+CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
+    ObjectID INTEGER NOT NULL,
+    AbbreviationID INTEGER NOT NULL,
+    PRIMARY KEY(ObjectID, AbbreviationID),
+    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
+    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
+);
+
+-- Only the AbbreviationID side needs an explicit index: the composite
+-- PRIMARY KEY of each junction table already provides an index whose leading
+-- column is the entity ID (SubjectID / RelationshipID / ObjectID), so a
+-- separate single-column index on that column would only add write and
+-- storage cost without helping any lookup.
+CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
+CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
+CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
+
+-- Views
+-- Subjects, with the abbreviated form of the URI when an abbreviation is linked:
+--   no abbreviation -> the original SubjectURI
+--   abbreviation    -> '<Abbreviation>:' followed by SubjectURI with the
+--                      abbreviation's URI removed (replace() removes every
+--                      occurrence of that URI, not only a leading one)
+-- NOTE(review): a subject linked to several abbreviations yields one row per
+-- link, since the junction table permits more than one.
+CREATE VIEW IF NOT EXISTS ParsedSubjects AS
+SELECT
+    SubjectID,
+    CASE
+        WHEN Abbreviation IS NULL THEN SubjectURI
+        ELSE Abbreviation || ':' || replace(SubjectURI, URI, '')
+    END AS SubjectURI
+FROM Subjects
+LEFT JOIN Subjects_Abbreviations USING (SubjectID)
+LEFT JOIN Abbreviations USING (AbbreviationID);
+
+-- Relationships, with the abbreviated form of the URI when an abbreviation is linked:
+--   no abbreviation -> the original RelationshipURI
+--   abbreviation    -> '<Abbreviation>:' followed by RelationshipURI with the
+--                      abbreviation's URI removed (replace() removes every
+--                      occurrence of that URI, not only a leading one)
+-- NOTE(review): a relationship linked to several abbreviations yields one row
+-- per link, since the junction table permits more than one.
+CREATE VIEW IF NOT EXISTS ParsedRelationships AS
+SELECT
+    RelationshipID,
+    CASE
+        WHEN Abbreviation IS NULL THEN RelationshipURI
+        ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '')
+    END AS RelationshipURI
+FROM Relationships
+LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
+LEFT JOIN Abbreviations USING (AbbreviationID);
+
+-- Objects, with the abbreviated form of the URI when an abbreviation is linked:
+--   no abbreviation -> the original ObjectURI
+--   abbreviation    -> '<Abbreviation>:' followed by ObjectURI with the
+--                      abbreviation's URI removed (replace() removes every
+--                      occurrence of that URI, not only a leading one)
+-- NOTE(review): an object linked to several abbreviations yields one row per
+-- link, since the junction table permits more than one.
+CREATE VIEW IF NOT EXISTS ParsedObjects AS
+SELECT
+    ObjectID,
+    CASE
+        WHEN Abbreviation IS NULL THEN ObjectURI
+        ELSE Abbreviation || ':' || replace(ObjectURI, URI, '')
+    END AS ObjectURI
+FROM Objects
+LEFT JOIN Objects_Abbreviations USING (ObjectID)
+LEFT JOIN Abbreviations USING (AbbreviationID);
+
+
+-- Subject Count: number of RDF rows in which each subject appears
+CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs AS
+SELECT
+    SubjectID,
+    COUNT(SubjectID) AS Sub_Count
+FROM RDFs
+GROUP BY SubjectID;
+
+
+
+
+-- Relationship Count: number of RDF rows in which each relationship appears
+CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs AS
+SELECT
+    RelationshipID,
+    COUNT(RelationshipID) AS Rel_Count
+FROM RDFs
+GROUP BY RelationshipID;
+
+
+-- Object Count: number of RDF rows in which each object appears
+CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs AS
+SELECT
+    ObjectID,
+    COUNT(ObjectID) AS Obj_Count
+FROM RDFs
+GROUP BY ObjectID;
+
+
+
+
+
+
+
--- a/Scripts/DatasetMerging/SQL_Queries/query.sql
+++ b/Scripts/DatasetMerging/SQL_Queries/query.sql
@@ -33,3 +33,23 @@ SELECT ObjectID FROM Objects WHERE ObjectURI = ?;


 INSERT INTO  RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
+
+-- Prefixes
+INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
+INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
+INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
+INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
+
+-- The bound parameter is matched with LIKE: make sure it is a URI
+--  that includes at least the domain and the first path segment
+SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
+
+-- Query to retrieve data: one row per movie, with every triple rendered as
+-- <SOT><SUB>subject<REL>relationship<OBJ>object<EOT> (abbreviated URIs from the
+-- Parsed* views), concatenated with an empty separator, plus the abstract
+SELECT
+    MovieID,
+    GROUP_CONCAT(
+        '<SOT>' || '<SUB>' || SubjectURI
+            || '<REL>' || RelationshipURI
+            || '<OBJ>' || ObjectURI
+            || '<EOT>',
+        ''
+    ) AS RDF_String,
+    Abstract
+FROM RDFs
+    INNER JOIN ParsedSubjects USING (SubjectID)
+    INNER JOIN ParsedRelationships USING (RelationshipID)
+    INNER JOIN ParsedObjects USING (ObjectID)
+    INNER JOIN WikipediaAbstracts USING (MovieID)
+    -- insert WHERE here
+GROUP BY MovieID;
--- a/Scripts/DataCleaning/clean_relationship.ipynb
+++ b/Scripts/DataCleaning/clean_relationship.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b9081b7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This notebook removes unwanted relationships from the pipeline according to different rules\n",
+    "import pandas as pd\n",
+    "import sqlite3\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
+    "\n",
+    "def get_RDF() -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    QUERY = \"SELECT * FROM RDFs \" \\\n",
+    "    \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
+    "    \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
+    "    \"INNER JOIN Objects USING (ObjectID);\"\n",
+    "    RDF = pd.read_sql_query(QUERY, CONN)\n",
+    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
+    "    RDF = RDF.dropna()\n",
+    "    \"\"\"\n",
+    "    Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
+    "    Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
+    "    Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
+    "    RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
+    "\n",
+    "    # replace '' values with NaN (the dropna calls below are commented out)\n",
+    "    Subjects = Subjects.replace('', np.nan)# .dropna()\n",
+    "    Relationships = Relationships.replace('', np.nan)# .dropna()\n",
+    "    Objects = Objects.replace('', np.nan)# .dropna()\n",
+    "\n",
+    "    # join RDF with its components\n",
+    "    RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
+    "    RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
+    "    RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
+    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
+    "    return RDF\n",
+    "\n",
+    "\n",
+    "#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
+    "\n",
+    "def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
+    "    return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
+    "\n",
+    "\n",
+    "\n",
+    "RDF = get_RDF()\n",
+    "# RDF = RDF.dropna()\n",
+    "# print(RDF)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "644690bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
+    "    counts = RDF[\"RelationshipURI\"].value_counts() \n",
+    "    RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
+    "    RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
+    "    # counts is a series as key: relationship, value: count\n",
+    "    # counts = counts[counts > count_treshold]\n",
+    "    # relationships = counts.index\n",
+    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
+    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
+    "    return RDF\n",
+    "\n",
+    "RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
+    "# print(new_RDF)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34525be6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                 SubjectURI  \\\n",
+      "0             http://dbpedia.org/resource/Nights_of_Cabiria   \n",
+      "1         http://dbpedia.org/resource/California_Science...   \n",
+      "2                 http://dbpedia.org/resource/China_Captain   \n",
+      "3         http://dbpedia.org/resource/Caravan_of_Courage...   \n",
+      "4                http://dbpedia.org/resource/WHIH_Newsfront   \n",
+      "...                                                     ...   \n",
+      "12725500   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
+      "12725501   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
+      "12725502  http://dbpedia.org/resource/I_Witnessed_Genoci...   \n",
+      "12725503  http://dbpedia.org/resource/I_Woke_Up_Early_th...   \n",
+      "12725504           http://dbpedia.org/resource/I_Won't_Play   \n",
+      "\n",
+      "                                       RelationshipURI  \\\n",
+      "0          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "1          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "2          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "3          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "4         http://www.w3.org/2000/01/rdf-schema#seeAlso   \n",
+      "...                                                ...   \n",
+      "12725500          http://dbpedia.org/ontology/producer   \n",
+      "12725501          http://dbpedia.org/ontology/producer   \n",
+      "12725502          http://dbpedia.org/ontology/producer   \n",
+      "12725503          http://dbpedia.org/ontology/producer   \n",
+      "12725504          http://dbpedia.org/ontology/producer   \n",
+      "\n",
+      "                                                  ObjectURI  MovieID  \\\n",
+      "0                       http://dbpedia.org/resource/Cabiria       26   \n",
+      "1         http://dbpedia.org/resource/California_Academy...      185   \n",
+      "2                 http://dbpedia.org/resource/Captain_China      614   \n",
+      "3         http://dbpedia.org/resource/Caravan_of_Courage...      740   \n",
+      "4         http://dbpedia.org/resource/Captain_America:_C...      594   \n",
+      "...                                                     ...      ...   \n",
+      "12725500           http://dbpedia.org/resource/Ava_DuVernay   145854   \n",
+      "12725501           http://dbpedia.org/resource/Molly_Mayeux   145854   \n",
+      "12725502        http://dbpedia.org/resource/Headlines_Today   145861   \n",
+      "12725503             http://dbpedia.org/resource/Billy_Zane   145862   \n",
+      "12725504    http://dbpedia.org/resource/Gordon_Hollingshead   145864   \n",
+      "\n",
+      "          RelationshipFreq  MovieFreq  \n",
+      "0                     2132        216  \n",
+      "1                     2132        264  \n",
+      "2                     2132         66  \n",
+      "3                     2132        131  \n",
+      "4                     1653        133  \n",
+      "...                    ...        ...  \n",
+      "12725500             80077         95  \n",
+      "12725501             80077         95  \n",
+      "12725502             80077         41  \n",
+      "12725503             80077         98  \n",
+      "12725504             80077         91  \n",
+      "\n",
+      "[12725505 rows x 6 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
+    "    counts = RDF[\"MovieID\"].value_counts() \n",
+    "    RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
+    "    RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
+    "    RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
+    "    # counts is a series as key: MovieID, value: count\n",
+    "    # counts = counts[counts > count_treshold]\n",
+    "    # relationships = counts.index\n",
+    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
+    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
+    "    return RDF\n",
+    "\n",
+    "RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
+    "print(RDF)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/Scripts/DataCleaning/dbpedia-uri.py
+++ b/Scripts/DataCleaning/dbpedia-uri.py
@@ -0,0 +1,77 @@
+import argparse
+import sys
+
+
+
+class ProgramArgs:
+    """Plain container for the command-line options of this script."""
+
+    def __init__(self, file: str, output: str, treshold: int):
+        """
+        Args:
+            file (str): path of the input file (value of --input-file)
+            output (str): path of the output file (value of --output-file)
+            treshold (int): value of --treshold; stored here but not passed to print_dbpedia
+        """
+        self.file = file
+        self.output = output
+        self.treshold = treshold
+
+def get_args(args: list[str]) -> ProgramArgs:
+    """Parse the command-line options into a ProgramArgs instance.
+
+    Args:
+        args (list[str]): argument vector to parse (the script passes sys.argv)
+
+    Returns:
+        ProgramArgs: the input file, output file and treshold that were parsed
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input-file", "-i", required=True, type=str)
+    parser.add_argument("--output-file", "-o", required=True, type=str)
+    parser.add_argument("--treshold", "-t", type=int, default=1)
+
+    # parse_known_args does not fail on entries it does not recognise;
+    # they are returned separately and discarded here
+    namespace, _unrecognised = parser.parse_known_args(args)
+
+    return ProgramArgs(
+        namespace.input_file,
+        namespace.output_file,
+        namespace.treshold,
+    )
+
+
+def print_dbpedia(file: str, out: str):
+    """Write one abbreviation line for every distinct dbpedia URI prefix found in `file`.
+
+    Each input row is split on "/"; the prefix is "<scheme>//<domain>/<first path part>".
+    Only prefixes whose domain contains the label "dbpedia" are kept, and each prefix
+    is written once, as:  "<prefix>/", "<sub>-db<t>"
+    where <sub> is the first domain label cut to 3 characters and <t> is the first
+    character of the first path part.
+
+    Args:
+        file (str): path of the input text file, one URI per row
+        out (str): path of the output file (overwritten)
+    """
+
+    DOMAIN_PART = "dbpedia"
+
+    already_parsed: set[str] = set()
+
+    # Context managers close both files even when a row raises midway
+    with open(file, "r", encoding="utf-8") as FILE, open(
+        out, mode="w", encoding="utf-8"
+    ) as OUT:
+
+        for row in FILE:
+
+            # Strip the line terminator first: otherwise a row that ends right
+            # after the first path part would carry "\n" into URI and TYPE
+            sections = row.strip().split("/")
+            sections = [item for item in sections if item != ""]
+
+            if len(sections) < 3:
+                continue
+
+            # sections[0] is the scheme ("http:"), sections[1] the domain,
+            # sections[2] the first path part
+            URI = "/".join(sections[1:3])
+            URI = "//".join([sections[0], URI])
+
+            if URI in already_parsed:
+                continue
+
+            DOMAIN = sections[1]
+            SUBDOMAINS = DOMAIN.split(".")
+            TYPE = sections[2][0]
+
+            if DOMAIN_PART not in SUBDOMAINS:
+                continue
+
+            already_parsed.add(URI)
+
+            # Slicing is a no-op for labels of 3 characters or fewer
+            SUB_ID = SUBDOMAINS[0][:3]
+
+            # NOTE(review): the space after the comma means a default csv reader
+            # keeps the quotes in the second field - confirm how the output is consumed
+            OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")
+
+
+if __name__ == "__main__":
+    ARGS = get_args(sys.argv)
+    # ARGS = get_debug_args()
+    print_dbpedia(ARGS.file, ARGS.output)
--- a/Scripts/DataCleaning/path_splitter_tree.py
+++ b/Scripts/DataCleaning/path_splitter_tree.py
@@ -6,8 +6,16 @@ from typing import Self

 class ProgramArgs:

-    def __init__(self, file: str, output: str, treshold: int):
+    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
+        """
+        Args:
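+            file (str): Path of the input CSV file
+            csv_uri_header (str): The name of the column of the csv file from which the program will get the URIs
+            output (str): Path of the output file
+            treshold (int): Minimum quantity a node must have to be written to the output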
+        """        
        self.file = file
+        self.csv_uri_header = csv_uri_header
        self.output = output
        self.treshold = treshold

@@ -33,11 +41,15 @@ class Node:
        KEY = child[0]

        if not self.children.get(KEY):
+            # if the key has no value, it means we are traversing this branch for the first time
+            # create another node for the key
            self.children[KEY] = Node(KEY, 0)

+        # take the node for the key
        CHILD = self.children[KEY]
        self.quantity += 1

+        # if the child list to enter has only one element, which is KEY, no more node will be created
        if len(child) == 1:
            return

@@ -53,27 +65,32 @@ def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
+    PARSER.add_argument("--header-name", "-c", required=True, type=str)                       # c stands for column
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--treshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)

    # print(parsed_args.input_file)

-    return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold)  # type ignore
+    return ProgramArgs(parsed_args.input_file, parsed_args.header_name ,parsed_args.output_file, parsed_args.treshold)  # type ignore


 def get_debug_args() -> ProgramArgs:
-
-    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
+    # -i ./Assets/Dataset/1-hop/movies.csv  -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
+    FILE = "./Assets/Dataset/1-hop/movies.csv"
+    CSV_HEADER = "subject"
+    OUTPUT = "./Assets/Dataset/Tmp/prova.csv"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
+        CSV_HEADER,
+        OUTPUT,
        TRESHOLD
    )


-def tree_like(file: str, out: str):
+def tree_like(file: str, csv_uri_header:str, out: str):

    INDENTATION = "    "

@@ -84,9 +101,12 @@ def tree_like(file: str, out: str):

    FILE = open(file, "r", encoding="utf-8")

-    for row in FILE:
+    # Each row is read as a CSV record; the URI is taken from the
+    # column whose header name is given by csv_uri_header
+    for row in csv.DictReader(FILE):

-        sections = row.split("/")
+        uri_element = row[csv_uri_header]
+        sections = uri_element.split("/")
        sections = list(filter(lambda item: item != "", sections))

        # print(sections)
@@ -115,7 +135,9 @@ def tree_like(file: str, out: str):

        INDENT: str = INDENTATION * DEPTH

-        if NODE.quantity < ARGS.treshold:
+        # Leaf nodes have quantity 0, so the threshold has to be 0 for them to appear
+        # if NODE.quantity < ARGS.treshold:
+        if ARGS.treshold > NODE.quantity:
            continue

        OUT.write(f"{INDENT}- {NODE}\n")
@@ -133,7 +155,8 @@ def tree_like(file: str, out: str):
    OUT.close()


+
 if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
-    tree_like(ARGS.file, ARGS.output)
+    tree_like(ARGS.file,ARGS.csv_uri_header, ARGS.output)
--- a/Scripts/DatasetMerging/SQL_Queries/db_creation.sql
+++ b/Scripts/DatasetMerging/SQL_Queries/db_creation.sql
@@ -1,65 +0,0 @@
-CREATE TABLE IF NOT EXISTS Movies (
-    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
-    MovieURI TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS WikiPageIDs (
-    MovieID INTEGER PRIMARY KEY,
-    PageID INTEGER UNIQUE NOT NULL,
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
-);
-
-
-CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
-    MovieID INTEGER PRIMARY KEY,
-    Abstract TEXT NOT NULL,
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
-);
-
-
-CREATE TABLE IF NOT EXISTS Origins (
-    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
-    OriginName TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS Subjects (
-    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
-    SubjectURI TEXT UNIQUE NOT NULL,
-    OriginID BIGINT NOT NULL,
-    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
-);
-
-
-CREATE TABLE IF NOT EXISTS Relationships (
-    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
-    RelationshipURI TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS Objects (
-    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
-    ObjectURI TEXT UNIQUE NOT NULL,
-    OriginID BIGINT NOT NULL,
-    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
-);
-
-CREATE TABLE IF NOT EXISTS RDFs (
-    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
-    MovieID INTEGER NOT NULL,
-    SubjectID INTEGER NOT NULL,
-    RelationshipID INTEGER NOT NULL,
-    ObjectID INTEGER NOT NULL,
-    UNIQUE(SubjectID, RelationshipID, ObjectID),
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
-    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
-    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
-    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
-);
-
-CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
-CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
-CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
-CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
-
--- a/Scripts/DatasetMerging/datawarehouse.py
+++ b/Scripts/DatasetMerging/datawarehouse.py
@@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
 SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
 DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
 REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
+URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
+
+MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
+PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
+SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
+DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
+REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
+URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")

-MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
-PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
-SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
-DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
-REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")

 CONN = sqlite3.connect(DB_NAME)
 CURS = CONN.cursor()
@@ -30,6 +33,7 @@ CURS = CONN.cursor()
 # MARK: SQL Definitions
 # Insert MovieURI

+
 def insertOrigin(curs: sqlite3.Cursor) -> bool:

    QUERY = "INSERT INTO  Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
@@ -39,6 +43,7 @@ def insertOrigin(curs : sqlite3.Cursor ) -> bool:
    except sqlite3.IntegrityError:
        return False

+
 def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:

    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
@@ -51,6 +56,7 @@ def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
    # in this case the real id is the first element of the tuple
    return originId[0]

+
 def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:

    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
@@ -82,6 +88,7 @@ def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    except sqlite3.IntegrityError:
        return False

+
 def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:

    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
@@ -94,6 +101,7 @@ def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
    # in this case the real id is the first element of the tuple
    return movieId[0]

+
 def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    QUERY = "INSERT INTO  WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
@@ -102,6 +110,7 @@ def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> boo
    except sqlite3.IntegrityError:
        return False

+
 def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO  Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
@@ -110,6 +119,7 @@ def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    except sqlite3.IntegrityError:
        return False

+
 def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    QUERY = "INSERT INTO  Relationships (RelationshipURI) VALUES (?);"
    try:
@@ -118,6 +128,7 @@ def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    except sqlite3.IntegrityError:
        return False

+
 def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO  objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
@@ -126,6 +137,7 @@ def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    except sqlite3.IntegrityError:
        return False

+
 def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:

    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
@@ -138,6 +150,7 @@ def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
    # in this case the real id is the first element of the tuple
    return subjectId[0]

+
 def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:

    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
@@ -150,6 +163,7 @@ def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | No
    # in this case the real id is the first element of the tuple
    return relationshipId[0]

+
 def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:

    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
@@ -162,12 +176,13 @@ def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
    # in this case the real id is the first element of the tuple
    return objectId[0]

+
 def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
-    objectId: int 
+    objectId: int,
 ) -> bool:
    QUERY = "INSERT INTO  RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
@@ -176,6 +191,56 @@ def insertRDF(
    except sqlite3.IntegrityError:
        return False

+def insert_abbreviation(uri, abbreviation, curs: sqlite3.Cursor | None = None) -> bool:
+    """Insert a (URI, Abbreviation) pair into Abbreviations.
+
+    Args:
+        uri: value for the URI column
+        abbreviation: value for the Abbreviation column
+        curs: cursor to run the statement on; defaults to the module-level CURS
+            so existing two-argument calls keep working
+
+    Returns:
+        bool: True when the row was inserted, False when sqlite3 raised an
+        IntegrityError (e.g. a UNIQUE violation)
+    """
+    QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [uri, abbreviation])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def insert_object_abbreviation(object_id, abbreviation_id, curs: sqlite3.Cursor | None = None) -> bool:
+    """Link an object to an abbreviation in Objects_Abbreviations.
+
+    Args:
+        object_id: value for the ObjectID column
+        abbreviation_id: value for the AbbreviationID column
+        curs: cursor to run the statement on; defaults to the module-level CURS
+            so existing two-argument calls keep working
+
+    Returns:
+        bool: True when the row was inserted, False when sqlite3 raised an
+        IntegrityError (e.g. the pair is already present)
+    """
+    QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [object_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def insert_relationship_abbreviation(relationship_id, abbreviation_id, curs: sqlite3.Cursor | None = None) -> bool:
+    """Link a relationship to an abbreviation in Relationships_Abbreviations.
+
+    Args:
+        relationship_id: value for the RelationshipID column
+        abbreviation_id: value for the AbbreviationID column
+        curs: cursor to run the statement on; defaults to the module-level CURS
+            so existing two-argument calls keep working
+
+    Returns:
+        bool: True when the row was inserted, False when sqlite3 raised an
+        IntegrityError (e.g. the pair is already present)
+    """
+    QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [relationship_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def insert_subject_abbreviation(subject_id, abbreviation_id, curs: sqlite3.Cursor | None = None) -> bool:
+    """Link a subject to an abbreviation in Subjects_Abbreviations.
+
+    Args:
+        subject_id: value for the SubjectID column
+        abbreviation_id: value for the AbbreviationID column
+        curs: cursor to run the statement on; defaults to the module-level CURS
+            so existing two-argument calls keep working
+
+    Returns:
+        bool: True when the row was inserted, False when sqlite3 raised an
+        IntegrityError (e.g. the pair is already present)
+    """
+    QUERY = (
+        "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
+    )
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [subject_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def select_abbreviation_id(uri, curs: sqlite3.Cursor | None = None) -> int | None:
+    """Return the AbbreviationID of the first Abbreviations row whose URI matches `uri`.
+
+    The comparison uses LIKE, so "%" and "_" inside `uri` act as wildcards.
+
+    Args:
+        uri: LIKE pattern compared against the URI column
+        curs: cursor to run the statement on; defaults to the module-level CURS
+            so existing one-argument calls keep working
+
+    Returns:
+        int | None: the matching AbbreviationID, or None when no row matches
+    """
+    QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
+    cursor = CURS if curs is None else curs
+    cursor.execute(QUERY, [uri])
+    abbreviation_id = cursor.fetchone()
+    if not abbreviation_id:
+        return None
+
+    # fetchone returns a tuple; the id is its first element
+    return abbreviation_id[0]
+
+
 # MARK: Parsing
 def parseMovies():

@@ -208,7 +273,6 @@ def parseAbstract():
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)

-
        if MOVIE_ID is None:
            print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
            continue
@@ -216,10 +280,24 @@ def parseAbstract():
        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)


+def parseAbbreviations():
+    """Load every (uri, abbreviation) row of the abbreviations CSV into the database.
+
+    The result of insert_abbreviation is ignored, so rows it rejects are skipped silently.
+    """
+    reader = csv.DictReader(URI_ABBR_CSV_HANDLER)
+    for record in reader:
+        insert_abbreviation(record["uri"], record["abbreviation"])
+
+
 def parseRDF_Reverse():

    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
-    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
+    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
+
+    if REVERSE_ORIGIN_ID is None:
+        return
+
    total = 0

    for row in REVERSE_CSV_READER:
@@ -236,7 +314,6 @@ def parseRDF_Reverse():
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, OBJECT)

-
        skip = False

        # guard
@@ -259,17 +336,19 @@ def parseRDF_Reverse():
        if skip:
            continue

-        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
+        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
            total += 1

    print(total)


-
 def parseRDF_Dataset():

    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
-    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
+    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
+
+    if DATASET_ORIGIN_ID is None:
+        return

    total = 0
    rdf_idx = 0
@@ -293,7 +372,6 @@ def parseRDF_Dataset():
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, SUBJECT)

-
        skip = False

        # guard
@@ -316,31 +394,211 @@ def parseRDF_Dataset():
        if skip:
            continue

-        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
+        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
            total += 1

    print(total)


+def parseAbbr_Reverse():
+
+    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
+    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
+
+    if REVERSE_ORIGIN_ID is None:
+        return
+
+    total = 0
+
+    for row in REVERSE_CSV_READER:
+        SUBJECT = row["subject"]
+        RELATIONSHIP = row["relationship"]
+        OBJECT = row["object"]
+
+        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
+        OBJECT_ID = selectObjectId(CURS, OBJECT)
+        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
+
+        SUB_SECTIONS = SUBJECT.split("/")
+        REL_SECTIONS = RELATIONSHIP.split("/")
+        OBJ_SECTIONS = OBJECT.split("/")
+
+        SUB_ABBR_ID = None
+        REL_ABBR_ID = None
+        OBJ_ABBR_ID = None
+
+        skip = False
+
+        # guard
+        if SUBJECT_ID is None:
+            print(f"No SubjectId for {SUBJECT}")
+            skip = True
+
+        if OBJECT_ID is None:
+            print(f"No ObjectId for {OBJECT}")
+            skip = True
+
+        if RELATIONSHIP_ID is None:
+            print(f"No RelationshipId for {RELATIONSHIP}")
+            skip = True
+
+
+        if skip:
+            continue
+
+        if len(SUB_SECTIONS) > 4:
+            index = min(len(SUB_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
+                SUB_ABBR_ID = select_abbreviation_id(PATH)
+
+                if SUB_ABBR_ID is not None:
+                    if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(REL_SECTIONS) > 4:
+            index = min(len(REL_SECTIONS), 7)
+            while index > 2:
+                PATH = "/".join(REL_SECTIONS[0:index]) + "%"
+                REL_ABBR_ID = select_abbreviation_id(PATH)
+
+
+                if REL_ABBR_ID is not None:
+                    if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(OBJ_SECTIONS) > 4:
+            index = min(len(OBJ_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
+                OBJ_ABBR_ID = select_abbreviation_id(PATH)
+
+                if OBJ_ABBR_ID is not None:
+                    if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+    print(total)
+
+
+def parseAbbr_Dataset():
+
+    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
+    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
+
+    if DATASET_ORIGIN_ID is None:
+        return
+
+    total = 0
+    rdf_idx = 0
+    for row in DATASET_CSV_READER:
+        SUBJECT = row["subject"]
+        RELATIONSHIP = row["relationship"]
+        OBJECT = row["object"]
+
+        rdf_idx += 1
+
+        if rdf_idx % 100000 == 0:
+            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
+
+        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
+        OBJECT_ID = selectObjectId(CURS, OBJECT)
+        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
+
+        SUB_SECTIONS = SUBJECT.split("/")
+        REL_SECTIONS = RELATIONSHIP.split("/")
+        OBJ_SECTIONS = OBJECT.split("/")
+
+        SUB_ABBR_ID = None
+        REL_ABBR_ID = None
+        OBJ_ABBR_ID = None
+
+        skip = False
+
+        # guard
+        if SUBJECT_ID is None:
+            print(f"No SubjectId for {SUBJECT}")
+            skip = True
+
+        if OBJECT_ID is None:
+            print(f"No ObjectId for {OBJECT}")
+            skip = True
+
+        if RELATIONSHIP_ID is None:
+            print(f"No RelationshipId for {RELATIONSHIP}")
+            skip = True
+
+
+        if skip:
+            continue
+
+        if len(SUB_SECTIONS) > 4:
+            index = min(len(SUB_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
+                SUB_ABBR_ID = select_abbreviation_id(PATH)
+
+                if SUB_ABBR_ID is not None:
+                    if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(REL_SECTIONS) > 4:
+            index = min(len(REL_SECTIONS), 7)
+            while index > 2:
+                PATH = "/".join(REL_SECTIONS[0:index]) + "%"
+                REL_ABBR_ID = select_abbreviation_id(PATH)
+
+
+                if REL_ABBR_ID is not None:
+                    if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+        if len(OBJ_SECTIONS) > 4:
+            index = min(len(OBJ_SECTIONS), 7)
+            while index > 3:
+                PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
+                OBJ_ABBR_ID = select_abbreviation_id(PATH)
+
+                if OBJ_ABBR_ID is not None:
+                    if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
+                        total += 1
+                    index = 0
+                index -= 1
+
+    print(total)
+
+
 # MARK: Actual Code
 # parseMovies()
 # parseWikiPageId()
 # parseAbstract()
 # insertOrigin(CURS)
+# parseAbbreviations()
 # parseRDF_Reverse()
 # parseRDF_Dataset()
+# parseAbbr_Reverse()
+parseAbbr_Dataset()


 CONN.commit()
 CONN.close()


-
 MOVIES_CSV_HANDLER.close()
 PAGEID_CSV_HANDLER.close()
 SUMMARY_CSV_HANDLER.close()
 DATASET_CSV_HANDLER.close()
 REVERSE_CSV_HANDLER.close()
+URI_ABBR_CSV_HANDLER.close()


 """
--- a/Scripts/Libs/CleaningPipeline/.gitkeep
+++ b/Scripts/Libs/CleaningPipeline/.gitkeep
--- a/Scripts/Libs/Utils/.gitkeep
+++ b/Scripts/Libs/Utils/.gitkeep
--- a/Scripts/UML/CleaningPipeline/classes.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/classes.excalidraw.json
@@ -0,0 +1,826 @@
+{
+  "type": "excalidraw",
+  "version": 2,
+  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
+  "elements": [
+    {
+      "type": "line",
+      "version": 4622,
+      "versionNonce": 1623045672,
+      "isDeleted": false,
+      "id": "twu_PiAvEuQ4l1YYtZLET",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 289.8504963515835,
+      "y": 91.87474806402287,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.09201683999922,
+      "height": 99.49948667804088,
+      "seed": 1975340120,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0.2542098813493443,
+          75.20117273657175
+        ],
+        [
+          0.011896425679918422,
+          83.76249969444815
+        ],
+        [
+          3.970409367559332,
+          87.46174320643391
+        ],
+        [
+          17.75573317066317,
+          90.59250103325854
+        ],
+        [
+          41.05683533152865,
+          91.56737225214069
+        ],
+        [
+          63.319497586673116,
+          90.01084754868091
+        ],
+        [
+          75.14781395923075,
+          86.28844687220405
+        ],
+        [
+          76.81603792670788,
+          83.15042405259751
+        ],
+        [
+          77.05033394391478,
+          76.25776215104557
+        ],
+        [
+          76.86643881413028,
+          6.3089586511537865
+        ],
+        [
+          76.45188016352971,
+          -0.2999144698665015
+        ],
+        [
+          71.50179495549581,
+          -3.9936571317850627
+        ],
+        [
+          61.077971898861186,
+          -6.132877429442784
+        ],
+        [
+          37.32348754161154,
+          -7.932114425900202
+        ],
+        [
+          18.278415656797975,
+          -6.859225353587373
+        ],
+        [
+          3.2995959613238286,
+          -3.2201165291205287
+        ],
+        [
+          -0.04168289608444441,
+          -0.045185660461322996
+        ],
+        [
+          0,
+          0
+        ]
+      ],
+      "index": "a1",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2327,
+      "versionNonce": 1593094440,
+      "isDeleted": false,
+      "id": "hmJk4dH9VpOsfkrCTkhvh",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 290.3744257898585,
+      "y": 149.00103172175278,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 637665624,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a2",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2413,
+      "versionNonce": 311708712,
+      "isDeleted": false,
+      "id": "X1ldVIXm4DfBal5N2Pwn9",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 289.3425684673547,
+      "y": 120.03697638652972,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 904402520,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a3",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 5410,
+      "versionNonce": 92833576,
+      "isDeleted": false,
+      "id": "CFhp5ZxSVwHYzGUj4hEn1",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 288.28461948527263,
+      "y": 84.74247943834126,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 76.59753601865496,
+      "height": 15.49127539284798,
+      "seed": 1782811480,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [
+        "bxuMGTzXLn7H-uBCptINx"
+      ],
+      "index": "a4",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 820,
+      "versionNonce": 608002600,
+      "isDeleted": false,
+      "id": "B43R7rWwK2_vdiRHBSSPk",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 324.77660659049513,
+      "y": 109.21914711824485,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1298686040,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "a5",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1108,
+      "versionNonce": 1839127848,
+      "isDeleted": false,
+      "id": "CkKMb9wkJfVk04T217zSs",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 325.12774837442873,
+      "y": 135.43576140530996,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 2133497176,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "a6",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 991,
+      "versionNonce": 588838952,
+      "isDeleted": false,
+      "id": "SHJdKeQPkfpvzSoNH--3o",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 6.239590202363168,
+      "x": 325.77660659049513,
+      "y": 164.20448797661635,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 81668696,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "a7",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "text",
+      "version": 489,
+      "versionNonce": 2023207720,
+      "isDeleted": false,
+      "id": "vUSyMBPup0jZ71CYXKyGb",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 280.1846389770508,
+      "y": 185.79462957545917,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 95.63072204589844,
+      "height": 23.595161071904883,
+      "seed": 425140056,
+      "groupIds": [
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "fontSize": 17.4778970902999,
+      "fontFamily": 1,
+      "text": "dataset.db",
+      "baseline": 16.595161071904883,
+      "textAlign": "center",
+      "verticalAlign": "top",
+      "index": "a8",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false,
+      "containerId": null,
+      "originalText": "dataset.db",
+      "autoResize": true,
+      "lineHeight": 1.350000000000001
+    },
+    {
+      "id": "R7pU0VP6CFKCAwuvt0xsr",
+      "type": "text",
+      "x": 295.5,
+      "y": 342,
+      "width": 374,
+      "height": 225,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a9",
+      "roundness": null,
+      "seed": 705463336,
+      "version": 1130,
+      "versionNonce": 72522328,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758648226024,
+      "link": null,
+      "locked": false,
+      "text": "class Extract(Action):\n    # Static\n    + type : ActionTypes = Extract\n    \n    # Properties\n    - db_connection: Path\n    - query: str\n    - query_parameters: [str]\n    - output_mapper: [str]",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Extract(Action):\n    # Static\n    + type : ActionTypes = Extract\n    \n    # Properties\n    - db_connection: Path\n    - query: str\n    - query_parameters: [str]\n    - output_mapper: [str]",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "G1xIRcJgm34_NMEWQFFlW",
+      "type": "text",
+      "x": 1419.5,
+      "y": 110,
+      "width": 253,
+      "height": 75,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aA",
+      "roundness": null,
+      "seed": 651981400,
+      "version": 256,
+      "versionNonce": 138082856,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758646570344,
+      "link": null,
+      "locked": false,
+      "text": "class Pipeline\n    - actions: [Action]\n    ",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Pipeline\n    - actions: [Action]\n    ",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "TBVy3JbJCkbA9kjVEJ8lv",
+      "type": "text",
+      "x": 694,
+      "y": 100,
+      "width": 495,
+      "height": 150,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aB",
+      "roundness": null,
+      "seed": 680960040,
+      "version": 560,
+      "versionNonce": 85012520,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649442239,
+      "link": null,
+      "locked": false,
+      "text": "class Action\n    + type: ActionTypes\n    + name: str\n    + depends_on: [str]\n\n    + execute(mem) -> [Dict<str, any>] | Void",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Action\n    + type: ActionTypes\n    + name: str\n    + depends_on: [str]\n\n    + execute(mem) -> [Dict<str, any>] | Void",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "an7KRTzWpCytKNKgHftKC",
+      "type": "text",
+      "x": 1528.5,
+      "y": 365.5,
+      "width": 187,
+      "height": 150,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aC",
+      "roundness": null,
+      "seed": 1974317656,
+      "version": 306,
+      "versionNonce": 1574962264,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758648154009,
+      "link": null,
+      "locked": false,
+      "text": "enum ActionTypes:\n    + Extract\n    + Aggregate\n    + Filter\n    + Map\n    + Dump",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "enum ActionTypes:\n    + Extract\n    + Aggregate\n    + Filter\n    + Map\n    + Dump",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "2pQ5EULirrWs_QZPbClhh",
+      "type": "text",
+      "x": 785,
+      "y": 332.5,
+      "width": 418,
+      "height": 375,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aH",
+      "roundness": null,
+      "seed": 1402251560,
+      "version": 742,
+      "versionNonce": 680432168,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649532881,
+      "link": null,
+      "locked": false,
+      "text": "class Aggregate(Action):\n    # Static\n    + type: ActionTypes = Aggregate\n\n    # Properties\n    - actionIDs: [str]\n    - associations: [Association]\n    - output_mapper: [str]\n\n    + execute(mem):\n        tables = mem.gather(actionIDs)\n\n        for join in association:\n            \n            ",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Aggregate(Action):\n    # Static\n    + type: ActionTypes = Aggregate\n\n    # Properties\n    - actionIDs: [str]\n    - associations: [Association]\n    - output_mapper: [str]\n\n    + execute(mem):\n        tables = mem.gather(actionIDs)\n\n        for join in association:\n            \n            ",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "O0fso8DJqFfwJEzmpUikM",
+      "type": "text",
+      "x": 1289,
+      "y": 195,
+      "width": 594,
+      "height": 100,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aI",
+      "roundness": null,
+      "seed": 1582329944,
+      "version": 459,
+      "versionNonce": 1080077144,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758647067031,
+      "link": null,
+      "locked": false,
+      "text": "input_mapper: \n    - key: ActionID (name) that produced such output\n    - value: list of strings that represent the values\n                to take",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "input_mapper: \n    - key: ActionID (name) that produced such output\n    - value: list of strings that represent the values\n                to take",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "v0kzO6vlBWOdJCV3yoG69",
+      "type": "text",
+      "x": 1379.5,
+      "y": 718.5,
+      "width": 286,
+      "height": 175,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aL",
+      "roundness": null,
+      "seed": 1462407976,
+      "version": 635,
+      "versionNonce": 1012998696,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649495598,
+      "link": null,
+      "locked": false,
+      "text": "class Association:\n    - from_actionID: str\n    - from_key_name: str\n    - from_value_name: str\n    - to_actionID: str\n    - to_value_name: str\n    - type: Type",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Association:\n    - from_actionID: str\n    - from_key_name: str\n    - from_value_name: str\n    - to_actionID: str\n    - to_value_name: str\n    - type: Type",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "WK34n9xeVxntypCtrlK6p",
+      "type": "text",
+      "x": 256.5,
+      "y": 787.5,
+      "width": 517,
+      "height": 175,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aM",
+      "roundness": null,
+      "seed": 1166526296,
+      "version": 318,
+      "versionNonce": 1042162520,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649002604,
+      "link": null,
+      "locked": false,
+      "text": "class Filter(Action):\n    # Static\n    + type: ActionTypes = Filter\n\n    # Properties\n    - compare: function(Dict<str, any>) -> bool\n    - output_mapper: [str]",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Filter(Action):\n    # Static\n    + type: ActionTypes = Filter\n\n    # Properties\n    - compare: function(Dict<str, any>) -> bool\n    - output_mapper: [str]",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "NY9jyUFLFFCNPE2sh00SX",
+      "type": "text",
+      "x": 1639,
+      "y": 606.5,
+      "width": 407,
+      "height": 200,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aP",
+      "roundness": null,
+      "seed": 20345896,
+      "version": 168,
+      "versionNonce": 627282472,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649426380,
+      "link": null,
+      "locked": false,
+      "text": "class Map(Action):\n    # Static\n    + type: ActionTypes = Map\n\n    # Properties\n    - compare_mapper: [str]\n    - mapper: function(any...) -> any\n    - output_mapper: [str]",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Map(Action):\n    # Static\n    + type: ActionTypes = Map\n\n    # Properties\n    - compare_mapper: [str]\n    - mapper: function(any...) -> any\n    - output_mapper: [str]",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "SkhaoW-3TTKDZzEii3Lf6",
+      "type": "text",
+      "x": 1457.5,
+      "y": 955.5,
+      "width": 121,
+      "height": 50,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aQ",
+      "roundness": null,
+      "seed": 2071523672,
+      "version": 37,
+      "versionNonce": 105260376,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758648834435,
+      "link": null,
+      "locked": false,
+      "text": "class Dump:\n    -",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Dump:\n    -",
+      "autoResize": true,
+      "lineHeight": 1.25
+    }
+  ],
+  "appState": {
+    "gridSize": 20,
+    "gridStep": 5,
+    "gridModeEnabled": false,
+    "viewBackgroundColor": "#ffffff"
+  },
+  "files": {}
+}
Author	SHA1	Message	Date
GassiGiuseppe	9440a562f2	Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl	2025-09-25 18:33:51 +02:00
Christian Risi	5eda131aac	Fixed creation query to be unique even with movieID in RDFs	2025-09-25 17:58:09 +02:00
GassiGiuseppe	57884eaf2e	CSV support added to path_splitter_tree Also resolved a minor bug to print also leaf nodes	2025-09-25 17:57:46 +02:00
Christian Risi	4548a683c2	Fixed DB	2025-09-25 17:57:45 +02:00
GassiGiuseppe	3eec49ffa5	WIP: added test file: clean_relationship.jupyter to create a first cleaning pipeline	2025-09-25 16:28:24 +02:00
Christian Risi	0bc7f4b227	Fixed Typos	2025-09-25 12:37:52 +02:00
Christian Risi	f28952b0a2	Added todo	2025-09-25 12:00:26 +02:00
Christian Risi	0b626a8e09	Modified query to take all data	2025-09-25 11:53:12 +02:00
Christian Risi	b254098532	Added views to count for subjects and objects	2025-09-25 11:40:44 +02:00
Christian Risi	ee88ffe4cf	Added View to filter over relationship counts	2025-09-25 11:32:03 +02:00
Christian Risi	70b4bd8645	Added Complex query	2025-09-25 11:31:34 +02:00
Christian Risi	6316d2bfc4	Added queries to take data from SQL for dataset	2025-09-25 11:27:19 +02:00
Christian Risi	87ca748f45	Updated DB to reflect new changes	2025-09-24 19:29:57 +02:00
Christian Risi	4315d70109	Merged abbreviation_datawarehouse into datawarehouse	2025-09-24 19:29:43 +02:00
Christian Risi	9a5d633b5e	Fixed Typos	2025-09-24 19:29:07 +02:00
Christian Risi	a6760cd52d	Updated SQL Queries to support parsing in DB	2025-09-24 19:28:55 +02:00
GassiGiuseppe	a7eb92227d	Moved all db queries file in their own folder	2025-09-24 16:44:55 +02:00
GassiGiuseppe	9f221e31cd	Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl	2025-09-24 16:32:52 +02:00
GassiGiuseppe	47197194d5	WIP abbrevietion_datawarehouse to creat an abbreviation system	2025-09-24 16:32:09 +02:00
Christian Risi	0cdbf6f624	Added query to retrieve a dirty dataset from SQLite DB	2025-09-24 16:15:47 +02:00
Christian Risi	3e30489f86	Updated Queries for DB	2025-09-24 14:44:53 +02:00
Christian Risi	8a22e453e4	Fixed csv	2025-09-24 14:44:25 +02:00
Christian Risi	7feb4eb857	Fixed URI generation	2025-09-24 14:44:07 +02:00
Christian Risi	70af19d356	Removed unused imports and added trailing slashes	2025-09-24 14:04:48 +02:00
Christian Risi	a4b44ab2ee	Fixed Typos	2025-09-24 14:04:27 +02:00
Christian Risi	74b6b609dd	Fixed typos	2025-09-24 13:59:19 +02:00
Christian Risi	59796c37cb	Added script to take dbpedia uris	2025-09-24 13:49:29 +02:00
Christian Risi	f696f5950b	Added uri-abbreviations	2025-09-24 13:48:53 +02:00
Christian Risi	605b496da7	Added barebone UML diagram for a Cleaning Pipeline	2025-09-23 19:49:01 +02:00
Christian Risi	7d693964dd	Added new directories to tree structure	2025-09-23 19:47:56 +02:00