Added possibility to whitelist relationships

add divide method to create hold out dataset
Added EOS token
2025-10-12 12:26:26 +02:00 · 2025-10-11 16:49:36 +02:00 · 2025-10-07 22:47:59 +02:00 · 2025-10-07 15:49:25 +02:00 · 2025-10-07 00:54:00 +02:00 · 2025-10-06 10:57:50 +02:00
35 changed files with 4895 additions and 152 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,24 @@
+{
+  // Always treat the project root as the working dir for Jupyter
+  "jupyter.notebookFileRoot": "${workspaceFolder}",
+
+  // When you click "Run Python File in Terminal", DON'T cd into the file's folder
+  "python.terminal.executeInFileDir": false,
+
+  // Start new integrated terminals at the project root
+  "terminal.integrated.cwd": "${workspaceFolder}",
+
+  // Ensure Python can import from the project root no matter which file you run
+  // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
+  "terminal.integrated.env.linux": {
+    "PYTHONPATH": "${workspaceFolder}"
+  },
+
+  // Make pytest run from the root without needing a pytest.ini
+  "python.testing.pytestEnabled": true,
+  "python.testing.cwd": "${workspaceFolder}",
+  "python.testing.pytestArgs": ["src/test"],
+
+  // Help Pylance resolve imports like `from src...` without red squiggles
+  "python.analysis.extraPaths": ["${workspaceFolder}"]
+}
--- a/Assets/Dataset/1-hop/uri-abbreviations.csv
+++ b/Assets/Dataset/1-hop/uri-abbreviations.csv
--- a/Assets/Dataset/DatawareHouse/dataset.db
+++ b/Assets/Dataset/DatawareHouse/dataset.db
--- a/Scripts/DataBaseQueries/dataset.sql
+++ b/Scripts/DataBaseQueries/dataset.sql
@@ -0,0 +1,30 @@
+-- To pass to Pandas
+SELECT *
+FROM RDFs
+INNER JOIN Subjects USING (SubjectID)
+INNER JOIN Relationships USING (RelationshipID)
+INNER JOIN Objects USING (ObjectID);
+
+-- To pass to Pandas for abstracts
+SELECT *
+FROM RDFs
+INNER JOIN WikipediaAbstracts USING (MovieID);
+
+-- To pass to Pandas for abbreviations
+SELECT *
+FROM Abbreviations;
+
+-- More complex to have clean dataset
+-- One row per movie: every triple wrapped in <SOT>...<EOT> tags and concatenated, plus the abstract
+SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
+FROM RDFs
+INNER JOIN SubjectsCountInRDFs USING (SubjectID)
+INNER JOIN RelationshipsCountInRDFs USING(RelationshipID)
+INNER JOIN ObjectsCountInRDFs USING (ObjectID)
+INNER JOIN ParsedSubjects USING (SubjectID)
+INNER JOIN ParsedRelationships USING (RelationshipID)
+INNER JOIN ParsedObjects USING (ObjectID)
+INNER JOIN WikipediaAbstracts USING (MovieID)
+    -- insert WHERE here
+-- WHERE SubjectID = 134626
+GROUP BY MovieID;
--- a/Scripts/DataBaseQueries/db_creation.sql
+++ b/Scripts/DataBaseQueries/db_creation.sql
@@ -0,0 +1,174 @@
+CREATE TABLE IF NOT EXISTS Movies (
+    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
+    MovieURI TEXT UNIQUE NOT NULL
+);
+
+
+CREATE TABLE IF NOT EXISTS WikiPageIDs (
+    MovieID INTEGER PRIMARY KEY,
+    PageID INTEGER UNIQUE NOT NULL,
+    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
+);
+
+
+CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
+    MovieID INTEGER PRIMARY KEY,
+    Abstract TEXT NOT NULL,
+    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
+);
+
+
+CREATE TABLE IF NOT EXISTS Origins (
+    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
+    OriginName TEXT UNIQUE NOT NULL
+);
+
+
+CREATE TABLE IF NOT EXISTS Subjects (
+    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
+    SubjectURI TEXT UNIQUE NOT NULL,
+    OriginID BIGINT NOT NULL,
+    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
+);
+
+
+CREATE TABLE IF NOT EXISTS Relationships (
+    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
+    RelationshipURI TEXT UNIQUE NOT NULL
+);
+
+
+CREATE TABLE IF NOT EXISTS Objects (
+    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
+    ObjectURI TEXT UNIQUE NOT NULL,
+    OriginID BIGINT NOT NULL,
+    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
+);
+
+CREATE TABLE IF NOT EXISTS RDFs (
+    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
+    MovieID INTEGER NOT NULL,
+    SubjectID INTEGER NOT NULL,
+    RelationshipID INTEGER NOT NULL,
+    ObjectID INTEGER NOT NULL,
+    UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
+    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
+    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
+    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
+    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
+);
+
+CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
+CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
+CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
+CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
+
+CREATE TABLE IF NOT EXISTS Abbreviations (
+    AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
+    URI TEXT UNIQUE NOT NULL,
+    Abbreviation TEXT UNIQUE NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
+    SubjectID INTEGER NOT NULL,
+    AbbreviationID INTEGER NOT NULL,
+    PRIMARY KEY(SubjectID, AbbreviationID),
+    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
+    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
+);
+
+CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
+    RelationshipID INTEGER NOT NULL,
+    AbbreviationID INTEGER NOT NULL,
+    PRIMARY KEY(RelationshipID, AbbreviationID),
+    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
+    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
+);
+
+CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
+    ObjectID INTEGER NOT NULL,
+    AbbreviationID INTEGER NOT NULL,
+    PRIMARY KEY(ObjectID, AbbreviationID),
+    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
+    FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
+);
+
+CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
+CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
+CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
+CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
+CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
+CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
+
+-- Views
+-- Subjects
+CREATE VIEW IF NOT EXISTS ParsedSubjects
+AS
+SELECT
+	SubjectID,
+	CASE WHEN Abbreviation IS NULL
+		THEN SubjectURI
+		ELSE Abbreviation || ':' || replace(SubjectURI, URI, '') END
+		AS SubjectURI
+FROM Subjects
+	LEFT JOIN Subjects_Abbreviations USING (SubjectID)
+	LEFT JOIN Abbreviations USING (AbbreviationID);
+
+-- Relationships
+CREATE VIEW IF NOT EXISTS ParsedRelationships
+AS
+SELECT
+	RelationshipID,
+	CASE WHEN Abbreviation IS NULL
+		THEN RelationshipURI
+		ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '') END
+		AS RelationshipURI
+FROM Relationships
+	LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
+	LEFT JOIN Abbreviations USING (AbbreviationID);
+
+-- Objects
+CREATE VIEW IF NOT EXISTS ParsedObjects
+AS
+SELECT
+	ObjectID,
+	CASE WHEN Abbreviation IS NULL
+		THEN ObjectURI
+		ELSE Abbreviation || ':' || replace(ObjectURI, URI, '') END
+		AS ObjectURI
+FROM Objects
+	LEFT JOIN Objects_Abbreviations USING (ObjectID)
+	LEFT JOIN Abbreviations USING (AbbreviationID);
+
+
+-- Subject Count
+CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs
+AS
+SELECT SubjectID, count(SubjectID) as Sub_Count
+FROM RDFs
+GROUP BY SubjectID;
+
+
+
+
+-- Relationship Count
+CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs
+AS
+SELECT RelationshipID, count(RelationshipID) as Rel_Count
+FROM RDFs
+GROUP BY RelationshipID;
+
+
+-- Object Count
+CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs
+AS
+SELECT ObjectID, count(ObjectID) as Obj_Count
+FROM RDFs
+GROUP BY ObjectID;
+
+
+
+
+
+
+
--- a/Scripts/DatasetMerging/SQL_Queries/query.sql
+++ b/Scripts/DatasetMerging/SQL_Queries/query.sql
@@ -33,3 +33,23 @@ SELECT ObjectID FROM Objects WHERE ObjectURI = ?;


 INSERT INTO  RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
+
+-- Prefixes
+INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
+INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
+INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
+INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
+
+-- Make sure the bound parameter is a URI before running this query,
+--  and that it includes at least the domain and the first path segment
+SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
+
+-- Query to retrieve data
+SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
+FROM RDFs
+INNER JOIN ParsedSubjects USING (SubjectID)
+INNER JOIN ParsedRelationships USING (RelationshipID)
+INNER JOIN ParsedObjects USING (ObjectID)
+INNER JOIN WikipediaAbstracts USING (MovieID)
+    -- insert WHERE here
+GROUP BY MovieID;
--- a/Scripts/DataCleaning/clean_relationship.ipynb
+++ b/Scripts/DataCleaning/clean_relationship.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b9081b7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This notebook removes unwanted relationships from the pipeline according to different rules\n",
+    "import pandas as pd\n",
+    "import sqlite3\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
+    "\n",
+    "def get_RDF() -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    QUERY = \"SELECT * FROM RDFs \" \\\n",
+    "    \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
+    "    \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
+    "    \"INNER JOIN Objects USING (ObjectID);\"\n",
+    "    RDF = pd.read_sql_query(QUERY, CONN)\n",
+    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
+    "    RDF = RDF.dropna()\n",
+    "    \"\"\"\n",
+    "    Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
+    "    Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
+    "    Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
+    "    RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
+    "\n",
+    "    # replace '' values with NaN (the dropna step is currently commented out)\n",
+    "    Subjects = Subjects.replace('', np.nan)# .dropna()\n",
+    "    Relationships = Relationships.replace('', np.nan)# .dropna()\n",
+    "    Objects = Objects.replace('', np.nan)# .dropna()\n",
+    "\n",
+    "    # join RDF with its components\n",
+    "    RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
+    "    RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
+    "    RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
+    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
+    "    return RDF\n",
+    "\n",
+    "\n",
+    "#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
+    "\n",
+    "def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
+    "    return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
+    "\n",
+    "\n",
+    "\n",
+    "RDF = get_RDF()\n",
+    "# RDF = RDF.dropna()\n",
+    "# print(RDF)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "644690bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
+    "    counts = RDF[\"RelationshipURI\"].value_counts() \n",
+    "    RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
+    "    RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
+    "    # counts is a series as key: relationship, value: count\n",
+    "    # counts = counts[counts > count_treshold]\n",
+    "    # relationships = counts.index\n",
+    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
+    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
+    "    return RDF\n",
+    "\n",
+    "RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
+    "# print(new_RDF)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34525be6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                 SubjectURI  \\\n",
+      "0             http://dbpedia.org/resource/Nights_of_Cabiria   \n",
+      "1         http://dbpedia.org/resource/California_Science...   \n",
+      "2                 http://dbpedia.org/resource/China_Captain   \n",
+      "3         http://dbpedia.org/resource/Caravan_of_Courage...   \n",
+      "4                http://dbpedia.org/resource/WHIH_Newsfront   \n",
+      "...                                                     ...   \n",
+      "12725500   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
+      "12725501   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
+      "12725502  http://dbpedia.org/resource/I_Witnessed_Genoci...   \n",
+      "12725503  http://dbpedia.org/resource/I_Woke_Up_Early_th...   \n",
+      "12725504           http://dbpedia.org/resource/I_Won't_Play   \n",
+      "\n",
+      "                                       RelationshipURI  \\\n",
+      "0          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "1          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "2          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "3          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "4         http://www.w3.org/2000/01/rdf-schema#seeAlso   \n",
+      "...                                                ...   \n",
+      "12725500          http://dbpedia.org/ontology/producer   \n",
+      "12725501          http://dbpedia.org/ontology/producer   \n",
+      "12725502          http://dbpedia.org/ontology/producer   \n",
+      "12725503          http://dbpedia.org/ontology/producer   \n",
+      "12725504          http://dbpedia.org/ontology/producer   \n",
+      "\n",
+      "                                                  ObjectURI  MovieID  \\\n",
+      "0                       http://dbpedia.org/resource/Cabiria       26   \n",
+      "1         http://dbpedia.org/resource/California_Academy...      185   \n",
+      "2                 http://dbpedia.org/resource/Captain_China      614   \n",
+      "3         http://dbpedia.org/resource/Caravan_of_Courage...      740   \n",
+      "4         http://dbpedia.org/resource/Captain_America:_C...      594   \n",
+      "...                                                     ...      ...   \n",
+      "12725500           http://dbpedia.org/resource/Ava_DuVernay   145854   \n",
+      "12725501           http://dbpedia.org/resource/Molly_Mayeux   145854   \n",
+      "12725502        http://dbpedia.org/resource/Headlines_Today   145861   \n",
+      "12725503             http://dbpedia.org/resource/Billy_Zane   145862   \n",
+      "12725504    http://dbpedia.org/resource/Gordon_Hollingshead   145864   \n",
+      "\n",
+      "          RelationshipFreq  MovieFreq  \n",
+      "0                     2132        216  \n",
+      "1                     2132        264  \n",
+      "2                     2132         66  \n",
+      "3                     2132        131  \n",
+      "4                     1653        133  \n",
+      "...                    ...        ...  \n",
+      "12725500             80077         95  \n",
+      "12725501             80077         95  \n",
+      "12725502             80077         41  \n",
+      "12725503             80077         98  \n",
+      "12725504             80077         91  \n",
+      "\n",
+      "[12725505 rows x 6 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
+    "    counts = RDF[\"MovieID\"].value_counts() \n",
+    "    RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
+    "    RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
+    "    RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
+    "    # counts is a Series with key: MovieID, value: count\n",
+    "    # counts = counts[counts > count_treshold]\n",
+    "    # relationships = counts.index\n",
+    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
+    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
+    "    return RDF\n",
+    "\n",
+    "RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
+    "print(RDF)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/Scripts/DataCleaning/data_output_models/bpe_corpus.py
+++ b/Scripts/DataCleaning/data_output_models/bpe_corpus.py
@@ -0,0 +1,21 @@
+from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+import pandas as pd
+
+class BPE_corpus():
+
+    def __init__(self, output_path :str):
+        self.output_handler = open(output_path, "w")
+
+    def close(self):
+        # add corpus end before closing
+        self.output_handler.write(SpecialToken.CORPUS_END.value)
+        self.output_handler.close()
+        
+    def write_from_str(self, output: str):
+        if output == '':
+            return
+        self.output_handler.write(output)
+
+    def write_from_df(self, df: pd.DataFrame):
+        self.write_from_str(get_raw_from_dataframe(df))
--- a/Scripts/DataCleaning/data_output_models/debug_csv.py
+++ b/Scripts/DataCleaning/data_output_models/debug_csv.py
@@ -0,0 +1,21 @@
+import pandas as pd
+
+class Debug_csv():
+    def __init__(self, output_path:str):
+     
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        """        
+
+        RDF.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
+++ b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_completation_task_dataset():
+    """
+        Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
+        Each RDF is saved as str
+        CSV Composition: ["MovieID","RDF"]
+    """
+    def __init__(self, output_path:str):
+     
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","RDF"]
+        """        
+
+        RDF.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
+++ b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+# do not worry about circular dependencies, this class will never call something else
+from Scripts.DataCleaning.legacy.filter import PipelineApplier
+
+class RDF_mask_task_dataset():
+    """
+        Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
+        The CSV is like: for each RDF there will be 3 rows, each one with a different component missing.
+        CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
+    """
+    def __init__(self, output_path:str):
+     
+        # these methods are only used by this class, but they belong to a lower level
+        self._build_triple = PipelineApplier.build_triple
+        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","IncompleteRDF","Missing","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        rdf_complete = self._build_triple(RDF)
+
+        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
+        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
+        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
+        ####
+        df_subject = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_subject,
+            "Missing": RDF["SubjectURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_relationship = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_relationship,
+            "Missing": RDF["RelationshipURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_object = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_object,
+            "Missing": RDF["ObjectURI"],
+            "RDF": rdf_complete,
+        })
+
+
+        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
+        output_df.to_csv(self.output, index=False, header=False)
+
+
--- a/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
+++ b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_text_task_dataset():
+    """
+        Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
+        In the CSV the RDFs will be saved together as a single string.
+        CSV Composition: ["MovieID","RDFs","Abstract"]
+    """
+    def __init__(self, output_path:str):
+     
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDFs","Abstract"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+        """        
+
+        RDF.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/dbpedia-uri.py
+++ b/Scripts/DataCleaning/dbpedia-uri.py
@@ -0,0 +1,77 @@
+import argparse
+import sys
+
+
+
+class ProgramArgs:
+
+    def __init__(self, file: str, output: str, treshold: int):
+        self.file = file
+        self.output = output
+        self.treshold = treshold
+
+def get_args(args: list[str]) -> ProgramArgs:
+
+    PARSER = argparse.ArgumentParser()
+    PARSER.add_argument("--input-file", "-i", required=True, type=str)
+    PARSER.add_argument("--output-file", "-o", required=True, type=str)
+    PARSER.add_argument("--treshold", "-t", type=int, default=1)
+    parsed_args, _ = PARSER.parse_known_args(args)
+
+    # print(parsed_args.input_file)
+
+    return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold)  # type ignore
+
+
+def print_dbpedia(file: str, out: str):
+
+
+    FILE = open(file, "r", encoding="utf-8")
+    OUT = open(out, mode="w", encoding="utf-8")
+
+    DOMAIN_PART = "dbpedia"
+
+    already_parsed : set[str] = set()
+
+
+    for row in FILE:
+
+        sections = row.split("/")
+        sections = list(filter(lambda item: item != "", sections))
+
+        # print(sections)
+
+        if len(sections) < 3:
+            continue
+
+        URI = "/".join(sections[1:3])
+        URI = "//".join([sections[0], URI])
+
+        if  URI in already_parsed:
+            continue
+
+        DOMAIN = sections[1]
+        SUBDOMAINS = DOMAIN.split(".")
+        TYPE = sections[2][0]
+
+        if DOMAIN_PART not in SUBDOMAINS:
+            continue
+
+        already_parsed.add(URI)
+
+        SUB_ID = SUBDOMAINS[0]
+
+        if len(SUB_ID) > 3:
+            SUB_ID = SUB_ID[:3]
+
+        OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")
+
+
+    FILE.close()
+    OUT.close()
+
+
+if __name__ == "__main__":
+    ARGS = get_args(sys.argv)
+    # ARGS = get_debug_args()
+    print_dbpedia(ARGS.file, ARGS.output)
--- a/Scripts/DataCleaning/hold_out/divide.py
+++ b/Scripts/DataCleaning/hold_out/divide.py
@@ -0,0 +1,29 @@
+import pandas as pd
+
+def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
+    # 1) Read and shuffle rows with a fixed seed for reproducibility
+    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)
+
+    # 2) Turn the three inputs into proportions relative to their sum
+    total = train + val + test  # inputs act as weights, so they need not sum to 100
+    n = len(df)
+    n_train = int(n * train / total)   # floor to keep indices integral
+    n_val   = int(n * val   / total)
+    # 3) Give the remainder to test to ensure every row is assigned
+    #    (this naturally absorbs any rounding loss)
+    train_df = df.iloc[:n_train].reset_index(drop=True)
+    val_df   = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
+    test_df  = df.iloc[n_train + n_val:].reset_index(drop=True)
+
+    return train_df, val_df, test_df
+
+# usage:
+DATASET =       "Assets/Dataset/Tmp/rdf_text.csv"
+TRAIN =         "Assets/Dataset/Tmp/hold_out/train.csv"
+TEST =          "Assets/Dataset/Tmp/hold_out/test.csv"
+EVALUATION =    "Assets/Dataset/Tmp/hold_out/evaluation.csv"
+train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)
+
+train_df.to_csv(TRAIN)
+val_df.to_csv(EVALUATION)
+test_df.to_csv(TEST)
--- a/Scripts/DataCleaning/legacy/deprecated.py
+++ b/Scripts/DataCleaning/legacy/deprecated.py
@@ -0,0 +1,381 @@
+# This module removes unwanted relationships from the pipeline according to different rules
+# -----------------------------------------------------------------------------
+# SQL-FIRST VERSION
+# -----------------------------------------------------------------------------
+# In the original (pandas) version this module:
+#   - stored frequency filters in DataFrames,
+#   - filtered/cleaned DataFrames in-memory,
+#   - added special tokens via string ops,
+#   - rebuilt one row per movie using groupby/aggregation.
+#
+# In this rewrite:
+#   - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
+#   - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
+#     composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
+#   - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
+#
+# Notes:
+#   - We keep the same CLASS and METHOD NAMES to preserve call sites.
+#   - Method comments/docstrings from your original file are carried over and updated
+#     to reflect Select-based behavior and return types.
+#   - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
+#   - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
+#     swap with an equivalent string-agg function.
+# -----------------------------------------------------------------------------
+
+from __future__ import annotations
+from typing import Optional
+
+from sqlalchemy import select, func, literal
+from sqlalchemy.sql import Select
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+
+
+class PipelineApplier():
+    """
+    SQL-first pipeline applier.
+
+    Every transformation receives a SQLAlchemy `Select` and returns a new `Select`;
+    no statement is executed by this class.
+
+    self.MOVIE_FILTER and self.REL_FILTER are single-column subselects (MovieID or
+    RelationshipURI) meant to be applied with `WHERE <column> IN (subquery)`.
+
+    Unless a method says otherwise, `RDF` is expected to expose the columns
+    MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract.
+    """
+
+    def __init__(self):
+        # Single-column subselects of allowed keys, or None until generated.
+        # Expected column names:
+        #   - self.MOVIE_FILTER:      "MovieID"
+        #   - self.REL_FILTER:        "RelationshipURI"
+        self.MOVIE_FILTER: Optional[Select] = None
+        self.REL_FILTER: Optional[Select] = None
+
+    # -------------------------------------------------------------------------
+    # Private helpers
+    # -------------------------------------------------------------------------
+    @staticmethod
+    def _allowed_keys(COUNTS: Select, key: str, min_treshold: int, max_treshold: int) -> Select:
+        """Return `SELECT <key> FROM (COUNTS) WHERE Count in [min_treshold, max_treshold)`."""
+        # COUNTS is wrapped in a subquery on purpose: selecting one of its
+        # `selected_columns` directly would start a new SELECT from the underlying
+        # table and lose the threshold criteria, and a WHERE on an aggregate
+        # "Count" would not be valid SQL.
+        counts = COUNTS.subquery()
+        return (
+            select(counts.c[key])
+            .where(counts.c.Count >= min_treshold)
+            .where(counts.c.Count < max_treshold)
+        )
+
+    @staticmethod
+    def _wrap_triple(subject, relationship, obj):
+        """Concatenate the three components between START_TRIPLE / END_TRIPLE, labelled "Triple"."""
+        return (
+            literal(SpecialToken.START_TRIPLE.value) +
+            (subject + relationship + obj) +
+            literal(SpecialToken.END_TRIPLE.value)
+        ).label("Triple")
+
+    # -------------------------------------------------------------------------
+    # Relationship deletion
+    # -------------------------------------------------------------------------
+    def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
+        """
+        Return a Select where rows having the given relationship URI are removed
+        (WHERE RelationshipURI != <uri>).
+
+        Args:
+            RDF (Select): a selectable representing the RDF joined view
+            uri (str): RelationshipURI to exclude
+
+        Returns:
+            Select: filtered selectable (no execution yet)
+        """
+        sc = RDF.selected_columns
+        return RDF.where(sc.RelationshipURI != literal(uri))
+
+    # -------------------------------------------------------------------------
+    # Frequency filter: MOVIE
+    # -------------------------------------------------------------------------
+    def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
+        """
+        You MUST call this before filtering by movie frequency [filter_by_frequency_movie_id()],
+        since this method creates such filter.
+
+        Stores in self.MOVIE_FILTER a subselect of the MovieID whose Count is in
+        [min_treshold, max_treshold). No query is executed here.
+
+        Args:
+            MOVIE_COUNT (Select): yields columns MovieID, Count
+            min_treshold (int): inclusive lower bound on Count
+            max_treshold (int): exclusive upper bound on Count
+        """
+        self.MOVIE_FILTER = self._allowed_keys(MOVIE_COUNT, "MovieID", min_treshold, max_treshold)
+
+    # -------------------------------------------------------------------------
+    # Frequency filter: RELATIONSHIP
+    # -------------------------------------------------------------------------
+    def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
+        """
+        Stores in self.REL_FILTER a subselect of the RelationshipURI whose Count is in
+        [min_treshold, max_treshold). No query is executed here.
+
+        Args:
+            REL_COUNT (Select): yields columns RelationshipURI, Count
+            min_treshold (int): inclusive lower bound on Count
+            max_treshold (int): exclusive upper bound on Count
+        """
+        self.REL_FILTER = self._allowed_keys(REL_COUNT, "RelationshipURI", min_treshold, max_treshold)
+
+    # -------------------------------------------------------------------------
+    # Apply frequency filters
+    # -------------------------------------------------------------------------
+    def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
+        """
+        Apply `WHERE MovieID IN (<self.MOVIE_FILTER>)`; return RDF unchanged when no
+        movie filter has been generated.
+
+        Args:
+            RDF (Select): current dataset
+
+        Returns:
+            Select: filtered dataset (or unchanged if no filter exists)
+        """
+        if self.MOVIE_FILTER is None:
+            return RDF
+        sc = RDF.selected_columns
+        return RDF.where(sc.MovieID.in_(self.MOVIE_FILTER))
+
+    def filter_by_frequency_relationship(self, RDF: Select) -> Select:
+        """
+        Apply `WHERE RelationshipURI IN (<self.REL_FILTER>)`; return RDF unchanged when
+        no relationship filter has been generated.
+
+        Args:
+            RDF (Select): current dataset
+
+        Returns:
+            Select: filtered dataset (or unchanged if no filter exists)
+        """
+        if self.REL_FILTER is None:
+            return RDF
+        sc = RDF.selected_columns
+        return RDF.where(sc.RelationshipURI.in_(self.REL_FILTER))
+
+    # -------------------------------------------------------------------------
+    # Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
+    # -------------------------------------------------------------------------
+    def rdf_add_special_token(self, RDF: Select) -> Select:
+        """
+        Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI,
+        OBJ to ObjectURI, REL to RelationshipURI. Check
+        Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
+
+        It only adds the special token of the three elements of the RDF; no other special token.
+
+        Args:
+            RDF (Select): current dataset
+
+        Returns:
+            Select: same output column names
+                    ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"],
+                    with the three URI columns prefixed by their token
+        """
+        sc = RDF.selected_columns
+        subj_tok = literal(SpecialToken.SUBJECT.value) + sc.SubjectURI
+        rel_tok  = literal(SpecialToken.RELATIONSHIP.value) + sc.RelationshipURI
+        obj_tok  = literal(SpecialToken.OBJECT.value) + sc.ObjectURI
+
+        return RDF.with_only_columns(
+            sc.MovieID.label("MovieID"),
+            subj_tok.label("SubjectURI"),
+            rel_tok.label("RelationshipURI"),
+            obj_tok.label("ObjectURI"),
+            sc.Abstract.label("Abstract"),
+        )
+
+    # -------------------------------------------------------------------------
+    # NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
+    # -------------------------------------------------------------------------
+    def drop_na_from_dataset(self, RDF: Select) -> Select:
+        """
+        Drop rows where SubjectURI, RelationshipURI or ObjectURI is NULL or an
+        empty string.
+
+        Args:
+            RDF (Select): current dataset
+
+        Returns:
+            Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
+        """
+        sc = RDF.selected_columns
+        return RDF.where(
+            (sc.SubjectURI.is_not(None)) & (sc.SubjectURI != "") &
+            (sc.RelationshipURI.is_not(None)) & (sc.RelationshipURI != "") &
+            (sc.ObjectURI.is_not(None)) & (sc.ObjectURI != "")
+        )
+
+    # -------------------------------------------------------------------------
+    # Rebuild by movie (one row per movie)
+    # -------------------------------------------------------------------------
+    def rebuild_by_movie(self, RDF: Select) -> Select:
+        """
+        Collapse the dataset to one row per (MovieID, Abstract).
+
+        Each row becomes START_TRIPLE + SubjectURI + RelationshipURI + ObjectURI +
+        END_TRIPLE; the rows of a movie are joined with GROUP_CONCAT (empty separator)
+        and prefixed with START_TRIPLE_LIST, while Abstract is prefixed with ABSTRACT.
+
+        Args:
+            RDF (Select): current dataset with columns
+                          MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+
+        Returns:
+            Select: aggregated dataset with columns ["MovieID","Triple","Abstract"]
+        """
+        sc = RDF.selected_columns
+
+        # Per-row triple with START/END_TRIPLE tokens
+        row_triple = self._wrap_triple(sc.SubjectURI, sc.RelationshipURI, sc.ObjectURI)
+
+        # Prefixed abstract
+        abstract_tok = (literal(SpecialToken.ABSTRACT.value) + sc.Abstract).label("Abstract")
+
+        # Subquery of per-row triples / abstracts
+        row_view = RDF.with_only_columns(
+            sc.MovieID.label("MovieID"),
+            row_triple,
+            abstract_tok,
+        ).subquery()
+
+        # Concatenate all triples for each movie (SQLite syntax; adjust for other DBs)
+        triple_concat = (
+            literal(SpecialToken.START_TRIPLE_LIST.value) +
+            func.group_concat(row_view.c.Triple, literal(""))
+        ).label("Triple")
+
+        return (
+            select(
+                row_view.c.MovieID.label("MovieID"),
+                triple_concat,
+                row_view.c.Abstract.label("Abstract"),
+            )
+            .group_by(row_view.c.MovieID, row_view.c.Abstract)
+        )
+
+    # -------------------------------------------------------------------------
+    # Build triple(s) projection
+    # -------------------------------------------------------------------------
+    @staticmethod
+    def build_triple(RDF: Select) -> Select:
+        """
+        Obtains joined RDF triple in one element, together with START and END special tokens.
+
+        Args:
+            RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]
+
+        Returns:
+            Select: a projection containing one column named "Triple"
+        """
+        sc = RDF.selected_columns
+        return RDF.with_only_columns(
+            PipelineApplier._wrap_triple(sc.SubjectURI, sc.RelationshipURI, sc.ObjectURI)
+        )
+
+    @staticmethod
+    def build_incomplete_triple(RDF: Select) -> Select:
+        """
+        Method helper used for the third task: "Predicting a masked component within an RDF triple".
+        Obtains joined RDF triple in one element, together with START and END special tokens.
+        The MISSING element will be replaced by the special token <MASK>.
+
+        A component counts as missing when its column is absent from RDF or when
+        its value is NULL.
+
+        Args:
+            RDF (Select): 2 of the following columns present ["SubjectURI", "RelationshipURI", "ObjectURI"]
+
+        Returns:
+            Select: projection with column "Triple"
+        """
+        sc = RDF.selected_columns
+        mask = literal(SpecialToken.MASK.value)
+
+        def component(name):
+            # Attribute access would raise on an absent column, so look it up by key.
+            column = sc.get(name)
+            if column is None:
+                return mask
+            return func.coalesce(column, mask)
+
+        return RDF.with_only_columns(
+            PipelineApplier._wrap_triple(
+                component("SubjectURI"),
+                component("RelationshipURI"),
+                component("ObjectURI"),
+            )
+        )
+
+    @staticmethod
+    def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
+        """
+        Currently not used.
+
+        Stub kept for API parity with the pandas version, where it was meant to apply
+        special tokens given an incomplete RDF and its missing component. Always
+        returns None.
+        """
+        return None
--- a/Scripts/DataCleaning/legacy/fast_filter.py
+++ b/Scripts/DataCleaning/legacy/fast_filter.py
@@ -0,0 +1,148 @@
+# This module removes unwanted relationships from the pipeline according to different rules
+import pandas as pd
+import sqlite3  # kept for compatibility
+import numpy as np
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+
+class PipelineApplier:
+    """
+    Pandas cleaning pipeline: frequency filters, special-token tagging and triple
+    building over an RDF DataFrame.
+    """
+
+    def __init__(self):
+        # Allowed keys kept as sets; both are empty until the matching
+        # generate_frequency_*_filter() method is called.
+        self._MOVIE_FILTER_SET = set()
+        self._REL_FILTER_SET = set()
+
+    # ------------------------------
+    # Filters
+    # ------------------------------
+    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
+        """Return the rows of RDF whose RelationshipURI differs from `uri`."""
+        return RDF.loc[RDF["RelationshipURI"] != uri]
+
+    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
+        """
+        You MUST call this before filter the dataset by movie frequency [filter_by_frequency_movie_id()],
+        since this method creates such filter.
+        Keeps the MovieID whose Count is in [min_threshold, max_threshold).
+        Args:
+            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
+            min_threshold (int): inclusive lower bound on Count
+            max_threshold (int): exclusive upper bound on Count
+        """
+        sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
+        self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())
+
+    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
+        """
+        Same as generate_frequency_movie_filter(), for RelationshipURI.
+        Args:
+            REL_COUNT (pd.DataFrame): ["RelationshipURI","Count"]
+            min_threshold (int): inclusive lower bound on Count
+            max_threshold (int): exclusive upper bound on Count
+        """
+        sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
+        self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())
+
+    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Keep the rows whose MovieID is in the generated movie filter."""
+        return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]
+
+    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Keep the rows whose RelationshipURI is in the generated relationship filter."""
+        return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]
+
+    # ------------------------------
+    # Cleaning & preprocessing
+    # ------------------------------
+    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Adds RDF special token to SubjectURI / RelationshipURI / ObjectURI.
+        Returns a new DataFrame (no inplace modification of the caller's object).
+        """
+        # Plain pandas concatenation: the np.char routines are only specified for
+        # fixed-width str_/bytes_ arrays, not for the object columns used here.
+        return RDF.assign(
+            SubjectURI=SpecialToken.SUBJECT.value + RDF["SubjectURI"],
+            RelationshipURI=SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"],
+            ObjectURI=SpecialToken.OBJECT.value + RDF["ObjectURI"],
+        )
+
+    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Replace '' with NaN only on key columns, then drop rows missing any of them.
+        Works on a copy; the caller's DataFrame is left untouched.
+        """
+        cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        rdf = RDF.copy()
+        for c in cols:
+            m = rdf[c] == ""
+            if m.any():
+                rdf.loc[m, c] = np.nan
+        return rdf.dropna(subset=cols)
+
+    # ------------------------------
+    # Building triples
+    # ------------------------------
+    @staticmethod
+    def build_triple(RDF: pd.DataFrame):
+        """
+        Obtains joined RDF triple in one element, together with START and END special token.
+        Returns:
+            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
+        """
+        RDF["Triple"] = (
+            SpecialToken.START_TRIPLE.value
+            + RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+            + SpecialToken.END_TRIPLE.value
+        )
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_incomplete_triple(RDF: pd.DataFrame):
+        """
+        Helper used for the third task: "Predicting a masked component within an RDF triple".
+        Accepts any subset of ["SubjectURI","RelationshipURI","ObjectURI"] (typically 2 of 3).
+        Missing components are replaced by <MASK>.
+        Returns:
+            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
+        """
+        maskv = SpecialToken.MASK.value
+
+        def component(name):
+            # An absent column becomes a full column of <MASK>, aligned on RDF's index.
+            if name in RDF:
+                return RDF[name]
+            return pd.Series(maskv, index=RDF.index, dtype=object)
+
+        RDF["Triple"] = (
+            SpecialToken.START_TRIPLE.value
+            + component("SubjectURI") + component("RelationshipURI") + component("ObjectURI")
+            + SpecialToken.END_TRIPLE.value
+        )
+        return RDF["Triple"]
+
+    def rebuild_by_movie(self, RDF: pd.DataFrame):
+        """
+        Collapse triples + abstract into a single row per movie.
+        Rows whose MovieID or Abstract is missing are dropped by the grouping.
+        Returns: ["MovieID","Triple","Abstract"]
+        """
+        # Build triples once; the helper also sets RDF["Triple"]
+        triples = self.build_triple(RDF)
+
+        # Minimal frame for grouping (avoid carrying extra columns)
+        tmp = pd.DataFrame({
+            "MovieID":  RDF["MovieID"].to_numpy(),
+            "Abstract": RDF["Abstract"].to_numpy(),
+            "Triple":   triples.to_numpy(),
+        })
+
+        # Group on the real keys. Mapping factorize codes back by position is unsafe:
+        # a missing value gets code -1, which would pick the LAST unique label.
+        grouped = (
+            tmp.groupby(["MovieID", "Abstract"], sort=False)["Triple"]
+            .agg("".join)
+            .reset_index()
+        )
+
+        # Final tokens
+        grouped["Triple"]   = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
+        grouped["Abstract"] = SpecialToken.ABSTRACT.value         + grouped["Abstract"]
+
+        return grouped[["MovieID", "Triple", "Abstract"]]
--- a/Scripts/DataCleaning/legacy/filter.py
+++ b/Scripts/DataCleaning/legacy/filter.py
@@ -0,0 +1,191 @@
+# This module removes unwanted relationships from the pipeline according to different rules
+import pandas as pd
+import sqlite3
+import numpy as np
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+
+class PipelineApplier():
+    """
+    Pandas cleaning pipeline: relationship/movie filters, special-token tagging and
+    triple building over an RDF DataFrame.
+    """
+
+    def __init__(self):
+        # Frequency filters; filled by generate_frequency_movie_filter() and
+        # generate_frequency_relationship_filter().
+        self.MOVIE_FILTER = pd.DataFrame()
+        self.REL_FILTER = pd.DataFrame()
+        # Banned RelationshipURI values. Starts empty so that
+        # delete_relationship_by_list_filter() is a no-op (instead of raising
+        # AttributeError) when generate_list_relationship_filter() was never called.
+        self.relationship_filter_list: set[str] = set()
+
+
+    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
+        """Return the rows of RDF whose RelationshipURI differs from `uri`."""
+        return RDF[RDF["RelationshipURI"]!= uri]
+
+    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
+        """Store RelationshipURI filters as a set """
+        self.relationship_filter_list: set[str] = set(filter_list)
+
+    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Remove rows whose RelationshipURI is in the stored filter. Generate it first by calling generate_list_relationship_filter"""
+        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
+
+    # def filter_movie_by_rel_uri_frequence()
+
+    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
+        """
+        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
+        since this method creates such filter
+        Args:
+            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
+            min_treshold (int): inclusive lower bound on Count
+            max_treshold (int): exclusive upper bound on Count
+        """
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold]
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold]
+        self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]
+
+    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
+        """
+        You MUST call this before filter_by_frequency_relationship(), since this method creates such filter
+        Args:
+            REL_COUNT (pd.DataFrame): ["RelationshipURI","Count"]
+            min_treshold (int): inclusive lower bound on Count
+            max_treshold (int): exclusive upper bound on Count
+        """
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold]
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold]
+        self.REL_FILTER = REL_COUNT #["RelationshipURI"]
+
+    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Keep the rows whose MovieID appears in self.MOVIE_FILTER["MovieID"]."""
+        RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
+        return RDF
+
+    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Keep the rows whose RelationshipURI appears in self.REL_FILTER["RelationshipURI"]."""
+        RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
+        return RDF
+
+    def rdf_add_special_token(self, RDF: pd.DataFrame):
+        """
+        Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
+        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
+        It only adds the special token of the three elements of the RDF, no other special token.
+        Args:
+            RDF (pd.DataFrame):
+        Returns:
+            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        """
+        # if a filter that ran before sliced the RDF and created a view, the copy resolves the problem
+        # for more context: SettingWithCopyWarning
+        RDF = RDF.copy()
+        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
+        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
+        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
+        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
+        return RDF
+
+
+    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Drop the rows whose SubjectURI, RelationshipURI or ObjectURI is empty or missing."""
+        # Replace empty strings with NaN (applies to every column of the frame)
+        RDF = RDF.replace('', np.nan)
+        # Drop rows where any of the key columns are NaN
+        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
+        return RDF
+
+    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Builds one row per movie: every RDF row becomes a triple wrapped in the START/END
+        special tokens and all the triples of a movie are joined together.
+        Side-effect: sets RDF["Triple"] on the given DataFrame.
+
+        Args:
+            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # first combine each row, creating the column "Triple" (with its special tokens)
+        self.build_triple(RDF)
+        # then collapse the rows of each movie into one
+        return self.group_by_movie_from_triple(RDF)
+
+    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Joins the already built triples of each movie into a single row.
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add special token for: start of the triple list and start of abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+
+    @staticmethod
+    def build_triple(RDF: pd.DataFrame):
+        """
+        Obtains joined RDF triple in one element, together with START and END special token
+        Side-effect: sets RDF["Triple"] on the given DataFrame.
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            pd.Series: RDF["Triple"] (just this column)
+        """
+        # let's combine each row creating column triple as join of rdf
+        RDF["Triple"] =  RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # special token
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_incomplete_triple(RDF: pd.DataFrame):
+        """
+        Method helper used for the third task: "Predicting a masked component within an RDF triple".
+        Obtains joined RDF triple in one element, together with START and END special token.
+        The MISSING element will be replaced by the special token <MASK>
+        Side-effect: sets RDF["Triple"] on the given DataFrame.
+        Args:
+            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            RDF["Triple"]: pd.Series  (just this column, NOT A DATAFRAME)
+        """
+        # a column of MASK tokens as long as the dataframe, used in place of any absent column
+        # (more robust than assuming which column is missing, but slower)
+        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)
+
+        RDF["Triple"] =  (
+                    RDF.get("SubjectURI", MISSING) +
+                    RDF.get("RelationshipURI", MISSING) +
+                    RDF.get("ObjectURI", MISSING))
+        # special token
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
+        # currently not used
+        """
+        Method helper used for the third task: "Predicting a masked component within an RDF triple".
+        Given two DataFrames, the first containing the incomplete RDF and the other only the missing component,
+        this method is meant to apply the special tokens. Not implemented: it always returns None.
+        Args:
+            RDF_incomplete (pd.DataFrame): the RDF without one of its components
+            MISSING (pd.DataFrame): the missing component
+
+        Returns:
+            None
+        """
+        # take an example dataframe as ["SubjectURI",""]
+        # as input two dataframe, one with 2 column
+        return None
+
+    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Cleans ObjectURI in place: runs of newlines become ", " and asterisks are removed.
+        The column is converted to the pandas "string" dtype. Returns the same DataFrame.
+        """
+        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
+                   .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
+                   .str.replace(r"\*", "", regex=True))        # delete all asterisks
+
+        return RDF
--- a/Scripts/DataCleaning/legacy/pipeline.py
+++ b/Scripts/DataCleaning/legacy/pipeline.py
@@ -0,0 +1,145 @@
+import re
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+from Scripts.DataCleaning.legacy.filter import PipelineApplier
+# tasks dataset builder
+from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
+from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
+from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
+from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
+from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
+
+import pandas as pd
+
+class Pipeline():
+    """
+    Legacy cleaning pipeline.
+
+    It iterates over the chunks yielded by SqlEndpoint.get_abbreviated_dataset_by_movie_id(),
+    passes each chunk through the PipelineApplier filters and hands the result to the
+    per-task dataset writers (RDF mask, BPE corpus, RDF/text, RDF completion).
+    """
+    def __init__(self):
+        """Creates the SQL endpoint, the task writers and configures every filter."""
+        self.sql_endpoint = SqlEndpoint()
+        # classes that manage the tasks' datasets
+        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
+        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
+        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
+        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
+
+        # prepare the filter
+        # the filter applier needs to know the frequency of Movies and Relationships across the whole dataset
+        self.filter_applier = PipelineApplier()
+        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
+        REL_COUNT = self.sql_endpoint.get_relationship_count()
+        # NOTE(review): the numeric arguments look like (min, max) frequency bounds - confirm in legacy/filter.py
+        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
+        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069 
+        # prepare the filter with the RelationshipURIs to delete:
+        relationship_uri_banned_list = [
+            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
+            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
+            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
+            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type", 
+            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
+            "dbp-dbo:soundRecording"
+            ]
+        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)
+
+
+    def execute_task_bpe_corpus(self):
+        """Writes the ["Triple","Abstract"] columns of every rebuilt chunk to the BPE corpus, then closes all writers."""
+        for RDF in self._get_cleaned_movie_rows():
+            RDF = self.filter_applier.rebuild_by_movie(RDF)
+            RDF = RDF[["Triple","Abstract"]]
+            self.task_bpe_corpus.write_from_df(RDF)
+        self._end_file_handler()
+
+
+    def execute_task_rdf_mask(self):
+        """Writes every cleaned chunk, as is, to the RDF mask dataset, then closes all writers."""
+        for RDF in self._get_cleaned_movie_rows():
+            self.task_rdf_mask.write(RDF)
+        self._end_file_handler()
+
+
+    def execute_tasks_rdf_text(self):
+        """Writes every chunk rebuilt by rebuild_by_movie to the RDF/text dataset, then closes all writers."""
+        for RDF in self._get_cleaned_movie_rows():
+            RDF = self.filter_applier.rebuild_by_movie(RDF)
+            self.task_rdf_text.write(RDF)
+        self._end_file_handler()
+
+
+    def execute_task_rdf_completation(self):
+        """Builds the "Triple" column and writes ["MovieID","Triple"] to the completion dataset, then closes all writers."""
+        for RDF in self._get_cleaned_movie_rows():
+            RDF["Triple"] = self.filter_applier.build_triple(RDF)
+            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
+        self._end_file_handler()
+
+
+    def execute_all_task(self):
+        """Runs all four tasks in a single pass over the cleaned chunks, then closes all writers."""
+        for RDF in self._get_cleaned_movie_rows():
+            # the mask task receives the chunk before the "Triple" column is added
+            self.task_rdf_mask.write(RDF)
+
+            RDF["Triple"] = self.filter_applier.build_triple(RDF)
+            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
+
+            RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])
+
+            self.task_rdf_text.write(RDF)
+            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
+
+        self._end_file_handler()
+
+
+    def _end_file_handler(self):
+        """Closes every task writer, including the ones the executed task did not use."""
+        self.task_bpe_corpus.close()
+        self.task_rdf_mask.close()
+        self.task_rdf_text.close()
+        self.task_rdf_completation.close()
+
+
+    def _get_cleaned_movie_rows(self):
+        """
+        Generator over the cleaned chunks.
+
+        Each chunk yielded by the SQL endpoint goes through, in this order: NaN removal,
+        movie-frequency filter, relationship-frequency filter, banned-relationship filter,
+        regex clean-up of ObjectURI and, if it is not empty, special-token insertion.
+        Empty chunks are skipped.
+        """
+        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
+            RDF = self.filter_applier.drop_na_from_dataset(RDF)
+            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
+            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
+            # other filter
+            #
+            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
+            # regex on ObjectURI
+            RDF = self.filter_applier.regex_on_objects(RDF)
+            if RDF.empty:
+                continue
+            RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING: THIS MUST BE DONE AFTER THE FREQUENCY FILTERS
+            yield RDF
+
+
+    def use_toy_dataset(self):
+        """Restricts the SQL endpoint to a fixed list of ten movie ids."""
+        # CHOSEN MOVIES:
+        # The Dark Knight   : 117248
+        # Inception         : 147074
+        # The Avengers      : 113621
+        # Cast Away         : 1123
+        # The Departed      : 117586
+        # American Psycho   : 90177
+        # Avatar            : 71587
+        # Django Unchained  : 138952
+        # Spirited Away     : 144137
+        # Knives Out        : 148025
+        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        self.sql_endpoint.movie_ids = movie_list
+
+    def generate_csv_debug_file(self, debug_path:str):
+        """
+        Writes every cleaned chunk to a Debug_csv created on debug_path.
+
+        Args:
+            debug_path (str): path handed to Debug_csv.
+        """
+        debug_csv = Debug_csv(debug_path)
+
+        for RDF in self._get_cleaned_movie_rows():
+            debug_csv.write(RDF)
+
+        debug_csv.close()
+
+
+# There are a lot of settings to manage.
+# You only need to change the settings:
+# - in __init__: file paths, frequency filter limits, banned RelationshipURIs
+# - in use_toy_dataset: the movies of the toy dataset
+# - in _get_cleaned_movie_rows: how the pipeline behaves
+
+# NOTE(review): these statements sit at module level, so they also run whenever this file is imported
+pipeline = Pipeline()
+
+pipeline.use_toy_dataset()
+# pipeline.execute_task_bpe_corpus()
+# pipeline.execute_task_rdf_mask()
+# pipeline.execute_tasks_rdf_text()
+# pipeline.execute_task_rdf_completation()
+# pipeline.execute_all_task()
+pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
--- a/Scripts/DataCleaning/path_splitter_tree.py
+++ b/Scripts/DataCleaning/path_splitter_tree.py
@@ -6,8 +6,16 @@ from typing import Self

 class ProgramArgs:

-    def __init__(self, file: str, output: str, treshold: int):
+    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
+        """
+        Args:
+            file (str): 
+            csv_uri_header (str): The name of the CSV column from which the program reads the URIs
+            output (str): 
+            treshold (int): 
+        """        
        self.file = file
+        self.csv_uri_header = csv_uri_header
        self.output = output
        self.treshold = treshold

@@ -33,11 +41,15 @@ class Node:
        KEY = child[0]

        if not self.children.get(KEY):
+            # if the key has no value, it means we are traversing this branch for the first time
+            # create another node for the key
            self.children[KEY] = Node(KEY, 0)

+        # take the node for the key
        CHILD = self.children[KEY]
        self.quantity += 1

+        # if the child list to enter has only one element, which is KEY, no more node will be created
        if len(child) == 1:
            return

@@ -53,27 +65,32 @@ def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
+    PARSER.add_argument("--header-name", "-c", required=True, type=str)                       # c stands for column
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--treshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)

    # print(parsed_args.input_file)

-    return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold)  # type ignore
+    return ProgramArgs(parsed_args.input_file, parsed_args.header_name ,parsed_args.output_file, parsed_args.treshold)  # type ignore


 def get_debug_args() -> ProgramArgs:
-
-    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
+    # -i ./Assets/Dataset/1-hop/movies.csv  -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
+    FILE = "./Assets/Dataset/1-hop/movies.csv"
+    CSV_HEADER = "subject"
+    OUTPUT = "./Assets/Dataset/Tmp/prova.csv"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
+        CSV_HEADER,
+        OUTPUT,
        TRESHOLD
    )


-def tree_like(file: str, out: str):
+def tree_like(file: str, csv_uri_header:str, out: str):

    INDENTATION = "    "

@@ -84,9 +101,11 @@ def tree_like(file: str, out: str):

    FILE = open(file, "r", encoding="utf-8")

-    for row in FILE:
+    # It is needed the header-name
+    for row in csv.DictReader(FILE):

-        sections = row.split("/")
+        uri_element = row[csv_uri_header]
+        sections = uri_element.split("/")
        sections = list(filter(lambda item: item != "", sections))

        # print(sections)
@@ -115,7 +134,9 @@ def tree_like(file: str, out: str):

        INDENT: str = INDENTATION * DEPTH

-        if NODE.quantity < ARGS.treshold:
+        # Leaf nodes have quantity 0, so the threshold must be 0 for them to appear
+        # if NODE.quantity < ARGS.treshold:
+        if ARGS.treshold > NODE.quantity:
            continue

        OUT.write(f"{INDENT}- {NODE}\n")
@@ -133,7 +154,8 @@ def tree_like(file: str, out: str):
    OUT.close()


+
 if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
-    tree_like(ARGS.file, ARGS.output)
+    tree_like(ARGS.file,ARGS.csv_uri_header, ARGS.output)
--- a/Scripts/DataCleaning/pipeline/cleaner.py
+++ b/Scripts/DataCleaning/pipeline/cleaner.py
@@ -0,0 +1,86 @@
+# This file deletes in the pipeline the unwanted relationship by different rules
+import pandas as pd
+import sqlite3
+import numpy as np
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+
+class PipelineApplier():
+    """
+    Stateless DataFrame transformations applied to the RDF rows of a movie:
+    NaN removal, ObjectURI clean-up, special-token insertion and triple building.
+    """
+
+    def __init__(self):
+        pass
+
+    def rdf_add_special_token(self, RDF: pd.DataFrame):
+        """
+        Adds the RDF special token to each element of the triple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
+        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
+        It only adds the special tokens of the three elements of the RDF, no other special token.
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI","RelationshipURI","ObjectURI"]
+        Returns:
+            pd.DataFrame: a copy of RDF with the three columns prefixed,
+            e.g. ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        """        
+        # if a filter that ran before sliced the RDF and created a view, the copy resolves it
+        # for more context: SettingWithCopyWarning
+        RDF = RDF.copy()
+        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token  
+        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
+        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
+        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
+        return RDF
+
+
+    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Drops the rows whose subject, relationship or object is missing.
+        Empty strings are treated as missing values in every column.
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI","RelationshipURI","ObjectURI"]
+        Returns:
+            pd.DataFrame: a new DataFrame without the incomplete rows
+        """
+        RDF = RDF.replace('', np.nan)
+        # Drop rows where any of the key columns are NaN
+        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
+        return RDF
+    
+    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Collapses all the rows of a movie into a single row holding the concatenated triples.
+        Note: a "Triple" column is added to the DataFrame received as argument.
+        Args:
+            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """        
+        # to execute this method you have to have iterated by movie_id
+        # because by design we want one row for each movie at the end
+        # MovieID and Abstract could be given as input for a more generic method
+        # first combine each row into a "Triple" wrapped in its START/END tokens
+        RDF["Triple"] = self.build_triple(RDF)
+        # combine rows into one
+        # MovieID and Abstract are 1 <-> 1, so grouping by both keeps one row per movie
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add the special tokens for: start of the triple list, start of abstract, end of sentence
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]+SpecialToken.END_OF_SENTENCE.value
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
+        return RDF[["MovieID","Triple","Abstract"]]
+
+
+    @staticmethod
+    def build_triple(RDF: pd.DataFrame):
+        """
+        Joins each RDF triple into one element, wrapped in the START and END special tokens.
+        Note: the result is also stored in RDF["Triple"], i.e. the argument is modified.
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            pd.Series: RDF["Triple"] (just this column)
+        """        
+        # combine each row creating the column "Triple" as the join of the rdf
+        RDF["Triple"] =  RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # special token 
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+
+    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Normalizes the text of the "ObjectURI" column: every run of consecutive line breaks
+        (Unix "\\n" or Windows "\\r\\n") becomes a single ", " and every asterisk is removed.
+        Args:
+            RDF (pd.DataFrame): must contain the column "ObjectURI"
+        Returns:
+            pd.DataFrame: a new DataFrame with the cleaned "ObjectURI" (pandas "string" dtype);
+            the DataFrame received as argument is left untouched
+        """
+        cleaned_objects = (
+            RDF["ObjectURI"].astype("string")
+            # the whole "\r?\n" unit is repeated, so "\r\n\r\n" becomes one ", " and not ", , "
+            .str.replace(r"(?:\r?\n)+", ", ", regex=True)
+            # literal replacement: delete all asterisks
+            .str.replace("*", "", regex=False)
+        )
+        # assign() returns a new frame: no SettingWithCopyWarning when RDF is a filtered slice
+        return RDF.assign(ObjectURI=cleaned_objects)
--- a/Scripts/DataCleaning/pipeline/movie_filter.py
+++ b/Scripts/DataCleaning/pipeline/movie_filter.py
@@ -0,0 +1,103 @@
+import pandas as pd
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+class MovieFilter:
+    """
+    Progressively narrows down the set of MovieIDs to keep.
+
+    MOVIE_FILTER is a DataFrame with the single column "MovieID"; every filter method
+    re-queries the database restricted to the current ids and replaces MOVIE_FILTER
+    with the surviving ones. All values are bound as SQL parameters, never interpolated.
+    """
+
+    def __init__(self) -> None:
+        self.sql_endpoint = SqlEndpoint()
+        # first obtain all movie_id
+        movie_query = "SELECT MovieID FROM Movies"
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)
+
+
+    @staticmethod
+    def _placeholders(count: int) -> str:
+        """Returns `count` comma-separated "?" markers for an SQL IN (...) list."""
+        return ",".join(["?"] * count)
+
+
+    def _current_movie_ids(self) -> list:
+        """Returns the MovieIDs that survived the filters applied so far."""
+        return self.MOVIE_FILTER["MovieID"].to_list()
+
+
+    def frequency_filter(self, min_treshold:int, max_treshold:int):
+        """
+        Keeps the movies whose number of RDF rows is between the two thresholds (inclusive).
+        Args:
+            min_treshold (int): minimum number of rows in RDFs
+            max_treshold (int): maximum number of rows in RDFs
+        """
+        movie_ids = self._current_movie_ids()
+
+        filter_query = f"""
+            SELECT MovieID
+            FROM RDFs
+            WHERE MovieID IN ({self._placeholders(len(movie_ids))})
+            GROUP BY MovieID
+            HAVING COUNT(*) BETWEEN ? AND ?;
+        """
+        # parameters follow the order of the "?" markers in the query text
+        params = tuple(movie_ids) + (min_treshold, max_treshold)
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+        
+    def get_movie_id(self):
+        """Returns the DataFrame ["MovieID"] with the movies that survived the filters."""
+        return self.MOVIE_FILTER
+    
+
+    def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
+        """
+        Keeps the movies in which the given relationship occurs a number of times
+        between the two thresholds (inclusive).
+        Args:
+            parsed_rel_uri (str): the RelationshipURI as stored in ParsedRelationships
+            min_treshold (int): minimum number of occurrences
+            max_treshold (int): maximum number of occurrences
+        """
+        movie_ids = self._current_movie_ids()
+
+        filter_query = f"""
+            SELECT MovieID
+            FROM RDFs
+            JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
+            WHERE MovieID IN ({self._placeholders(len(movie_ids))})
+            GROUP BY MovieID
+            HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = ? THEN 1 ELSE 0 END)
+                BETWEEN ? AND ?;
+        """
+
+        # binding the URI keeps the query valid even when it contains a quote
+        params = tuple(movie_ids) + (parsed_rel_uri, min_treshold, max_treshold)
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+
+    def filter_by_director(self):
+        """Keeps the movies that have at least one director relationship."""
+        director_list = ['dbp-dbo:director','dbp-dbp:director']
+
+        movie_ids = self._current_movie_ids()
+
+        filter_query = f"""
+            SELECT DISTINCT RDFs.MovieID
+            FROM RDFs
+            JOIN ParsedRelationships USING (RelationshipID)
+            WHERE RDFs.MovieID IN ({self._placeholders(len(movie_ids))})
+            AND ParsedRelationships.RelationshipURI IN ({self._placeholders(len(director_list))});
+        """
+
+        params = tuple(movie_ids) + tuple(director_list)
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+
+    def filter_by_english_movies(self):
+        """
+        Keeps the movies whose language triples are exclusively English: at least one
+        language object must be in objects_list and none may be outside of it.
+        """
+        movie_ids = self._current_movie_ids()
+
+        relationship = ["dbp-dbp:language"]
+        objects_list = ["English", "dbp-dbr:English_language"]
+
+        relationship_marks = self._placeholders(len(relationship))
+        objects_marks = self._placeholders(len(objects_list))
+
+        filter_query = f"""
+            SELECT RDFs.MovieID
+            FROM RDFs
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            WHERE RDFs.MovieID IN ({self._placeholders(len(movie_ids))})
+            AND ParsedRelationships.RelationshipURI IN ({relationship_marks})
+            GROUP BY RDFs.MovieID
+            HAVING
+            SUM(CASE WHEN ParsedObjects.ObjectURI IN ({objects_marks}) THEN 1 ELSE 0 END) >= 1
+            AND
+            SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN ({objects_marks}) THEN 1 ELSE 0 END) = 0;
+        """
+
+        # objects_list is bound twice because it appears in both HAVING conditions
+        params = tuple(movie_ids) + tuple(relationship) + tuple(objects_list) + tuple(objects_list)
+        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+        
+
+# movie_filter = MovieFilter()
+# movie_filter.frequency_filter(5,10)
--- a/Scripts/DataCleaning/pipeline/pipeline.py
+++ b/Scripts/DataCleaning/pipeline/pipeline.py
@@ -0,0 +1,155 @@
+from movie_filter import MovieFilter
+from relationship_filter import RelationshipFilter
+from rdf_filter import RdfFilter
+from cleaner import PipelineApplier
+
+from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
+from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
+from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
+from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
+
+import pandas as pd
+
+# RelationshipURIs (abbreviated form) that are removed from the dataset.
+# Keep a comma after EVERY entry: Python silently concatenates adjacent string literals,
+# which would ban a non-existent URI and let the two real ones through.
+RELATIONSHIP_FILTER_LIST = [
+            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
+            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
+            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
+            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type", 
+            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
+            "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format", 
+            "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
+            "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
+            "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle", 
+            "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text", 
+            "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
+            "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point", 
+            "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt", 
+            "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
+            "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
+            "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
+            "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
+            "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
+            "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list", 
+            "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
+            "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
+            "dbp-dbp:website"
+            ]
+
+# RelationshipURIs (abbreviated form) to keep when whitelisting is enabled.
+# Currently only referenced by the commented-out call to
+# get_relationship_id_from_white_list(...) in Pipeline._get_cleaned_movie_rows.
+RELATIONSHIP_WHITE_LIST = [
+            "dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
+            ]
+# The bare string below is a no-op expression statement holding an SQL snippet.
+# NOTE(review): presumably a scratch query to list the relationships left in the debug CSV - confirm.
+""" 
+SELECT DISTINCT field3
+FROM debug
+"""
+
+class Pipeline():
+    """
+    Cleaning pipeline: selects the movies and relationships to keep through the filter
+    classes, cleans the RDF rows of each movie and writes the per-task datasets
+    (BPE corpus, RDF/text, RDF completion) under ./Assets/Dataset/Tmp/.
+
+    The writers are always closed when a task ends, even if it fails midway.
+    """
+
+    def __init__(self) -> None:
+        self._movie_filter = MovieFilter()
+        self._relationship_filter = RelationshipFilter()
+        self._rdf_filter = RdfFilter()
+        self._pipeline = PipelineApplier()
+
+        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
+        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
+        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
+
+        # keep movies having between 50 and 3000 RDF rows
+        self._movie_filter.frequency_filter(50,3000)
+        # keep relationships occurring between 25 and 2395627 times
+        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069 
+        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
+
+    def other_filter(self):
+        """Applies the optional, stricter movie filters; each one narrows the current movie set."""
+        self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
+        self._movie_filter.filter_by_director()
+        self._movie_filter.filter_by_english_movies()
+        self._movie_filter.relation_filter("dbp-dbp:budget",1,100)      # the most important films have the budget relationship
+        self._movie_filter.relation_filter("dbp-dbp:released",1,100)    # to cut to 2000 :(
+
+    def _get_cleaned_movie_rows(self):
+        """
+        Generator yielding, for each selected movie, the DataFrame of its cleaned RDF rows
+        (no NaN, normalized ObjectURI, special tokens added). Movies left without rows are skipped.
+        """
+        movie_ids = self._movie_filter.get_movie_id()
+        rel_ids = self._relationship_filter.get_relationship_id()
+        # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)
+
+        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
+            RDF = self._pipeline.drop_na_from_dataset(RDF)
+            # the next steps never change the number of rows: skip empty frames right away
+            if RDF.empty:
+                continue
+
+            RDF = self._pipeline.regex_on_objects(RDF)
+            RDF = self._pipeline.rdf_add_special_token(RDF)
+            yield RDF
+
+
+    def execute_task_bpe_corpus(self):
+        """Writes the ["Triple","Abstract"] columns of every movie to the BPE corpus."""
+        try:
+            for RDF in self._get_cleaned_movie_rows():
+                RDF = self._pipeline.rebuild_by_movie(RDF)
+                RDF = RDF[["Triple","Abstract"]]
+                self.task_bpe_corpus.write_from_df(RDF)
+        finally:
+            self._end_file_handler()
+
+
+    def execute_tasks_rdf_text(self):
+        """Writes one row per movie (["MovieID","Triple","Abstract"]) to the RDF/text dataset."""
+        try:
+            for RDF in self._get_cleaned_movie_rows():
+                RDF = self._pipeline.rebuild_by_movie(RDF)
+                self.task_rdf_text.write(RDF)
+        finally:
+            self._end_file_handler()
+
+
+    def execute_task_rdf_completation(self):
+        """Writes one row per triple (["MovieID","Triple"]) to the RDF completion dataset."""
+        try:
+            for RDF in self._get_cleaned_movie_rows():
+                RDF["Triple"] = self._pipeline.build_triple(RDF)
+                self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
+        finally:
+            self._end_file_handler()
+        
+
+    def _end_file_handler(self):
+        """Closes every task writer, including the ones the executed task did not use."""
+        self.task_bpe_corpus.close()
+        self.task_rdf_text.close()
+        self.task_rdf_completation.close()
+
+
+    def execute_all_task(self):
+        """Runs the three tasks in a single pass over the cleaned movies."""
+        try:
+            for RDF in self._get_cleaned_movie_rows():
+                # build_triple adds a column to its argument: work on a copy so that
+                # rebuild_by_movie still receives the untouched rows
+                completation_RDF = RDF.copy()
+                completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
+                self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])
+
+                RDF = self._pipeline.rebuild_by_movie(RDF)
+
+                self.task_rdf_text.write(RDF)
+                self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
+        finally:
+            self._end_file_handler()
+
+    
+    def use_toy_dataset(self):
+        """Replaces the movie selection with a fixed list of ten movie ids."""
+        # CHOSEN MOVIES:
+        # The Dark Knight   : 117248
+        # Inception         : 147074
+        # The Avengers      : 113621
+        # Cast Away         : 1123
+        # The Departed      : 117586
+        # American Psycho   : 90177
+        # Avatar            : 71587
+        # Django Unchained  : 138952
+        # Spirited Away     : 144137
+        # Knives Out        : 148025
+        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
+        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
+
+    def generate_csv_debug_file(self, debug_path:str):
+        """
+        Writes every cleaned movie DataFrame to a Debug_csv created on debug_path.
+        Args:
+            debug_path (str): path handed to Debug_csv
+        """
+        debug_csv = Debug_csv(debug_path)
+
+        try:
+            for RDF in self._get_cleaned_movie_rows():
+                debug_csv.write(RDF)
+        finally:
+            # close the file even if a movie fails, so what was written is not lost
+            debug_csv.close()
+
+
+# NOTE(review): these statements sit at module level, so they also run whenever this file is imported
+pipe = Pipeline()
+# uncomment to work on the ten movies of the toy dataset only
+#pipe.use_toy_dataset()
+# narrow the movie selection with the optional filters
+pipe.other_filter()
+# pipe.execute_all_task()
+# dump the cleaned rows to a CSV for manual inspection
+pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
--- a/Scripts/DataCleaning/pipeline/rdf_filter.py
+++ b/Scripts/DataCleaning/pipeline/rdf_filter.py
@@ -0,0 +1,32 @@
+import pandas as pd
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+class RdfFilter:
+    """Reads from the database the abbreviated RDF rows of the selected movies."""
+
+    def __init__(self) -> None:
+        self.sql_endpoint = SqlEndpoint()
+
+
+    # def delete_hyperum_when_movie(self):
+        # purl:linguistics/gold/hypernym 
+        # is almost always "dbp-dbr:Movie" or "dbp-dbr:Film"
+        # banned triple
+
+    def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
+        """
+        Generator running one query per movie.
+        Args:
+            MOVIE_ID (pd.DataFrame): must contain the column "MovieID"
+            REL_ID (pd.DataFrame): must contain the column "RelationshipID"
+        Yields:
+            whatever get_dataframe_from_query returns for the movie, with the columns
+            MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract restricted to
+            the given relationships
+        """
+        allowed_relationships = tuple(REL_ID["RelationshipID"].to_list())
+        # one "?" marker for each allowed relationship
+        relationship_marks = ",".join("?" for _ in allowed_relationships)
+
+        QUERY = f"""
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID)
+                WHERE MovieID = (?) AND RelationshipID IN ({relationship_marks});
+                """        
+
+        selected_movies = MOVIE_ID["MovieID"].to_list()
+        for current_movie in selected_movies:
+            # the movie id is bound first, followed by the relationship ids
+            bound_values = (current_movie, *allowed_relationships)
+            yield self.sql_endpoint.get_dataframe_from_query(QUERY, params=bound_values)
--- a/Scripts/DataCleaning/pipeline/relationship_filter.py
+++ b/Scripts/DataCleaning/pipeline/relationship_filter.py
@@ -0,0 +1,54 @@
+import pandas as pd
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+class RelationshipFilter:
+    """
+    Progressively narrows down the set of RelationshipIDs to keep.
+
+    RELATIONSHIP_FILTER is a DataFrame with the single column "RelationshipID".
+    Every value used in a query is bound as an SQL parameter, never interpolated.
+    """
+
+    def __init__(self) -> None:
+        self.sql_endpoint = SqlEndpoint()
+        # first obtain all relationship_id
+        relationship_query = "SELECT RelationshipID FROM Relationships"
+        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)
+
+
+    def frequency_filter(self, min_treshold:int, max_treshold:int):
+        """
+        Keeps the relationships whose number of RDF rows is between the two thresholds (inclusive).
+        Args:
+            min_treshold (int): minimum number of rows in RDFs
+            max_treshold (int): maximum number of rows in RDFs
+        """
+        relationship_ids = self.RELATIONSHIP_FILTER["RelationshipID"].to_list()
+        ids_placeholder = ",".join(["?"] * len(relationship_ids))
+
+        filter_query = f"""
+            SELECT RelationshipID
+            FROM RDFs
+            WHERE RelationshipID IN ({ids_placeholder})
+            GROUP BY RelationshipID
+            HAVING COUNT(*) BETWEEN ? AND ?;
+        """
+        # parameters follow the order of the "?" markers in the query text
+        params = tuple(relationship_ids) + (min_treshold, max_treshold)
+        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+        
+    def get_relationship_id(self):
+        """Returns the DataFrame ["RelationshipID"] with the relationships that survived the filters."""
+        return self.RELATIONSHIP_FILTER
+    
+    def get_relationship_id_from_white_list(self, relationship_list: list[str]):
+        """
+        Returns the surviving relationships whose URI is in the white list.
+        RELATIONSHIP_FILTER itself is not modified.
+        Args:
+            relationship_list (list[str]): RelationshipURIs as stored in ParsedRelationships
+        """
+        relationship_ids = self.RELATIONSHIP_FILTER["RelationshipID"].to_list()
+        ids_placeholder = ",".join(["?"] * len(relationship_ids))
+        uri_placeholder = ",".join(["?"] * len(relationship_list))
+        filter_query = f"""
+            SELECT RelationshipID
+            FROM ParsedRelationships
+            WHERE RelationshipID IN ({ids_placeholder})
+            AND RelationshipURI IN ({uri_placeholder});
+        """
+        params = tuple(relationship_ids) + tuple(relationship_list)
+        return self.sql_endpoint.get_dataframe_from_query(filter_query, params)
+
+
+
+    def delete_relationship_uri_by_list(self, filter_list: list[str]):
+        """
+        Removes from the surviving relationships the ones whose URI is in filter_list.
+        Args:
+            filter_list (list[str]): RelationshipURIs as stored in ParsedRelationships
+        """
+        relationship_ids = self.RELATIONSHIP_FILTER["RelationshipID"].to_list()
+        ids_placeholder = ",".join(["?"] * len(relationship_ids))
+        uri_placeholder = ",".join(["?"] * len(filter_list))
+
+        filter_query = f"""
+            SELECT RelationshipID
+            FROM ParsedRelationships
+            WHERE RelationshipID IN ({ids_placeholder})
+            AND RelationshipURI NOT IN ({uri_placeholder});
+        """
+        params = tuple(relationship_ids) + tuple(filter_list)
+        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
--- a/Scripts/DatasetMerging/SQL_Queries/db_creation.sql
+++ b/Scripts/DatasetMerging/SQL_Queries/db_creation.sql
@@ -1,65 +0,0 @@
-CREATE TABLE IF NOT EXISTS Movies (
-    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
-    MovieURI TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS WikiPageIDs (
-    MovieID INTEGER PRIMARY KEY,
-    PageID INTEGER UNIQUE NOT NULL,
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
-);
-
-
-CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
-    MovieID INTEGER PRIMARY KEY,
-    Abstract TEXT NOT NULL,
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
-);
-
-
-CREATE TABLE IF NOT EXISTS Origins (
-    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
-    OriginName TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS Subjects (
-    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
-    SubjectURI TEXT UNIQUE NOT NULL,
-    OriginID BIGINT NOT NULL,
-    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
-);
-
-
-CREATE TABLE IF NOT EXISTS Relationships (
-    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
-    RelationshipURI TEXT UNIQUE NOT NULL
-);
-
-
-CREATE TABLE IF NOT EXISTS Objects (
-    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
-    ObjectURI TEXT UNIQUE NOT NULL,
-    OriginID BIGINT NOT NULL,
-    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
-);
-
-CREATE TABLE IF NOT EXISTS RDFs (
-    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
-    MovieID INTEGER NOT NULL,
-    SubjectID INTEGER NOT NULL,
-    RelationshipID INTEGER NOT NULL,
-    ObjectID INTEGER NOT NULL,
-    UNIQUE(SubjectID, RelationshipID, ObjectID),
-    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
-    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
-    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
-    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
-);
-
-CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
-CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
-CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
-CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
-
--- a/Scripts/DatasetMerging/datawarehouse.py
+++ b/Scripts/DatasetMerging/datawarehouse.py
@@ -8,7 +8,7 @@ import csv
 #####################################################################

 # sometimes you may need to build a new db file, here a little snippet for you
-# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql  
+# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

 # --- Global configuration ---
 DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
@@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
 SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
 DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
 REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
+URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
+
+MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
+PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
+SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
+DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
+REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
+URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")

-MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
-PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
-SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
-DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
-REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")

 CONN = sqlite3.connect(DB_NAME)
 CURS = CONN.cursor()
@@ -30,7 +33,8 @@ CURS = CONN.cursor()
 # MARK: SQL Definitions
 # Insert MovieURI

-def insertOrigin(curs : sqlite3.Cursor ) -> bool:
+
+def insertOrigin(curs: sqlite3.Cursor) -> bool:

    QUERY = "INSERT INTO  Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
@@ -38,24 +42,26 @@ def insertOrigin(curs : sqlite3.Cursor ) -> bool:
        return True
    except sqlite3.IntegrityError:
        return False
-    
+
+
 def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:

    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
-    
+
    curs.execute(QUERY, [originName])
    originId = curs.fetchone()
    if not originId:
        return None
-    
+
    # in this case the real id is the first element of the tuple
    return originId[0]

-def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool:
+
+def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:

    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
-        curs.execute(QUERY,[movieUri])
+        curs.execute(QUERY, [movieUri])
        return True
    except sqlite3.IntegrityError:
        return False
@@ -64,12 +70,12 @@ def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool:
 def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:

    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
-    
+
    curs.execute(QUERY, [movieUri])
    movieId = curs.fetchone()
    if not movieId:
        return None
-    
+
    # in this case the real id is the first element of the tuple
    return movieId[0]

@@ -77,105 +83,164 @@ def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
 def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    QUERY = "INSERT INTO  WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
-        curs.execute(QUERY,[movieId, pageId])
+        curs.execute(QUERY, [movieId, pageId])
        return True
    except sqlite3.IntegrityError:
        return False
-    
-def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
+
+
+def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:

    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
-    
+
    curs.execute(QUERY, [pageId])
    movieId = curs.fetchone()
    if not movieId:
        return None
-    
+
    # in this case the real id is the first element of the tuple
    return movieId[0]

+
 def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    QUERY = "INSERT INTO  WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
-        curs.execute(QUERY,[movieId, abstract])
+        curs.execute(QUERY, [movieId, abstract])
        return True
    except sqlite3.IntegrityError:
        return False

+
 def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO  Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
-        curs.execute(QUERY,[subjectURI, originID])
+        curs.execute(QUERY, [subjectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False
-    
+
+
 def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    QUERY = "INSERT INTO  Relationships (RelationshipURI) VALUES (?);"
    try:
-        curs.execute(QUERY,[relationshipURI])
+        curs.execute(QUERY, [relationshipURI])
        return True
    except sqlite3.IntegrityError:
        return False

+
 def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO  objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
-        curs.execute(QUERY,[objectURI, originID])
+        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False
-    
+
+
 def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:

    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
-    
+
    curs.execute(QUERY, [subjectURI])
    subjectId = curs.fetchone()
    if not subjectId:
        return None
-    
+
    # in this case the real id is the first element of the tuple
    return subjectId[0]

+
 def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:

    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
-    
+
    curs.execute(QUERY, [relationshipURI])
    relationshipId = curs.fetchone()
    if not relationshipId:
        return None
-    
+
    # in this case the real id is the first element of the tuple
    return relationshipId[0]

+
 def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:

    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
-    
+
    curs.execute(QUERY, [objectURI])
    objectId = curs.fetchone()
    if not objectId:
        return None
-    
+
    # in this case the real id is the first element of the tuple
    return objectId[0]
-    
+
+
 def insertRDF(
-    curs: sqlite3.Cursor, 
-    movieId: int, 
+    curs: sqlite3.Cursor,
+    movieId: int,
    subjectId: int,
    relationshipId: int,
-    objectId: int 
+    objectId: int,
 ) -> bool:
    QUERY = "INSERT INTO  RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
-        curs.execute(QUERY,[movieId,subjectId,relationshipId,objectId])
+        curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
        return True
    except sqlite3.IntegrityError:
        return False
-    
+
+def insert_abbreviation(uri, abbreviation, curs: sqlite3.Cursor | None = None) -> bool:
+    """Insert one (URI, Abbreviation) row into the Abbreviations table.
+
+    Args:
+        uri: value stored in the URI column.
+        abbreviation: value stored in the Abbreviation column.
+        curs: cursor to execute on. When omitted the module-level CURS is
+            used, so existing two-argument calls keep working.
+
+    Returns:
+        True when the row was inserted, False when SQLite raised an
+        IntegrityError for it.
+    """
+    QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [uri, abbreviation])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def insert_object_abbreviation(
+    object_id, abbreviation_id, curs: sqlite3.Cursor | None = None
+) -> bool:
+    """Insert one (ObjectID, AbbreviationID) row into Objects_Abbreviations.
+
+    Args:
+        object_id: value stored in the ObjectID column.
+        abbreviation_id: value stored in the AbbreviationID column.
+        curs: cursor to execute on. When omitted the module-level CURS is
+            used, so existing two-argument calls keep working.
+
+    Returns:
+        True when the row was inserted, False when SQLite raised an
+        IntegrityError for it.
+    """
+    QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [object_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def insert_relationship_abbreviation(
+    relationship_id, abbreviation_id, curs: sqlite3.Cursor | None = None
+) -> bool:
+    """Insert one (RelationshipID, AbbreviationID) row into Relationships_Abbreviations.
+
+    Args:
+        relationship_id: value stored in the RelationshipID column.
+        abbreviation_id: value stored in the AbbreviationID column.
+        curs: cursor to execute on. When omitted the module-level CURS is
+            used, so existing two-argument calls keep working.
+
+    Returns:
+        True when the row was inserted, False when SQLite raised an
+        IntegrityError for it.
+    """
+    QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [relationship_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def insert_subject_abbreviation(
+    subject_id, abbreviation_id, curs: sqlite3.Cursor | None = None
+) -> bool:
+    """Insert one (SubjectID, AbbreviationID) row into Subjects_Abbreviations.
+
+    Args:
+        subject_id: value stored in the SubjectID column.
+        abbreviation_id: value stored in the AbbreviationID column.
+        curs: cursor to execute on. When omitted the module-level CURS is
+            used, so existing two-argument calls keep working.
+
+    Returns:
+        True when the row was inserted, False when SQLite raised an
+        IntegrityError for it.
+    """
+    QUERY = (
+        "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
+    )
+    cursor = CURS if curs is None else curs
+    try:
+        cursor.execute(QUERY, [subject_id, abbreviation_id])
+        return True
+    except sqlite3.IntegrityError:
+        return False
+
+def select_abbreviation_id(uri, curs: sqlite3.Cursor | None = None) -> int | None:
+    """Return the AbbreviationID of one Abbreviations row whose URI matches `uri`.
+
+    Args:
+        uri: right-hand side of a SQL LIKE comparison, so any `%` or `_` it
+            contains acts as a wildcard rather than a literal character.
+        curs: cursor to execute on. When omitted the module-level CURS is
+            used, so existing one-argument calls keep working.
+
+    Returns:
+        The AbbreviationID of the first row fetched, or None when no row
+        matches. The query has no ORDER BY, so when several rows match the
+        one returned is whichever SQLite yields first.
+    """
+    QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
+    cursor = CURS if curs is None else curs
+    cursor.execute(QUERY, [uri])
+    abbreviation_id = cursor.fetchone()
+    if not abbreviation_id:
+        return None
+
+    # fetchone() returns a one-element tuple; the id is its first item
+    return abbreviation_id[0]
+
+
 # MARK: Parsing
 def parseMovies():

@@ -203,12 +268,11 @@ def parseWikiPageId():
 def parseAbstract():
    CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
    for row in CSV_READER:
-        
+
        WIKI_PAGE_ID = int(row["subject"])
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)

-
        if MOVIE_ID is None:
            print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
            continue
@@ -216,10 +280,24 @@ def parseAbstract():
        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)


+def parseAbbreviations():
+    """Store every (uri, abbreviation) pair of the abbreviations CSV.
+
+    Reads URI_ABBR_CSV_HANDLER as a header-keyed CSV and passes the "uri" and
+    "abbreviation" columns of each row to insert_abbreviation.
+    """
+    for record in csv.DictReader(URI_ABBR_CSV_HANDLER):
+        insert_abbreviation(record["uri"], record["abbreviation"])
+
+
 def parseRDF_Reverse():

    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
-    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
+    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
+
+    if REVERSE_ORIGIN_ID is None:
+        return
+
    total = 0

    for row in REVERSE_CSV_READER:
@@ -227,7 +305,7 @@ def parseRDF_Reverse():
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
-        insertSubject(CURS,SUBJECT,REVERSE_ORIGIN_ID)
+        insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)

@@ -236,7 +314,6 @@ def parseRDF_Reverse():
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, OBJECT)

-
        skip = False

        # guard
@@ -259,17 +336,19 @@ def parseRDF_Reverse():
        if skip:
            continue

-        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
+        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
            total += 1

    print(total)


-
 def parseRDF_Dataset():

    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
-    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
+    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
+
+    if DATASET_ORIGIN_ID is None:
+        return

    total = 0
    rdf_idx = 0
@@ -284,7 +363,7 @@ def parseRDF_Dataset():
        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

-        insertSubject(CURS,SUBJECT,DATASET_ORIGIN_ID)
+        insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)

@@ -293,7 +372,6 @@ def parseRDF_Dataset():
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, SUBJECT)

-
        skip = False

        # guard
@@ -316,24 +394,203 @@ def parseRDF_Dataset():
        if skip:
            continue

-        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
+        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):  # type: ignore
            total += 1

    print(total)
-        
+
+
+def parseAbbr_Reverse():
+    """Link each subject, relationship and object of reverse.csv to an abbreviation.
+
+    For every CSV row the three URIs are resolved to their database ids; a row
+    with any unresolved id is reported and skipped. Each URI is then matched
+    against the Abbreviations table, longest prefix first, and the first hit
+    is stored through the matching insert_*_abbreviation helper. The number
+    of link rows actually inserted is printed at the end.
+    """
+
+    def link_longest_prefix(sections, stop, entity_id, insert_link) -> int:
+        # Returns 1 when a new link row was inserted, 0 otherwise.
+        # URIs with 4 or fewer "/"-separated sections are never looked up.
+        if len(sections) <= 4:
+            return 0
+        # Prefix sizes go from at most 7 sections down to stop + 1.
+        for size in range(min(len(sections), 7), stop, -1):
+            abbr_id = select_abbreviation_id("/".join(sections[:size]) + "%")
+            if abbr_id is not None:
+                # The first hit ends the search even when the link already
+                # exists; only freshly inserted links are counted.
+                return 1 if insert_link(entity_id, abbr_id) else 0
+        return 0
+
+    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
+
+    # Nothing is processed unless the 'reverse.csv' origin is registered.
+    if selectOrigin(CURS, "reverse.csv") is None:
+        return
+
+    total = 0
+
+    for row in REVERSE_CSV_READER:
+        SUBJECT = row["subject"]
+        RELATIONSHIP = row["relationship"]
+        OBJECT = row["object"]
+
+        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
+        OBJECT_ID = selectObjectId(CURS, OBJECT)
+        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
+
+        skip = False
+
+        # guard: report every missing id before skipping the row
+        if SUBJECT_ID is None:
+            print(f"No SubjectId for {SUBJECT}")
+            skip = True
+
+        if OBJECT_ID is None:
+            print(f"No ObjectId for {OBJECT}")
+            skip = True
+
+        if RELATIONSHIP_ID is None:
+            print(f"No RelationshipId for {RELATIONSHIP}")
+            skip = True
+
+        if skip:
+            continue
+
+        # Relationships are searched one section shorter (down to 3) than
+        # subjects and objects (down to 4).
+        total += link_longest_prefix(
+            SUBJECT.split("/"), 3, SUBJECT_ID, insert_subject_abbreviation
+        )
+        total += link_longest_prefix(
+            RELATIONSHIP.split("/"), 2, RELATIONSHIP_ID, insert_relationship_abbreviation
+        )
+        total += link_longest_prefix(
+            OBJECT.split("/"), 3, OBJECT_ID, insert_object_abbreviation
+        )
+
+    print(total)
+
+
+def parseAbbr_Dataset():
+    """Link each subject, relationship and object of dataset.csv to an abbreviation.
+
+    For every CSV row the three URIs are resolved to their database ids; a row
+    with any unresolved id is reported and skipped. Each URI is then matched
+    against the Abbreviations table, longest prefix first, and the first hit
+    is stored through the matching insert_*_abbreviation helper. A progress
+    line is printed every 100000 rows and the number of link rows actually
+    inserted is printed at the end.
+    """
+
+    def link_longest_prefix(sections, stop, entity_id, insert_link) -> int:
+        # Returns 1 when a new link row was inserted, 0 otherwise.
+        # URIs with 4 or fewer "/"-separated sections are never looked up.
+        if len(sections) <= 4:
+            return 0
+        # Prefix sizes go from at most 7 sections down to stop + 1.
+        for size in range(min(len(sections), 7), stop, -1):
+            abbr_id = select_abbreviation_id("/".join(sections[:size]) + "%")
+            if abbr_id is not None:
+                # The first hit ends the search even when the link already
+                # exists; only freshly inserted links are counted.
+                return 1 if insert_link(entity_id, abbr_id) else 0
+        return 0
+
+    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
+
+    # Nothing is processed unless the 'dataset.csv' origin is registered.
+    if selectOrigin(CURS, "dataset.csv") is None:
+        return
+
+    total = 0
+    rdf_idx = 0
+    for row in DATASET_CSV_READER:
+        SUBJECT = row["subject"]
+        RELATIONSHIP = row["relationship"]
+        OBJECT = row["object"]
+
+        rdf_idx += 1
+
+        # progress trace for long runs
+        if rdf_idx % 100000 == 0:
+            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
+
+        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
+        OBJECT_ID = selectObjectId(CURS, OBJECT)
+        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
+
+        skip = False
+
+        # guard: report every missing id before skipping the row
+        if SUBJECT_ID is None:
+            print(f"No SubjectId for {SUBJECT}")
+            skip = True
+
+        if OBJECT_ID is None:
+            print(f"No ObjectId for {OBJECT}")
+            skip = True
+
+        if RELATIONSHIP_ID is None:
+            print(f"No RelationshipId for {RELATIONSHIP}")
+            skip = True
+
+        if skip:
+            continue
+
+        # Relationships are searched one section shorter (down to 3) than
+        # subjects and objects (down to 4).
+        total += link_longest_prefix(
+            SUBJECT.split("/"), 3, SUBJECT_ID, insert_subject_abbreviation
+        )
+        total += link_longest_prefix(
+            RELATIONSHIP.split("/"), 2, RELATIONSHIP_ID, insert_relationship_abbreviation
+        )
+        total += link_longest_prefix(
+            OBJECT.split("/"), 3, OBJECT_ID, insert_object_abbreviation
+        )
+
+    print(total)
+

 # MARK: Actual Code
 # parseMovies()
 # parseWikiPageId()
 # parseAbstract()
 # insertOrigin(CURS)
+# parseAbbreviations()
 # parseRDF_Reverse()
 # parseRDF_Dataset()
+# parseAbbr_Reverse()
+parseAbbr_Dataset()


 CONN.commit()
 CONN.close()
-    


 MOVIES_CSV_HANDLER.close()
@@ -341,35 +598,36 @@ PAGEID_CSV_HANDLER.close()
 SUMMARY_CSV_HANDLER.close()
 DATASET_CSV_HANDLER.close()
 REVERSE_CSV_HANDLER.close()
+URI_ABBR_CSV_HANDLER.close()


 """
-The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId 
-The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId 
+The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
+The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
+The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
+The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
+The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
+The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
+The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
+The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
+The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
+The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
+The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
+The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
 """

 """
-The WikiPageId: 10068850 has not a MovieId 
-The WikiPageId: 55069615 has not a MovieId 
-The WikiPageId: 49510056 has not a MovieId 
-The WikiPageId: 4049786 has not a MovieId 
-The WikiPageId: 55510238 has not a MovieId 
-The WikiPageId: 31239628 has not a MovieId 
-The WikiPageId: 34757217 has not a MovieId 
-The WikiPageId: 64311757 has not a MovieId 
-The WikiPageId: 8326198 has not a MovieId 
-The WikiPageId: 42162164 has not a MovieId 
-The WikiPageId: 18502369 has not a MovieId 
-The WikiPageId: 58092358 has not a MovieId 
-The WikiPageId: 40710250 has not a MovieId 
-"""
+The WikiPageId: 10068850 has not a MovieId
+The WikiPageId: 55069615 has not a MovieId
+The WikiPageId: 49510056 has not a MovieId
+The WikiPageId: 4049786 has not a MovieId
+The WikiPageId: 55510238 has not a MovieId
+The WikiPageId: 31239628 has not a MovieId
+The WikiPageId: 34757217 has not a MovieId
+The WikiPageId: 64311757 has not a MovieId
+The WikiPageId: 8326198 has not a MovieId
+The WikiPageId: 42162164 has not a MovieId
+The WikiPageId: 18502369 has not a MovieId
+The WikiPageId: 58092358 has not a MovieId
+The WikiPageId: 40710250 has not a MovieId
+"""
--- a/Scripts/Libs/CleaningPipeline/.gitkeep
+++ b/Scripts/Libs/CleaningPipeline/.gitkeep
--- a/Scripts/Libs/CleaningPipeline/special_token.py
+++ b/Scripts/Libs/CleaningPipeline/special_token.py
@@ -0,0 +1,22 @@
+from enum import Enum
+
+class SpecialToken(str, Enum):
+    """Special tokens; each member's value is the literal marker string.
+
+    Members are also `str` instances because `str` is mixed in.
+    """
+    # Base order matters: declaring the bases as (Enum, str) throws an error
+    START_TRIPLE_LIST = "<SOTL>"
+    START_TRIPLE = "<SOT>"
+    END_TRIPLE = "<EOT>"
+    SUBJECT = "<SUBJ>"
+    RELATIONSHIP = "<PRED>"
+    OBJECT = "<OBJ>"
+    ABSTRACT = "<ABS>"
+    END_OF_SENTENCE = "<EOS>"
+    CORPUS_END = "<END>"
+
+    ## Task tokens
+    RDF_TO_TEXT = "<RDF2TXT>"
+    TEXT_TO_RDF = "<TEXT2RDF>"
+    CONTINUE_RDF = "<CONTINUERDF>"
+    MASK = "<MASK>"
+
+    #BPE Training:
+    
--- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -0,0 +1,149 @@
+#######################################################
+#   This file stands as endpoint to interact with DB  #
+#######################################################
+
+# import sqlite3
+from typing import Iterator
+
+import pandas as pd
+from sqlalchemy import create_engine
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+
+
+class SqlEndpoint():
+    """Read access point to the dataset database; every getter returns pandas data."""
+
+    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
+        """
+        Args:
+            DB_PATH: path of the SQLite file, inserted into a `sqlite:///` URL.
+            chunk_size_row: rows per chunk handed to pandas by the chunked getters.
+        """
+        # sqlite:///  (3 slashes) -> relative path
+        # sqlite://// (4 slashes) -> absolute path
+        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
+        # Earlier notes kept from the original author: a streamed connection
+        # (execution_options(stream_results=True)) was tried, but sqlite does
+        # not seem to support a streaming cursor, and PRAGMA tuning helps
+        # writes rather than reads.
+        self.chunk_size_row = chunk_size_row
+        # All MovieIDs, loaded once; iterated by get_abbreviated_dataset_by_movie_id
+        self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
+
+    def get_RDF(self) -> pd.DataFrame :
+        """
+        Returns:
+            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI
+        """
+        QUERY = """
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
+                FROM RDFs
+                INNER JOIN Subjects USING (SubjectID)
+                INNER JOIN Relationships USING (RelationshipID)
+                INNER JOIN Objects USING (ObjectID);
+                """
+
+        # self.CONN is never assigned by __init__; the engine is the only handle
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+    def get_chunked_abbreviated_dataset(self) -> Iterator[pd.DataFrame] :
+        """
+        Reads the abbreviated dataset in chunks of `chunk_size_row` rows.
+        With a `chunk_size_row` of None pandas returns one DataFrame instead.
+        Returns:
+            Iterator[pd.DataFrame]: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+        """
+
+        QUERY = """
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID);
+                """
+
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
+
+    def get_chunked_abbreviated_dataset_with_start_token(self) -> Iterator[pd.DataFrame]:
+        """
+        DEPRECATED. Same rows as get_chunked_abbreviated_dataset, with a token
+        prepended to each of the three URI columns by the query itself.
+        Returns:
+            Iterator[pd.DataFrame]: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+        """
+        # NOTE(review): the original built no parameters at all (and called
+        # SpecialToken() without a value, which raises). The tokens below are
+        # presumed from the column roles - confirm before un-deprecating.
+        start_tokens = (
+            SpecialToken.SUBJECT.value,
+            SpecialToken.RELATIONSHIP.value,
+            SpecialToken.OBJECT.value,
+        )
+        QUERY = """
+                SELECT 
+                    MovieID, 
+                    ? || SubjectURI AS SubjectURI,
+                    ? || RelationshipURI AS RelationshipURI, 
+                    ? || ObjectURI AS ObjectURI, 
+                    Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID);
+                """
+        return pd.read_sql_query(
+            QUERY, self.sql_engine, params=start_tokens, chunksize=self.chunk_size_row
+        )
+
+    def get_abbreviated_dataset_by_movie_id(self) -> Iterator[pd.DataFrame]:
+        """
+        Gets one DataFrame per movie (with all its rows in the dataset).
+        The retrieved RDFs are already abbreviated by the sql parser.
+        Yields:
+            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
+        """
+        # Chunking by MovieID: the abstract is the same for all rows of a
+        # movie, which makes some interesting per-movie logic applicable.
+        # Movie ids picked by the original author for experiments:
+        # The Dark Knight   : 117248
+        # Inception         : 147074
+        # The Avengers      : 113621
+        # Cast Away         : 1123
+        # The Departed      : 117586
+        # American Psycho   : 90177
+        # Avatar            : 71587
+        # Django Unchained  : 138952
+        # Spirited Away     : 144137
+        # Knives Out        : 148025
+
+        QUERY = """
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID)
+                WHERE MovieID = (?);
+                """
+
+        for movie_id in self.movie_ids:
+            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
+
+    def get_movies_id_count(self) -> pd.DataFrame:
+        """
+        Gets the number of RDF rows of each Movie in the Dataset
+        Returns:
+            Pandas.DataFrame: [MovieID, Count]
+        """
+        QUERY = """
+                SELECT MovieID, COUNT(*) AS Count
+                FROM RDFs
+                GROUP BY MovieID;
+                """
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+    def get_relationship_count(self) -> pd.DataFrame:
+        """
+        Gets the number of RDF rows of each Relationship in the Dataset
+        Returns:
+            Pandas.DataFrame: [RelationshipURI, Count]
+        """
+        QUERY = """
+                SELECT RelationshipURI, COUNT(*) AS Count
+                FROM RDFs
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                GROUP BY RelationshipURI;
+                """
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+    def get_dataframe_from_query(self, query: str, params=None):
+        """
+        Runs an arbitrary query on the engine.
+        Args:
+            query: SQL text to execute.
+            params: bound parameters; passed to pandas only when not None.
+        Returns:
+            Pandas.DataFrame: the query result
+        """
+        if params is None:
+            return pd.read_sql_query(query, self.sql_engine)
+        return pd.read_sql_query(query, self.sql_engine, params=params)
+
+
+
+# Manual smoke run: print the abbreviated dataset one movie at a time.
+if __name__ == "__main__":
+    endpoint = SqlEndpoint()
+    movie_frames = endpoint.get_abbreviated_dataset_by_movie_id()
+    for movie_frame in movie_frames:
+        print(movie_frame)
+    print("done")
--- a/Scripts/Libs/Utils/.gitkeep
+++ b/Scripts/Libs/Utils/.gitkeep
--- a/Scripts/Libs/Utils/dataframe_interaction.py
+++ b/Scripts/Libs/Utils/dataframe_interaction.py
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+
+def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
+    """Concatenate every cell of DF into one string.
+
+    Cells are visited row by row, left to right, converted with str() and
+    joined with no separator; the index is not included.
+
+    Args:
+        DF: the frame to flatten.
+
+    Returns:
+        The concatenated text, or '' for a frame without rows.
+    """
+    cells = (
+        str(cell)
+        for record in DF.itertuples(index=False, name=None)
+        for cell in record
+    )
+    return "".join(cells)
--- a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
@@ -0,0 +1,897 @@
+{
+  "type": "excalidraw",
+  "version": 2,
+  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
+  "elements": [
+    {
+      "id": "3zbCui3XtIGozHXTVAGRp",
+      "type": "rectangle",
+      "x": 316.5,
+      "y": 123,
+      "width": 436.5,
+      "height": 145.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a0",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1698427950,
+      "version": 35,
+      "versionNonce": 601575602,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "id": "wD66RDbG05HfvRhAtMb0J",
+          "type": "text"
+        },
+        {
+          "id": "gus_rxauKJ6T2L_F59PfN",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818588814,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "wD66RDbG05HfvRhAtMb0J",
+      "type": "text",
+      "x": 480.98004150390625,
+      "y": 183.25,
+      "width": 107.5399169921875,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a1",
+      "roundness": null,
+      "seed": 910769774,
+      "version": 31,
+      "versionNonce": 1120989938,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818416720,
+      "link": null,
+      "locked": false,
+      "text": "dataset.db",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "3zbCui3XtIGozHXTVAGRp",
+      "originalText": "dataset.db",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "87-MeaiZGT1wln0nggYPZ",
+      "type": "rectangle",
+      "x": 339.5,
+      "y": 309.5,
+      "width": 392,
+      "height": 156,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a2",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 655550318,
+      "version": 77,
+      "versionNonce": 1103939826,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818339000,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "EjUxEhZqEBzwvlw0VE9eJ",
+      "type": "rectangle",
+      "x": 355.5,
+      "y": 327,
+      "width": 162,
+      "height": 125.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a3",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1739846638,
+      "version": 64,
+      "versionNonce": 1594290034,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "ogRkV0neHrhEKTE6zlggl"
+        }
+      ],
+      "updated": 1758818391415,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "ogRkV0neHrhEKTE6zlggl",
+      "type": "text",
+      "x": 378.7100524902344,
+      "y": 377.25,
+      "width": 115.57989501953125,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a3V",
+      "roundness": null,
+      "seed": 2037675630,
+      "version": 12,
+      "versionNonce": 1286472046,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818399222,
+      "link": null,
+      "locked": false,
+      "text": "RDF_String",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "EjUxEhZqEBzwvlw0VE9eJ",
+      "originalText": "RDF_String",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "hoIRMNiMJZl4YDo-hovWy",
+      "type": "rectangle",
+      "x": 542.5,
+      "y": 327,
+      "width": 173,
+      "height": 125.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a4",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1189796530,
+      "version": 99,
+      "versionNonce": 1071057006,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "rsapATFAT5YSBCXzLupgZ"
+        },
+        {
+          "id": "gus_rxauKJ6T2L_F59PfN",
+          "type": "arrow"
+        },
+        {
+          "id": "Wk1bJbbtC31FqObEL5xWt",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818593647,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "rsapATFAT5YSBCXzLupgZ",
+      "type": "text",
+      "x": 585.6800384521484,
+      "y": 377.25,
+      "width": 86.63992309570312,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a5",
+      "roundness": null,
+      "seed": 829619694,
+      "version": 12,
+      "versionNonce": 713902318,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818405150,
+      "link": null,
+      "locked": false,
+      "text": "Abstract",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "hoIRMNiMJZl4YDo-hovWy",
+      "originalText": "Abstract",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "jSx8ApfhtRs_nk37VvDMb",
+      "type": "rectangle",
+      "x": 316.5,
+      "y": 511,
+      "width": 436.5,
+      "height": 145.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a6",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 492582894,
+      "version": 132,
+      "versionNonce": 893797614,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "6E23g-rgowNqHsBxX-LuM"
+        },
+        {
+          "id": "hyFKqXwet_F79QM71atgI",
+          "type": "arrow"
+        },
+        {
+          "id": "x_DP1FcQ7jraGz0gBuDi3",
+          "type": "arrow"
+        },
+        {
+          "id": "1IGbCps2EHnzKgJUWM5nq",
+          "type": "arrow"
+        },
+        {
+          "id": "Wk1bJbbtC31FqObEL5xWt",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818593647,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "6E23g-rgowNqHsBxX-LuM",
+      "type": "text",
+      "x": 499.9100341796875,
+      "y": 571.25,
+      "width": 69.679931640625,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a7",
+      "roundness": null,
+      "seed": 267696178,
+      "version": 132,
+      "versionNonce": 1668243186,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818543211,
+      "link": null,
+      "locked": false,
+      "text": "Pandas",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "jSx8ApfhtRs_nk37VvDMb",
+      "originalText": "Pandas",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "ohj18N4AOTDz5lJNcV9gi",
+      "type": "rectangle",
+      "x": 261,
+      "y": 765.5,
+      "width": 157,
+      "height": 87,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a8",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1446207150,
+      "version": 279,
+      "versionNonce": 317375026,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "id": "Ea1_ke2wA0D8ZjVOUtvfY",
+          "type": "text"
+        },
+        {
+          "id": "hyFKqXwet_F79QM71atgI",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "Ea1_ke2wA0D8ZjVOUtvfY",
+      "type": "text",
+      "x": 297.0800323486328,
+      "y": 796.5,
+      "width": 84.83993530273438,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a9",
+      "roundness": null,
+      "seed": 435116270,
+      "version": 199,
+      "versionNonce": 1282911218,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "text": "train.txt",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "ohj18N4AOTDz5lJNcV9gi",
+      "originalText": "train.txt",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "A4Y54Y26fe257U_QU9lxX",
+      "type": "rectangle",
+      "x": 464,
+      "y": 765.5,
+      "width": 157,
+      "height": 87,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aA",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 186148850,
+      "version": 232,
+      "versionNonce": 997119858,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "id": "v4TvUlDEjH7EvPDmtbOn2",
+          "type": "text"
+        },
+        {
+          "id": "1IGbCps2EHnzKgJUWM5nq",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "v4TvUlDEjH7EvPDmtbOn2",
+      "type": "text",
+      "x": 476.3500442504883,
+      "y": 796.5,
+      "width": 132.29991149902344,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aB",
+      "roundness": null,
+      "seed": 1131059634,
+      "version": 171,
+      "versionNonce": 239540530,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "text": "validation.txt",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "A4Y54Y26fe257U_QU9lxX",
+      "originalText": "validation.txt",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "mPaYpJ9Xn7tlJPmKPqJKJ",
+      "type": "rectangle",
+      "x": 674.5,
+      "y": 765.5,
+      "width": 157,
+      "height": 87,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aC",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1049323314,
+      "version": 235,
+      "versionNonce": 330560690,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "kg9nm2rpud6cax5aNPSnu"
+        },
+        {
+          "id": "x_DP1FcQ7jraGz0gBuDi3",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "kg9nm2rpud6cax5aNPSnu",
+      "type": "text",
+      "x": 711.4300231933594,
+      "y": 796.5,
+      "width": 83.13995361328125,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aD",
+      "roundness": null,
+      "seed": 522572142,
+      "version": 193,
+      "versionNonce": 1920372338,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "text": "test.txt",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
+      "originalText": "test.txt",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "hyFKqXwet_F79QM71atgI",
+      "type": "arrow",
+      "x": 534.65,
+      "y": 661.5,
+      "width": 195.25,
+      "height": 99,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aG",
+      "roundness": null,
+      "seed": 873266098,
+      "version": 71,
+      "versionNonce": 541154738,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          49.5
+        ],
+        [
+          -195.25,
+          49.5
+        ],
+        [
+          -195.25,
+          99
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "fixedPoint": [
+          0.49977090492554405,
+          1.034364261168385
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "endBinding": {
+        "elementId": "ohj18N4AOTDz5lJNcV9gi",
+        "fixedPoint": [
+          0.4993630573248406,
+          -0.05747126436781609
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": true,
+      "fixedSegments": null,
+      "startIsSpecial": null,
+      "endIsSpecial": null
+    },
+    {
+      "id": "x_DP1FcQ7jraGz0gBuDi3",
+      "type": "arrow",
+      "x": 534.65,
+      "y": 661.5,
+      "width": 218.25,
+      "height": 99,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aH",
+      "roundness": null,
+      "seed": 1210817582,
+      "version": 77,
+      "versionNonce": 1483392370,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818580594,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          49.5
+        ],
+        [
+          218.25,
+          49.5
+        ],
+        [
+          218.25,
+          99
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "fixedPoint": [
+          0.49977090492554405,
+          1.034364261168385
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "endBinding": {
+        "elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
+        "fixedPoint": [
+          0.4993630573248406,
+          -0.05747126436781609
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": true,
+      "fixedSegments": null,
+      "startIsSpecial": null,
+      "endIsSpecial": null
+    },
+    {
+      "id": "1IGbCps2EHnzKgJUWM5nq",
+      "type": "arrow",
+      "x": 534.65,
+      "y": 661.5,
+      "width": 0.5719232650604908,
+      "height": 99.07394122590165,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aK",
+      "roundness": null,
+      "seed": 1205316658,
+      "version": 96,
+      "versionNonce": 1748050674,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          -0.5719232650604908,
+          99.07394122590165
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "fixedPoint": [
+          0.49977090492554405,
+          1.034364261168385
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "endBinding": {
+        "elementId": "A4Y54Y26fe257U_QU9lxX",
+        "fixedPoint": [
+          0.44635717665566554,
+          -0.056621365219521276
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": true,
+      "fixedSegments": null,
+      "startIsSpecial": null,
+      "endIsSpecial": null
+    },
+    {
+      "id": "gus_rxauKJ6T2L_F59PfN",
+      "type": "arrow",
+      "x": 539,
+      "y": 271.5,
+      "width": 0,
+      "height": 33.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aL",
+      "roundness": null,
+      "seed": 763990258,
+      "version": 17,
+      "versionNonce": 1028811378,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818588814,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          33.5
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "3zbCui3XtIGozHXTVAGRp",
+        "focus": -0.019473081328751418,
+        "gap": 3
+      },
+      "endBinding": {
+        "elementId": "hoIRMNiMJZl4YDo-hovWy",
+        "focus": -1.0404624277456647,
+        "gap": 30.7545797799829
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": false
+    },
+    {
+      "id": "Wk1bJbbtC31FqObEL5xWt",
+      "type": "arrow",
+      "x": 536.5,
+      "y": 468.5,
+      "width": 0,
+      "height": 39,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aM",
+      "roundness": null,
+      "seed": 1489771054,
+      "version": 33,
+      "versionNonce": 1828178606,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818593647,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          39
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "hoIRMNiMJZl4YDo-hovWy",
+        "focus": 1.0693641618497107,
+        "gap": 27.157190169432425
+      },
+      "endBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "focus": 0.008018327605956525,
+        "gap": 3.5
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": false
+    }
+  ],
+  "appState": {
+    "gridSize": 20,
+    "gridStep": 5,
+    "gridModeEnabled": false,
+    "viewBackgroundColor": "#ffffff"
+  },
+  "files": {}
+}
--- a/Scripts/UML/CleaningPipeline/classes.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/classes.excalidraw.json
@@ -0,0 +1,826 @@
+{
+  "type": "excalidraw",
+  "version": 2,
+  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
+  "elements": [
+    {
+      "type": "line",
+      "version": 4622,
+      "versionNonce": 1623045672,
+      "isDeleted": false,
+      "id": "twu_PiAvEuQ4l1YYtZLET",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 289.8504963515835,
+      "y": 91.87474806402287,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.09201683999922,
+      "height": 99.49948667804088,
+      "seed": 1975340120,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0.2542098813493443,
+          75.20117273657175
+        ],
+        [
+          0.011896425679918422,
+          83.76249969444815
+        ],
+        [
+          3.970409367559332,
+          87.46174320643391
+        ],
+        [
+          17.75573317066317,
+          90.59250103325854
+        ],
+        [
+          41.05683533152865,
+          91.56737225214069
+        ],
+        [
+          63.319497586673116,
+          90.01084754868091
+        ],
+        [
+          75.14781395923075,
+          86.28844687220405
+        ],
+        [
+          76.81603792670788,
+          83.15042405259751
+        ],
+        [
+          77.05033394391478,
+          76.25776215104557
+        ],
+        [
+          76.86643881413028,
+          6.3089586511537865
+        ],
+        [
+          76.45188016352971,
+          -0.2999144698665015
+        ],
+        [
+          71.50179495549581,
+          -3.9936571317850627
+        ],
+        [
+          61.077971898861186,
+          -6.132877429442784
+        ],
+        [
+          37.32348754161154,
+          -7.932114425900202
+        ],
+        [
+          18.278415656797975,
+          -6.859225353587373
+        ],
+        [
+          3.2995959613238286,
+          -3.2201165291205287
+        ],
+        [
+          -0.04168289608444441,
+          -0.045185660461322996
+        ],
+        [
+          0,
+          0
+        ]
+      ],
+      "index": "a1",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2327,
+      "versionNonce": 1593094440,
+      "isDeleted": false,
+      "id": "hmJk4dH9VpOsfkrCTkhvh",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 290.3744257898585,
+      "y": 149.00103172175278,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 637665624,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a2",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2413,
+      "versionNonce": 311708712,
+      "isDeleted": false,
+      "id": "X1ldVIXm4DfBal5N2Pwn9",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 289.3425684673547,
+      "y": 120.03697638652972,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 904402520,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a3",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 5410,
+      "versionNonce": 92833576,
+      "isDeleted": false,
+      "id": "CFhp5ZxSVwHYzGUj4hEn1",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 288.28461948527263,
+      "y": 84.74247943834126,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 76.59753601865496,
+      "height": 15.49127539284798,
+      "seed": 1782811480,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [
+        "bxuMGTzXLn7H-uBCptINx"
+      ],
+      "index": "a4",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 820,
+      "versionNonce": 608002600,
+      "isDeleted": false,
+      "id": "B43R7rWwK2_vdiRHBSSPk",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 324.77660659049513,
+      "y": 109.21914711824485,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1298686040,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "a5",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1108,
+      "versionNonce": 1839127848,
+      "isDeleted": false,
+      "id": "CkKMb9wkJfVk04T217zSs",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 325.12774837442873,
+      "y": 135.43576140530996,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 2133497176,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "a6",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 991,
+      "versionNonce": 588838952,
+      "isDeleted": false,
+      "id": "SHJdKeQPkfpvzSoNH--3o",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 6.239590202363168,
+      "x": 325.77660659049513,
+      "y": 164.20448797661635,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 81668696,
+      "groupIds": [
+        "9PT4BXPfQ6UoCaB-T-h9A",
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "a7",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "text",
+      "version": 489,
+      "versionNonce": 2023207720,
+      "isDeleted": false,
+      "id": "vUSyMBPup0jZ71CYXKyGb",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": 280.1846389770508,
+      "y": 185.79462957545917,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 95.63072204589844,
+      "height": 23.595161071904883,
+      "seed": 425140056,
+      "groupIds": [
+        "dp_TZJyYdyPIH1hOkAPlb"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "fontSize": 17.4778970902999,
+      "fontFamily": 1,
+      "text": "dataset.db",
+      "baseline": 16.595161071904883,
+      "textAlign": "center",
+      "verticalAlign": "top",
+      "index": "a8",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1758646548051,
+      "link": null,
+      "locked": false,
+      "containerId": null,
+      "originalText": "dataset.db",
+      "autoResize": true,
+      "lineHeight": 1.350000000000001
+    },
+    {
+      "id": "R7pU0VP6CFKCAwuvt0xsr",
+      "type": "text",
+      "x": 295.5,
+      "y": 342,
+      "width": 374,
+      "height": 225,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a9",
+      "roundness": null,
+      "seed": 705463336,
+      "version": 1130,
+      "versionNonce": 72522328,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758648226024,
+      "link": null,
+      "locked": false,
+      "text": "class Extract(Action):\n    # Static\n    + type : ActionTypes = Extract\n    \n    # Properties\n    - db_connection: Path\n    - query: str\n    - query_parameters: [str]\n    - output_mapper: [str]",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Extract(Action):\n    # Static\n    + type : ActionTypes = Extract\n    \n    # Properties\n    - db_connection: Path\n    - query: str\n    - query_parameters: [str]\n    - output_mapper: [str]",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "G1xIRcJgm34_NMEWQFFlW",
+      "type": "text",
+      "x": 1419.5,
+      "y": 110,
+      "width": 253,
+      "height": 75,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aA",
+      "roundness": null,
+      "seed": 651981400,
+      "version": 256,
+      "versionNonce": 138082856,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758646570344,
+      "link": null,
+      "locked": false,
+      "text": "class Pipeline\n    - actions: [Action]\n    ",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Pipeline\n    - actions: [Action]\n    ",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "TBVy3JbJCkbA9kjVEJ8lv",
+      "type": "text",
+      "x": 694,
+      "y": 100,
+      "width": 495,
+      "height": 150,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aB",
+      "roundness": null,
+      "seed": 680960040,
+      "version": 560,
+      "versionNonce": 85012520,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649442239,
+      "link": null,
+      "locked": false,
+      "text": "class Action\n    + type: ActionTypes\n    + name: str\n    + depends_on: [str]\n\n    + execute(mem) -> [Dict<str, any>] | Void",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Action\n    + type: ActionTypes\n    + name: str\n    + depends_on: [str]\n\n    + execute(mem) -> [Dict<str, any>] | Void",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "an7KRTzWpCytKNKgHftKC",
+      "type": "text",
+      "x": 1528.5,
+      "y": 365.5,
+      "width": 187,
+      "height": 150,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aC",
+      "roundness": null,
+      "seed": 1974317656,
+      "version": 306,
+      "versionNonce": 1574962264,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758648154009,
+      "link": null,
+      "locked": false,
+      "text": "enum ActionTypes:\n    + Extract\n    + Aggregate\n    + Filter\n    + Map\n    + Dump",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "enum ActionTypes:\n    + Extract\n    + Aggregate\n    + Filter\n    + Map\n    + Dump",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "2pQ5EULirrWs_QZPbClhh",
+      "type": "text",
+      "x": 785,
+      "y": 332.5,
+      "width": 418,
+      "height": 375,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aH",
+      "roundness": null,
+      "seed": 1402251560,
+      "version": 742,
+      "versionNonce": 680432168,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649532881,
+      "link": null,
+      "locked": false,
+      "text": "class Aggregate(Action):\n    # Static\n    + type: ActionTypes = Aggregate\n\n    # Properties\n    - actionIDs: [str]\n    - associations: [Association]\n    - output_mapper: [str]\n\n    + execute(mem):\n        tables = mem.gather(actionIDs)\n\n        for join in associations:\n            \n            ",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Aggregate(Action):\n    # Static\n    + type: ActionTypes = Aggregate\n\n    # Properties\n    - actionIDs: [str]\n    - associations: [Association]\n    - output_mapper: [str]\n\n    + execute(mem):\n        tables = mem.gather(actionIDs)\n\n        for join in associations:\n            \n            ",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "O0fso8DJqFfwJEzmpUikM",
+      "type": "text",
+      "x": 1289,
+      "y": 195,
+      "width": 594,
+      "height": 100,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aI",
+      "roundness": null,
+      "seed": 1582329944,
+      "version": 459,
+      "versionNonce": 1080077144,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758647067031,
+      "link": null,
+      "locked": false,
+      "text": "input_mapper: \n    - key: ActionID (name) that produced such output\n    - value: list of strings that represent the values\n                to take",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "input_mapper: \n    - key: ActionID (name) that produced such output\n    - value: list of strings that represent the values\n                to take",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "v0kzO6vlBWOdJCV3yoG69",
+      "type": "text",
+      "x": 1379.5,
+      "y": 718.5,
+      "width": 286,
+      "height": 175,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aL",
+      "roundness": null,
+      "seed": 1462407976,
+      "version": 635,
+      "versionNonce": 1012998696,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649495598,
+      "link": null,
+      "locked": false,
+      "text": "class Association:\n    - from_actionID: str\n    - from_key_name: str\n    - from_value_name: str\n    - to_actionID: str\n    - to_value_name: str\n    - type: Type",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Association:\n    - from_actionID: str\n    - from_key_name: str\n    - from_value_name: str\n    - to_actionID: str\n    - to_value_name: str\n    - type: Type",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "WK34n9xeVxntypCtrlK6p",
+      "type": "text",
+      "x": 256.5,
+      "y": 787.5,
+      "width": 517,
+      "height": 175,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aM",
+      "roundness": null,
+      "seed": 1166526296,
+      "version": 318,
+      "versionNonce": 1042162520,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649002604,
+      "link": null,
+      "locked": false,
+      "text": "class Filter(Action):\n    # Static\n    + type: ActionTypes = Filter\n\n    # Properties\n    - compare: function(Dict<str, any>) -> bool\n    - output_mapper: [str]",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Filter(Action):\n    # Static\n    + type: ActionTypes = Filter\n\n    # Properties\n    - compare: function(Dict<str, any>) -> bool\n    - output_mapper: [str]",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "NY9jyUFLFFCNPE2sh00SX",
+      "type": "text",
+      "x": 1639,
+      "y": 606.5,
+      "width": 407,
+      "height": 200,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aP",
+      "roundness": null,
+      "seed": 20345896,
+      "version": 168,
+      "versionNonce": 627282472,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758649426380,
+      "link": null,
+      "locked": false,
+      "text": "class Map(Action):\n    # Static\n    + type: ActionTypes = Map\n\n    # Properties\n    - compare_mapper: [str]\n    - mapper: function(any...) -> any\n    - output_mapper: [str]",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Map(Action):\n    # Static\n    + type: ActionTypes = Map\n\n    # Properties\n    - compare_mapper: [str]\n    - mapper: function(any...) -> any\n    - output_mapper: [str]",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "SkhaoW-3TTKDZzEii3Lf6",
+      "type": "text",
+      "x": 1457.5,
+      "y": 955.5,
+      "width": 121,
+      "height": 50,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "#228be6",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aQ",
+      "roundness": null,
+      "seed": 2071523672,
+      "version": 37,
+      "versionNonce": 105260376,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758648834435,
+      "link": null,
+      "locked": false,
+      "text": "class Dump:\n    -",
+      "fontSize": 20,
+      "fontFamily": 8,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "class Dump:\n    -",
+      "autoResize": true,
+      "lineHeight": 1.25
+    }
+  ],
+  "appState": {
+    "gridSize": 20,
+    "gridStep": 5,
+    "gridModeEnabled": false,
+    "viewBackgroundColor": "#ffffff"
+  },
+  "files": {}
+}
--- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
@@ -0,0 +1,634 @@
+{
+  "type": "excalidraw",
+  "version": 2,
+  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
+  "elements": [
+    {
+      "id": "JNB9z-PeqZ4s8KDfWaoXe",
+      "type": "rectangle",
+      "x": 106,
+      "y": 27,
+      "width": 653,
+      "height": 263,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a2",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 710740889,
+      "version": 326,
+      "versionNonce": 1107631703,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759156408059,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "e13wNTgUpn2flMpmMttqx",
+      "type": "text",
+      "x": 200.5943407656526,
+      "y": 44.07937975075269,
+      "width": 307.2781467269385,
+      "height": 23.3097531902191,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a3",
+      "roundness": null,
+      "seed": 1012740663,
+      "version": 444,
+      "versionNonce": 589551257,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759156408059,
+      "link": null,
+      "locked": false,
+      "text": "Libs/CleaningPipeline/sql_endpoint",
+      "fontSize": 18.64780255217528,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Libs/CleaningPipeline/sql_endpoint",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "CgxCElJkKBtIHv-5WQrbo",
+      "type": "text",
+      "x": 195,
+      "y": 80.44259472749451,
+      "width": 403.64997665852184,
+      "height": 186.4780255217528,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a4",
+      "roundness": null,
+      "seed": 1261951799,
+      "version": 507,
+      "versionNonce": 1922906999,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759156408059,
+      "link": null,
+      "locked": false,
+      "text": "Class SqlEndpoint:\n    - sql_engine\n    + movie_ids: list[int]\n\n    #\n    + get_abbreviated_dataset_by_movie_id\n\n",
+      "fontSize": 18.64780255217528,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Class SqlEndpoint:\n    - sql_engine\n    + movie_ids: list[int]\n\n    #\n    + get_abbreviated_dataset_by_movie_id\n\n",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "type": "line",
+      "version": 4979,
+      "versionNonce": 1473849177,
+      "isDeleted": false,
+      "id": "sYReMTdYblr-oJtYYJALU",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -67.14432426259049,
+      "y": 87.19293561900287,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.09201683999922,
+      "height": 99.49948667804088,
+      "seed": 1263944119,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0.2542098813493443,
+          75.20117273657175
+        ],
+        [
+          0.011896425679918422,
+          83.76249969444815
+        ],
+        [
+          3.970409367559332,
+          87.46174320643391
+        ],
+        [
+          17.75573317066317,
+          90.59250103325854
+        ],
+        [
+          41.05683533152865,
+          91.56737225214069
+        ],
+        [
+          63.319497586673116,
+          90.01084754868091
+        ],
+        [
+          75.14781395923075,
+          86.28844687220405
+        ],
+        [
+          76.81603792670788,
+          83.15042405259751
+        ],
+        [
+          77.05033394391478,
+          76.25776215104557
+        ],
+        [
+          76.86643881413028,
+          6.3089586511537865
+        ],
+        [
+          76.45188016352971,
+          -0.2999144698665015
+        ],
+        [
+          71.50179495549581,
+          -3.9936571317850627
+        ],
+        [
+          61.077971898861186,
+          -6.132877429442784
+        ],
+        [
+          37.32348754161154,
+          -7.932114425900202
+        ],
+        [
+          18.278415656797975,
+          -6.859225353587373
+        ],
+        [
+          3.2995959613238286,
+          -3.2201165291205287
+        ],
+        [
+          -0.04168289608444441,
+          -0.045185660461322996
+        ],
+        [
+          0,
+          0
+        ]
+      ],
+      "index": "a6",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2684,
+      "versionNonce": 952947769,
+      "isDeleted": false,
+      "id": "0S6dEWQVqKUVkP6Z5IX1l",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -66.6203948243155,
+      "y": 144.31921927673278,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 817033943,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a7",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2770,
+      "versionNonce": 477619481,
+      "isDeleted": false,
+      "id": "szGLND7J0nVOvRkNXX9AS",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -67.65225214681931,
+      "y": 115.35516394150972,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 1704755191,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a8",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 5767,
+      "versionNonce": 2119031289,
+      "isDeleted": false,
+      "id": "O3t2uGktJlDd1_OX_bpV4",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -68.71020112890136,
+      "y": 80.06066699332126,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 76.59753601865496,
+      "height": 15.49127539284798,
+      "seed": 471296279,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [
+        "bxuMGTzXLn7H-uBCptINx"
+      ],
+      "index": "a9",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1177,
+      "versionNonce": 525480665,
+      "isDeleted": false,
+      "id": "_SzKlOBOvJgBg7FX0JTTM",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -32.218214023678854,
+      "y": 104.53733467322485,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1368927799,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "aA",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1465,
+      "versionNonce": 1410887609,
+      "isDeleted": false,
+      "id": "oJMl2Kxa3SPaiAY0kxo7A",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -31.867072239745255,
+      "y": 130.75394896028996,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1627606871,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "aB",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1348,
+      "versionNonce": 314839193,
+      "isDeleted": false,
+      "id": "fB6pJBSMA-pRHrpgYKaLL",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 6.239590202363168,
+      "x": -31.218214023678854,
+      "y": 159.52267553159635,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1420643447,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "aC",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "text",
+      "version": 846,
+      "versionNonce": 1091081593,
+      "isDeleted": false,
+      "id": "9gZ3Yy1MeP9kEOTLODqLG",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -76.81018163712321,
+      "y": 181.11281713043917,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 95.63072204589844,
+      "height": 23.595161071904883,
+      "seed": 2019206551,
+      "groupIds": [
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "fontSize": 17.4778970902999,
+      "fontFamily": 1,
+      "text": "dataset.db",
+      "baseline": 16.595161071904883,
+      "textAlign": "center",
+      "verticalAlign": "top",
+      "index": "aD",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759158252997,
+      "link": null,
+      "locked": false,
+      "containerId": null,
+      "originalText": "dataset.db",
+      "autoResize": true,
+      "lineHeight": 1.350000000000001
+    },
+    {
+      "id": "3eOw20xMhpB5jf_RMG24P",
+      "type": "text",
+      "x": 1131.3333333333335,
+      "y": 31.333333333333428,
+      "width": 508.3333333333333,
+      "height": 550,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aE",
+      "roundness": null,
+      "seed": 1535658041,
+      "version": 821,
+      "versionNonce": 1630266809,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759157181677,
+      "link": null,
+      "locked": false,
+      "text": "Class PipelineApplier\n    - movie_frequence_filter : pd.DataFrame()\n    - rel_Frequence_Filter : pd.DataFrame()\n    - rel_banned_list: list[str]\n\n    + generate_movie_frequency_filter()\n    + generate_rel_frequency_filter()\n    + generate_list_relationship_filter()\n    \n    + filter_by_movie_frequency()\n    + filter_by_relationship_frequency()\n    + delete_relationship_by_list_filter()\n    + delete_relationship_by_str()\n\n    + drop_na()    \n\n    + rdf_add_special_token()\n    + group_triple_by_movie()\n    + build_by_movie()\n    # static\n    + build_triple()\n    + build_incomplete_triple()",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Class PipelineApplier\n    - movie_frequence_filter : pd.DataFrame()\n    - rel_Frequence_Filter : pd.DataFrame()\n    - rel_banned_list: list[str]\n\n    + generate_movie_frequency_filter()\n    + generate_rel_frequency_filter()\n    + generate_list_relationship_filter()\n    \n    + filter_by_movie_frequency()\n    + filter_by_relationship_frequency()\n    + delete_relationship_by_list_filter()\n    + delete_relationship_by_str()\n\n    + drop_na()    \n\n    + rdf_add_special_token()\n    + group_triple_by_movie()\n    + build_by_movie()\n    # static\n    + build_triple()\n    + build_incomplete_triple()",
+      "autoResize": false,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "Fbl1gpb5r7QrdRauGUWm2",
+      "type": "text",
+      "x": 158.23809523809535,
+      "y": 502.52380952380935,
+      "width": 484.2857142857143,
+      "height": 500,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aF",
+      "roundness": null,
+      "seed": 2066618807,
+      "version": 552,
+      "versionNonce": 1269344823,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759158199532,
+      "link": null,
+      "locked": false,
+      "text": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n    #\n    - get_cleaned_movie_rows()\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n    #\n    - get_cleaned_movie_rows()\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
+      "autoResize": false,
+      "lineHeight": 1.25
+    }
+  ],
+  "appState": {
+    "gridSize": 20,
+    "gridStep": 5,
+    "gridModeEnabled": false,
+    "viewBackgroundColor": "#ffffff"
+  },
+  "files": {}
+}
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ tzdata==2025.2
 urllib3==2.5.0
 wheel==0.45.1
 Wikipedia-API==0.8.1
+SQLAlchemy
Author	SHA1	Message	Date
GassiGiuseppe	856c693650	Added possibility to whitelist relationships	2025-10-12 12:26:26 +02:00
GassiGiuseppe	e9d30b3cea	add divide method to create hold out dataset	2025-10-11 16:49:36 +02:00
GassiGiuseppe	ee12f53f12	Added EOS token	2025-10-07 22:47:59 +02:00
GassiGiuseppe	a04f4c7cb7	changes to shorten the dataset	2025-10-07 15:49:25 +02:00
GassiGiuseppe	a93e61b8c1	Update ETL	2025-10-07 00:54:00 +02:00
GassiGiuseppe	0373460105	Movie filters updated	2025-10-06 10:57:50 +02:00
GassiGiuseppe	7307916891	update sql_endpoint to work with the new pipeline	2025-10-05 14:58:03 +02:00
GassiGiuseppe	acb43fc899	new faster pipeline	2025-10-05 14:57:45 +02:00
GassiGiuseppe	255d801a80	updated the mask rdf_mask_task. however since the model will build the mask itself, it is deprecated	2025-10-05 14:56:33 +02:00
GassiGiuseppe	2bd24ec278	Created legacy folder for old pipeline; this pipeline still works but is slower than the new one, and some of its methods can be used later	2025-10-05 14:54:32 +02:00
GassiGiuseppe	69fba7c3e9	new utility to generate a csv debug file of the output of the pipeline	2025-10-04 21:33:09 +02:00
GassiGiuseppe	64e355e80c	Added regex to delete new lines and * from ObjectURI	2025-09-30 15:00:07 +02:00
GassiGiuseppe	007f1e9554	minor updates	2025-09-29 18:53:33 +02:00
GassiGiuseppe	c319398ca0	little update to UML pipeline	2025-09-29 17:03:31 +02:00
GassiGiuseppe	255d8a072d	First implementation of the cleaning pipeline UML	2025-09-29 16:59:52 +02:00
GassiGiuseppe	8167c9d435	Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class	2025-09-29 16:03:49 +02:00
GassiGiuseppe	bd72ad3571	Added file to execute the complete cleaning pipeline	2025-09-29 15:21:26 +02:00
GassiGiuseppe	6ddb7de9da	Added sqlAlchemy to requirements	2025-09-29 15:19:19 +02:00
GassiGiuseppe	650b37c586	Added vscode setting to execute jupyternotebook from root dir	2025-09-26 11:24:34 +02:00
GassiGiuseppe	e521b0704e	deleted TODO in path_splitter_tree, as it was already resolved	2025-09-25 19:19:11 +02:00
Christian Risi	0a698e9837	Added schema to extract from DB for BPE	2025-09-25 19:09:52 +02:00
GassiGiuseppe	9440a562f2	Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl	2025-09-25 18:33:51 +02:00
Christian Risi	5eda131aac	Fixed creation query to be unique even with movieID in RDFs	2025-09-25 17:58:09 +02:00
GassiGiuseppe	57884eaf2e	CSV support added to path_splitter_tree Also resolved a minor bug to print also leaf nodes	2025-09-25 17:57:46 +02:00
Christian Risi	4548a683c2	Fixed DB	2025-09-25 17:57:45 +02:00
GassiGiuseppe	3eec49ffa5	WIP: added test file: clean_relationship.jupyter to create a first cleaning pipeline	2025-09-25 16:28:24 +02:00
Christian Risi	0bc7f4b227	Fixed Typos	2025-09-25 12:37:52 +02:00
Christian Risi	f28952b0a2	Added todo	2025-09-25 12:00:26 +02:00
Christian Risi	0b626a8e09	Modified query to take all data	2025-09-25 11:53:12 +02:00
Christian Risi	b254098532	Added views to count for subjects and objects	2025-09-25 11:40:44 +02:00
Christian Risi	ee88ffe4cf	Added View to filter over relationship counts	2025-09-25 11:32:03 +02:00
Christian Risi	70b4bd8645	Added Complex query	2025-09-25 11:31:34 +02:00
Christian Risi	6316d2bfc4	Added queries to take data from SQL for dataset	2025-09-25 11:27:19 +02:00
Christian Risi	87ca748f45	Updated DB to reflect new changes	2025-09-24 19:29:57 +02:00
Christian Risi	4315d70109	Merged abbreviation_datawarehouse into datawarehouse	2025-09-24 19:29:43 +02:00
Christian Risi	9a5d633b5e	Fixed Typos	2025-09-24 19:29:07 +02:00
Christian Risi	a6760cd52d	Updated SQL Queries to support parsing in DB	2025-09-24 19:28:55 +02:00
GassiGiuseppe	a7eb92227d	Moved all db queries file in their own folder	2025-09-24 16:44:55 +02:00
GassiGiuseppe	9f221e31cd	Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl	2025-09-24 16:32:52 +02:00
GassiGiuseppe	47197194d5	WIP abbrevietion_datawarehouse to create an abbreviation system	2025-09-24 16:32:09 +02:00
Christian Risi	0cdbf6f624	Added query to retrieve a dirty dataset from SQLite DB	2025-09-24 16:15:47 +02:00
Christian Risi	3e30489f86	Updated Queries for DB	2025-09-24 14:44:53 +02:00
Christian Risi	8a22e453e4	Fixed csv	2025-09-24 14:44:25 +02:00
Christian Risi	7feb4eb857	Fixed URI generation	2025-09-24 14:44:07 +02:00
Christian Risi	70af19d356	Removed unused imports and added trailing slashes	2025-09-24 14:04:48 +02:00
Christian Risi	a4b44ab2ee	Fixed Typos	2025-09-24 14:04:27 +02:00
Christian Risi	74b6b609dd	Fixed typos	2025-09-24 13:59:19 +02:00
Christian Risi	59796c37cb	Added script to take dbpedia uris	2025-09-24 13:49:29 +02:00
Christian Risi	f696f5950b	Added uri-abbreviations	2025-09-24 13:48:53 +02:00
Christian Risi	605b496da7	Added barebone UML diagram for a Cleaning Pipeline	2025-09-23 19:49:01 +02:00
Christian Risi	7d693964dd	Added new directories to tree structure	2025-09-23 19:47:56 +02:00