Compare commits
117 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
856c693650 | ||
|
|
e9d30b3cea | ||
|
|
ee12f53f12 | ||
|
|
a04f4c7cb7 | ||
|
|
a93e61b8c1 | ||
|
|
0373460105 | ||
|
|
7307916891 | ||
|
|
acb43fc899 | ||
|
|
255d801a80 | ||
|
|
2bd24ec278 | ||
|
|
69fba7c3e9 | ||
|
|
64e355e80c | ||
|
|
007f1e9554 | ||
|
|
c319398ca0 | ||
|
|
255d8a072d | ||
|
|
8167c9d435 | ||
|
|
bd72ad3571 | ||
|
|
6ddb7de9da | ||
|
|
650b37c586 | ||
|
|
e521b0704e | ||
|
|
0a698e9837 | ||
|
|
9440a562f2 | ||
|
|
5eda131aac | ||
|
|
57884eaf2e | ||
|
|
4548a683c2 | ||
|
|
3eec49ffa5 | ||
|
|
0bc7f4b227 | ||
|
|
f28952b0a2 | ||
|
|
0b626a8e09 | ||
|
|
b254098532 | ||
|
|
ee88ffe4cf | ||
|
|
70b4bd8645 | ||
|
|
6316d2bfc4 | ||
|
|
87ca748f45 | ||
|
|
4315d70109 | ||
|
|
9a5d633b5e | ||
|
|
a6760cd52d | ||
|
|
a7eb92227d | ||
|
|
9f221e31cd | ||
|
|
47197194d5 | ||
|
|
0cdbf6f624 | ||
|
|
3e30489f86 | ||
|
|
8a22e453e4 | ||
|
|
7feb4eb857 | ||
|
|
70af19d356 | ||
|
|
a4b44ab2ee | ||
|
|
74b6b609dd | ||
|
|
59796c37cb | ||
|
|
f696f5950b | ||
|
|
605b496da7 | ||
|
|
7d693964dd | ||
|
|
25f401b577 | ||
|
|
14c5ade230 | ||
| 4c9c51f902 | |||
|
|
63c1a4a160 | ||
|
|
51114af853 | ||
|
|
3a6dca0681 | ||
|
|
346098d2b7 | ||
|
|
64f9b41378 | ||
|
|
ac1ed42c49 | ||
|
|
edd01a2c83 | ||
|
|
5aa9e3fcf3 | ||
|
|
0970cabf92 | ||
|
|
a26d92750f | ||
|
|
34c4782232 | ||
|
|
c5439533e6 | ||
|
|
8819b8e87f | ||
|
|
1076dc8aa6 | ||
|
|
3d15e03b09 | ||
|
|
0ee2ec6fcd | ||
|
|
95cfa5486c | ||
|
|
0d30e90ee0 | ||
|
|
faaba17a98 | ||
|
|
854e5f1d98 | ||
|
|
242d7f674f | ||
|
|
de8c2afceb | ||
|
|
f89dffff75 | ||
|
|
e39bad8348 | ||
|
|
7a1a221017 | ||
|
|
fafe6ae0f9 | ||
|
|
e32444df75 | ||
|
|
b74b7ac4f0 | ||
|
|
22134391d9 | ||
|
|
82c9023849 | ||
|
|
00b87e01ea | ||
|
|
ce3d4bf6c5 | ||
|
|
c415b175a0 | ||
|
|
ec81ea7930 | ||
|
|
4bb03f86b3 | ||
|
|
e5f201f3db | ||
|
|
1c715dc569 | ||
|
|
6686b47328 | ||
|
|
9a5a7d84fd | ||
|
|
9678ece9c0 | ||
|
|
67bcd732b5 | ||
|
|
1a4f900500 | ||
|
|
ca8729b67c | ||
|
|
9dbffc52ed | ||
|
|
b7f504942a | ||
|
|
7f0c5ce8d3 | ||
|
|
9838e287a4 | ||
|
|
ca6143ea3c | ||
|
|
16e7ab4d9f | ||
|
|
28723ab662 | ||
|
|
3e59efcf33 | ||
|
|
7c04309cc1 | ||
|
|
db87295890 | ||
|
|
61568200a8 | ||
|
|
8df2736b97 | ||
|
|
eb5b7f629a | ||
|
|
79232b391e | ||
|
|
72eb937b47 | ||
|
|
cececa14ce | ||
|
|
2487d44abd | ||
|
|
553b86cac2 | ||
|
|
12bd781fd3 | ||
|
|
463f4907b8 |
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1,2 +1,3 @@
|
|||||||
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
|
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
Assets/** filter=lfs diff=lfs merge=lfs -text
|
Assets/** filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text
|
||||||
|
|||||||
4
.gitignore
vendored
4
.gitignore
vendored
@ -191,6 +191,7 @@ ipython_config.py
|
|||||||
# Icon must end with two \r
|
# Icon must end with two \r
|
||||||
Icon
|
Icon
|
||||||
|
|
||||||
|
|
||||||
# Thumbnails
|
# Thumbnails
|
||||||
._*
|
._*
|
||||||
|
|
||||||
@ -251,3 +252,6 @@ $RECYCLE.BIN/
|
|||||||
# .nfs files are created when an open file is removed but is still being accessed
|
# .nfs files are created when an open file is removed but is still being accessed
|
||||||
.nfs*
|
.nfs*
|
||||||
|
|
||||||
|
# ---> Custom
|
||||||
|
**/Tmp/**
|
||||||
|
!**/.gitkeep
|
||||||
|
|||||||
14
.vscode/extensions.json
vendored
Normal file
14
.vscode/extensions.json
vendored
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"recommendations": [
|
||||||
|
"bierner.github-markdown-preview",
|
||||||
|
"bierner.markdown-checkbox",
|
||||||
|
"bierner.markdown-emoji",
|
||||||
|
"bierner.markdown-footnotes",
|
||||||
|
"bierner.markdown-mermaid",
|
||||||
|
"bierner.markdown-preview-github-styles",
|
||||||
|
"bierner.markdown-yaml-preamble",
|
||||||
|
"davidanson.vscode-markdownlint",
|
||||||
|
"kejun.markdown-alert",
|
||||||
|
"yzhang.markdown-all-in-one"
|
||||||
|
]
|
||||||
|
}
|
||||||
24
.vscode/settings.json
vendored
Normal file
24
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
// Always treat the project root as the working dir for Jupyter
|
||||||
|
"jupyter.notebookFileRoot": "${workspaceFolder}",
|
||||||
|
|
||||||
|
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
|
||||||
|
"python.terminal.executeInFileDir": false,
|
||||||
|
|
||||||
|
// Start new integrated terminals at the project root
|
||||||
|
"terminal.integrated.cwd": "${workspaceFolder}",
|
||||||
|
|
||||||
|
// Ensure Python can import from the project root no matter which file you run
|
||||||
|
// (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
|
||||||
|
"terminal.integrated.env.linux": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
|
||||||
|
// Make pytest run from the root without needing a pytest.ini
|
||||||
|
"python.testing.pytestEnabled": true,
|
||||||
|
"python.testing.cwd": "${workspaceFolder}",
|
||||||
|
"python.testing.pytestArgs": ["src/test"],
|
||||||
|
|
||||||
|
// Help Pylance resolve imports like `from src...` without red squiggles
|
||||||
|
"python.analysis.extraPaths": ["${workspaceFolder}"]
|
||||||
|
}
|
||||||
BIN
Assets/Dataset/1-hop/dataset.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/dataset.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/movie-pageid.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/movie-pageid.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/movies.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/movies.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/reverse.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/reverse.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/wikipedia-movie.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/wikipedia-movie.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/wikipedia-summary.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/wikipedia-summary.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/DatawareHouse/dataset.db
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/DatawareHouse/dataset.db
(Stored with Git LFS)
Normal file
Binary file not shown.
0
Assets/Dataset/Tmp/.gitkeep
Normal file
0
Assets/Dataset/Tmp/.gitkeep
Normal file
25
README.md
25
README.md
@ -1,3 +1,28 @@
|
|||||||
# NanoSocrates
|
# NanoSocrates
|
||||||
|
|
||||||
This is the work project for the DeepLearning exam of 16th September 2025
|
This is the work project for the DeepLearning exam of 16th September 2025
|
||||||
|
|
||||||
|
## Index
|
||||||
|
|
||||||
|
- [Resources](./docs/RESOURCES.md)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
Create and activate you Conda enviroment with:
|
||||||
|
|
||||||
|
conda env create -f environment.yaml
|
||||||
|
conda activate deep_learning
|
||||||
|
|
||||||
|
Now install dependencies on pip:
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
## TroubleShooting
|
||||||
|
|
||||||
|
Sometimes when uploading really large batch of data, git can stop the uploads thanks to the timeout.
|
||||||
|
The solution is to locally change its settings:
|
||||||
|
|
||||||
|
git config lfs.dialtimeout 3600
|
||||||
|
git config lfs.activitytimeout 3600
|
||||||
|
|
||||||
|
for clearance check the link: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory
|
||||||
30
Scripts/DataBaseQueries/dataset.sql
Normal file
30
Scripts/DataBaseQueries/dataset.sql
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
-- To pass to Pandas
|
||||||
|
SELECT *
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN Subjects USING (SubjectID)
|
||||||
|
INNER JOIN Relationships USING (RelationshipID)
|
||||||
|
INNER JOIN Objects USING (ObjectID);
|
||||||
|
|
||||||
|
-- To pass to Pandas for abstracts
|
||||||
|
SELECT *
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN WikipediaAbstracts USING (MovieID);
|
||||||
|
|
||||||
|
-- To pass to Pandas for abbreviations
|
||||||
|
SELECT *
|
||||||
|
FROM Abbreviations;
|
||||||
|
|
||||||
|
-- More complex to have clean dataset
|
||||||
|
-- More complex to have clean dataset
|
||||||
|
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN SubjectsCountInRDFs USING (SubjectID)
|
||||||
|
INNER JOIN RelationshipsCountInRDFs USING(RelationshipID)
|
||||||
|
INNER JOIN ObjectsCountInRDFs USING (ObjectID)
|
||||||
|
INNER JOIN ParsedSubjects USING (SubjectID)
|
||||||
|
INNER JOIN ParsedRelationships USING (RelationshipID)
|
||||||
|
INNER JOIN ParsedObjects USING (ObjectID)
|
||||||
|
INNER JOIN WikipediaAbstracts USING (MovieID)
|
||||||
|
-- insert WHERE here
|
||||||
|
-- WHERE SubjectID = 134626
|
||||||
|
GROUP BY MovieID;
|
||||||
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
CREATE TABLE IF NOT EXISTS Movies (
|
||||||
|
MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
MovieURI TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS WikiPageIDs (
|
||||||
|
MovieID INTEGER PRIMARY KEY,
|
||||||
|
PageID INTEGER UNIQUE NOT NULL,
|
||||||
|
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
|
||||||
|
MovieID INTEGER PRIMARY KEY,
|
||||||
|
Abstract TEXT NOT NULL,
|
||||||
|
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Origins (
|
||||||
|
OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
OriginName TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Subjects (
|
||||||
|
SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
SubjectURI TEXT UNIQUE NOT NULL,
|
||||||
|
OriginID BIGINT NOT NULL,
|
||||||
|
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Relationships (
|
||||||
|
RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
RelationshipURI TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Objects (
|
||||||
|
ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
ObjectURI TEXT UNIQUE NOT NULL,
|
||||||
|
OriginID BIGINT NOT NULL,
|
||||||
|
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS RDFs (
|
||||||
|
RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
MovieID INTEGER NOT NULL,
|
||||||
|
SubjectID INTEGER NOT NULL,
|
||||||
|
RelationshipID INTEGER NOT NULL,
|
||||||
|
ObjectID INTEGER NOT NULL,
|
||||||
|
UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
|
||||||
|
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
|
||||||
|
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
|
||||||
|
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
|
||||||
|
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Abbreviations (
|
||||||
|
AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
URI TEXT UNIQUE NOT NULL,
|
||||||
|
Abbreviation TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
|
||||||
|
SubjectID INTEGER NOT NULL,
|
||||||
|
AbbreviationID INTEGER NOT NULL,
|
||||||
|
PRIMARY KEY(SubjectID, AbbreviationID),
|
||||||
|
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
|
||||||
|
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
|
||||||
|
RelationshipID INTEGER NOT NULL,
|
||||||
|
AbbreviationID INTEGER NOT NULL,
|
||||||
|
PRIMARY KEY(RelationshipID, AbbreviationID),
|
||||||
|
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
|
||||||
|
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
|
||||||
|
ObjectID INTEGER NOT NULL,
|
||||||
|
AbbreviationID INTEGER NOT NULL,
|
||||||
|
PRIMARY KEY(ObjectID, AbbreviationID),
|
||||||
|
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
|
||||||
|
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
|
||||||
|
|
||||||
|
-- Views
|
||||||
|
-- Subjects
|
||||||
|
CREATE VIEW IF NOT EXISTS ParsedSubjects
|
||||||
|
AS
|
||||||
|
SELECT
|
||||||
|
SubjectID,
|
||||||
|
CASE WHEN Abbreviation IS NULL
|
||||||
|
THEN SubjectURI
|
||||||
|
ELSE Abbreviation || ':' || replace(SubjectURI, URI, '') END
|
||||||
|
AS SubjectURI
|
||||||
|
FROM Subjects
|
||||||
|
LEFT JOIN Subjects_Abbreviations USING (SubjectID)
|
||||||
|
LEFT JOIN Abbreviations USING (AbbreviationID);
|
||||||
|
|
||||||
|
-- Relationships
|
||||||
|
CREATE VIEW IF NOT EXISTS ParsedRelationships
|
||||||
|
AS
|
||||||
|
SELECT
|
||||||
|
RelationshipID,
|
||||||
|
CASE WHEN Abbreviation IS NULL
|
||||||
|
THEN RelationshipURI
|
||||||
|
ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '') END
|
||||||
|
AS RelationshipURI
|
||||||
|
FROM Relationships
|
||||||
|
LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
|
||||||
|
LEFT JOIN Abbreviations USING (AbbreviationID);
|
||||||
|
|
||||||
|
-- Objects
|
||||||
|
CREATE VIEW IF NOT EXISTS ParsedObjects
|
||||||
|
AS
|
||||||
|
SELECT
|
||||||
|
ObjectID,
|
||||||
|
CASE WHEN Abbreviation IS NULL
|
||||||
|
THEN ObjectURI
|
||||||
|
ELSE Abbreviation || ':' || replace(ObjectURI, URI, '') END
|
||||||
|
AS ObjectURI
|
||||||
|
FROM Objects
|
||||||
|
LEFT JOIN Objects_Abbreviations USING (ObjectID)
|
||||||
|
LEFT JOIN Abbreviations USING (AbbreviationID);
|
||||||
|
|
||||||
|
|
||||||
|
-- Subject Count
|
||||||
|
CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs
|
||||||
|
AS
|
||||||
|
SELECT SubjectID, count(SubjectID) as Sub_Count
|
||||||
|
FROM RDFs
|
||||||
|
GROUP BY SubjectID;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
-- Relationship Count
|
||||||
|
CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs
|
||||||
|
AS
|
||||||
|
SELECT RelationshipID, count(RelationshipID) as Rel_Count
|
||||||
|
FROM RDFs
|
||||||
|
GROUP BY RelationshipID;
|
||||||
|
|
||||||
|
|
||||||
|
-- Object Count
|
||||||
|
CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs
|
||||||
|
AS
|
||||||
|
SELECT ObjectID, count(ObjectID) as Obj_Count
|
||||||
|
FROM RDFs
|
||||||
|
GROUP BY ObjectID;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
55
Scripts/DataBaseQueries/query.sql
Normal file
55
Scripts/DataBaseQueries/query.sql
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
-- Insert MovieURI into Movies ; MovieID is auto incremental
|
||||||
|
INSERT INTO Movies (MovieURI) VALUES (?);
|
||||||
|
|
||||||
|
-- Get MovieID where MovieURI equal given value
|
||||||
|
SELECT MovieID FROM Movies WHERE MovieURI = ?;
|
||||||
|
|
||||||
|
-- SetPageId
|
||||||
|
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);
|
||||||
|
|
||||||
|
-- Get MovieId by PageID ... ( to create WikipediaAbstract)
|
||||||
|
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;
|
||||||
|
|
||||||
|
-- SetAbstract ...
|
||||||
|
|
||||||
|
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);
|
||||||
|
|
||||||
|
|
||||||
|
-- SetOrigin
|
||||||
|
---
|
||||||
|
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");
|
||||||
|
|
||||||
|
-- GetOrigin
|
||||||
|
SELECT OriginID FROM Origins WHERE OriginName = ?;
|
||||||
|
|
||||||
|
-- Subject, Relationship, Object, RDF
|
||||||
|
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
|
||||||
|
INSERT INTO Relationships (RelationshipURI) VALUES (?);
|
||||||
|
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);
|
||||||
|
|
||||||
|
SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
|
||||||
|
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
|
||||||
|
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
|
||||||
|
|
||||||
|
|
||||||
|
INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
|
||||||
|
|
||||||
|
-- Prefixes
|
||||||
|
INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
|
||||||
|
INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
|
||||||
|
INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
|
||||||
|
INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
|
||||||
|
|
||||||
|
-- Please be sure it is a URI before running this query
|
||||||
|
-- and take at least until the domain and the first path part
|
||||||
|
SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
|
||||||
|
|
||||||
|
-- Query to retrieve data
|
||||||
|
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN ParsedSubjects USING (SubjectID)
|
||||||
|
INNER JOIN ParsedRelationships USING (RelationshipID)
|
||||||
|
INNER JOIN ParsedObjects USING (ObjectID)
|
||||||
|
INNER JOIN WikipediaAbstracts USING (MovieID)
|
||||||
|
-- insert WHERE here
|
||||||
|
GROUP BY MovieID;
|
||||||
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "b9081b7c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# This file deletes in the pipeline the unwanted relationship by different rules\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import sqlite3\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
|
||||||
|
"\n",
|
||||||
|
"def get_RDF() -> pd.DataFrame:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" QUERY = \"SELECT * FROM RDFs \" \\\n",
|
||||||
|
" \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
|
||||||
|
" \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
|
||||||
|
" \"INNER JOIN Objects USING (ObjectID);\"\n",
|
||||||
|
" RDF = pd.read_sql_query(QUERY, CONN)\n",
|
||||||
|
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
|
||||||
|
" RDF = RDF.dropna()\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
|
||||||
|
" Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
|
||||||
|
" Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
|
||||||
|
" RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
|
||||||
|
"\n",
|
||||||
|
" # drop '' values \n",
|
||||||
|
" Subjects = Subjects.replace('', np.nan)# .dropna()\n",
|
||||||
|
" Relationships = Relationships.replace('', np.nan)# .dropna()\n",
|
||||||
|
" Objects = Objects.replace('', np.nan)# .dropna()\n",
|
||||||
|
"\n",
|
||||||
|
" # join RDF with its components\n",
|
||||||
|
" RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
|
||||||
|
" RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
|
||||||
|
" RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
|
||||||
|
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
|
||||||
|
" return RDF\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
|
||||||
|
"\n",
|
||||||
|
"def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
|
||||||
|
" return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"RDF = get_RDF()\n",
|
||||||
|
"# RDF = RDF.dropna()\n",
|
||||||
|
"# print(RDF)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "644690bb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
|
||||||
|
" counts = RDF[\"RelationshipURI\"].value_counts() \n",
|
||||||
|
" RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
|
||||||
|
" RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
|
||||||
|
" # counts is a series as key: relationship, value: count\n",
|
||||||
|
" # counts = counts[counts > count_treshold]\n",
|
||||||
|
" # relationships = counts.index\n",
|
||||||
|
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||||
|
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||||
|
" return RDF\n",
|
||||||
|
"\n",
|
||||||
|
"RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
|
||||||
|
"# print(new_RDF)\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "34525be6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" SubjectURI \\\n",
|
||||||
|
"0 http://dbpedia.org/resource/Nights_of_Cabiria \n",
|
||||||
|
"1 http://dbpedia.org/resource/California_Science... \n",
|
||||||
|
"2 http://dbpedia.org/resource/China_Captain \n",
|
||||||
|
"3 http://dbpedia.org/resource/Caravan_of_Courage... \n",
|
||||||
|
"4 http://dbpedia.org/resource/WHIH_Newsfront \n",
|
||||||
|
"... ... \n",
|
||||||
|
"12725500 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||||
|
"12725501 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||||
|
"12725502 http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
|
||||||
|
"12725503 http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
|
||||||
|
"12725504 http://dbpedia.org/resource/I_Won't_Play \n",
|
||||||
|
"\n",
|
||||||
|
" RelationshipURI \\\n",
|
||||||
|
"0 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"1 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"2 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"3 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
|
||||||
|
"... ... \n",
|
||||||
|
"12725500 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725501 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725502 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725503 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725504 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"\n",
|
||||||
|
" ObjectURI MovieID \\\n",
|
||||||
|
"0 http://dbpedia.org/resource/Cabiria 26 \n",
|
||||||
|
"1 http://dbpedia.org/resource/California_Academy... 185 \n",
|
||||||
|
"2 http://dbpedia.org/resource/Captain_China 614 \n",
|
||||||
|
"3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n",
|
||||||
|
"4 http://dbpedia.org/resource/Captain_America:_C... 594 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n",
|
||||||
|
"12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n",
|
||||||
|
"12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n",
|
||||||
|
"12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n",
|
||||||
|
"12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n",
|
||||||
|
"\n",
|
||||||
|
" RelationshipFreq MovieFreq \n",
|
||||||
|
"0 2132 216 \n",
|
||||||
|
"1 2132 264 \n",
|
||||||
|
"2 2132 66 \n",
|
||||||
|
"3 2132 131 \n",
|
||||||
|
"4 1653 133 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"12725500 80077 95 \n",
|
||||||
|
"12725501 80077 95 \n",
|
||||||
|
"12725502 80077 41 \n",
|
||||||
|
"12725503 80077 98 \n",
|
||||||
|
"12725504 80077 91 \n",
|
||||||
|
"\n",
|
||||||
|
"[12725505 rows x 6 columns]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
|
||||||
|
" counts = RDF[\"MovieID\"].value_counts() \n",
|
||||||
|
" RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
|
||||||
|
" RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
|
||||||
|
" RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
|
||||||
|
" # counts is a series as key: relationship, value: count\n",
|
||||||
|
" # counts = counts[counts > count_treshold]\n",
|
||||||
|
" # relationships = counts.index\n",
|
||||||
|
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||||
|
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||||
|
" return RDF\n",
|
||||||
|
"\n",
|
||||||
|
"RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
|
||||||
|
"print(RDF)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class BPE_corpus():
|
||||||
|
|
||||||
|
def __init__(self, output_path :str):
|
||||||
|
self.output_handler = open(output_path, "w")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
# add corpus end before closing
|
||||||
|
self.output_handler.write(SpecialToken.CORPUS_END.value)
|
||||||
|
self.output_handler.close()
|
||||||
|
|
||||||
|
def write_from_str(self, output: str):
|
||||||
|
if output == '':
|
||||||
|
return
|
||||||
|
self.output_handler.write(output)
|
||||||
|
|
||||||
|
def write_from_df(self, df: pd.DataFrame):
|
||||||
|
self.write_from_str(get_raw_from_dataframe(df))
|
||||||
21
Scripts/DataCleaning/data_output_models/debug_csv.py
Normal file
21
Scripts/DataCleaning/data_output_models/debug_csv.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class Debug_csv():
|
||||||
|
def __init__(self, output_path:str):
|
||||||
|
|
||||||
|
|
||||||
|
self.output = open(output_path, "w")
|
||||||
|
# then the first row as header
|
||||||
|
header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
||||||
|
self.output.write(",".join(header) + "\n")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.output.close()
|
||||||
|
|
||||||
|
def write(self, RDF: pd.DataFrame):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
||||||
|
"""
|
||||||
|
|
||||||
|
RDF.to_csv(self.output, index=False, header=False)
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class RDF_completation_task_dataset():
|
||||||
|
"""
|
||||||
|
Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
|
||||||
|
Each RDF is saved as str
|
||||||
|
CSV Composition: ["MovieID","RDF"]
|
||||||
|
"""
|
||||||
|
def __init__(self, output_path:str):
|
||||||
|
|
||||||
|
|
||||||
|
self.output = open(output_path, "w")
|
||||||
|
# then the first row as header
|
||||||
|
header = ["MovieID","RDF"]
|
||||||
|
self.output.write(",".join(header) + "\n")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.output.close()
|
||||||
|
|
||||||
|
def write(self, RDF: pd.DataFrame):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
RDF (pd.DataFrame): ["MovieID","RDF"]
|
||||||
|
"""
|
||||||
|
|
||||||
|
RDF.to_csv(self.output, index=False, header=False)
|
||||||
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# do not worry about circular dependencies, this class will never call something else
|
||||||
|
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
||||||
|
|
||||||
|
class RDF_mask_task_dataset():
    """
    Write the CSV for the third task: "Predicting a masked component within an RDF triple".

    For each RDF triple three rows are emitted, each with a different component masked.

    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
    """

    def __init__(self, output_path: str):
        """Open the output CSV, write its header, and bind the triple builders.

        Args:
            output_path (str): path of the CSV file to create (overwritten if present).
        """
        # These helpers are only used by this class, but they belong at a lower level.
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" prevents doubled line endings when pandas later writes CSV
        # rows through this text handle on Windows; utf-8 makes output platform-independent.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # First row is the header.
        header = ["MovieID", "IncompleteRDF", "Missing", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Release the underlying CSV file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Emit three masked rows per input triple (one per masked component).

        Args:
            RDF (pd.DataFrame): columns ["MovieID","SubjectURI","RelationshipURI","ObjectURI"]
                plus whatever the triple builders expect.
        """
        rdf_complete = self._build_triple(RDF)

        # One incomplete variant per masked component: the dropped column is
        # replaced by the <MASK> token inside build_incomplete_triple.
        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))

        df_subject = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_subject,
            "Missing": RDF["SubjectURI"],
            "RDF": rdf_complete,
        })

        df_relationship = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_relationship,
            "Missing": RDF["RelationshipURI"],
            "RDF": rdf_complete,
        })

        df_object = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_object,
            "Missing": RDF["ObjectURI"],
            "RDF": rdf_complete,
        })

        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)
|
||||||
|
|
||||||
|
|
||||||
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class RDF_text_task_dataset():
    """
    Write the CSV for the first two tasks: "Generating structured RDF triples
    from natural language text" and the reverse direction.

    In the CSV the RDFs of a movie are saved together as a single string.

    CSV Composition: ["MovieID","RDFs","Abstract"]
    """

    def __init__(self, output_path: str):
        """Open the output CSV and write its header row.

        Args:
            output_path (str): path of the CSV file to create (overwritten if present).
        """
        # newline="" prevents doubled line endings when pandas later writes CSV
        # rows through this text handle on Windows; utf-8 makes output platform-independent.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # First row is the header.
        header = ["MovieID", "RDFs", "Abstract"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Release the underlying CSV file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Append rows to the open CSV; the header was written by __init__.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||||
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramArgs:
    """Plain container for this script's command line options."""

    def __init__(self, file: str, output: str, treshold: int):
        # file: input path; output: destination path; treshold: kept for
        # interface parity (not read by print_dbpedia).
        self.file, self.output, self.treshold = file, output, treshold
|
||||||
|
|
||||||
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the known command line flags and wrap them in a ProgramArgs.

    Unknown arguments are tolerated (parse_known_args), so the raw sys.argv
    can be passed straight in.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    namespace, _unknown = parser.parse_known_args(args)
    return ProgramArgs(namespace.input_file, namespace.output_file, namespace.treshold)
|
||||||
|
|
||||||
|
|
||||||
|
def print_dbpedia(file: str, out: str):
    """Extract distinct dbpedia base URIs from a list of URLs.

    Reads one URL per line from *file* and, for each distinct
    scheme://domain/section prefix whose domain contains "dbpedia", writes a
    CSV-like line:

        "<prefix>/", "<sub>-db<t>"

    where <sub> is the first subdomain truncated to 3 characters and <t> is the
    first character of the path section (e.g. "r" for /resource/).

    Args:
        file (str): input path, one URL per line.
        out (str): output path (overwritten).
    """
    DOMAIN_PART = "dbpedia"
    already_parsed: set[str] = set()

    # "with" guarantees both handles are closed even if processing raises;
    # the original open()/close() pair leaked them on exceptions.
    with open(file, "r", encoding="utf-8") as FILE, \
         open(out, mode="w", encoding="utf-8") as OUT:

        for row in FILE:
            sections = row.split("/")
            sections = list(filter(lambda item: item != "", sections))

            # Need at least scheme, domain and one path section.
            if len(sections) < 3:
                continue

            # Rebuild "scheme://domain/section".
            URI = "/".join(sections[1:3])
            URI = "//".join([sections[0], URI])

            if URI in already_parsed:
                continue

            DOMAIN = sections[1]
            SUBDOMAINS = DOMAIN.split(".")
            TYPE = sections[2][0]

            if DOMAIN_PART not in SUBDOMAINS:
                continue

            already_parsed.add(URI)

            # Short id: first subdomain, truncated to 3 characters.
            SUB_ID = SUBDOMAINS[0]
            if len(SUB_ID) > 3:
                SUB_ID = SUB_ID[:3]

            OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Entry point: parse the CLI flags, then run the extraction.
    ARGS = get_args(sys.argv)
    print_dbpedia(ARGS.file, ARGS.output)
|
||||||
29
Scripts/DataCleaning/hold_out/divide.py
Normal file
29
Scripts/DataCleaning/hold_out/divide.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle a CSV and split it into train/validation/test frames.

    The three weights are interpreted relative to their sum, so any positive
    numbers work (70/15/15, 8/1/1, ...). Rounding losses are absorbed by the
    test split, which takes every remaining row, so each row is assigned
    exactly once.

    Args:
        csv_path: path of the CSV file to read.
        train, val, test: relative split weights.
        seed: random state for the reproducible shuffle.

    Returns:
        tuple: (train_df, val_df, test_df), each with a fresh 0-based index.
    """
    # Read and shuffle with a fixed seed for reproducibility.
    shuffled = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)

    weight_sum = train + val + test
    rows = len(shuffled)
    end_train = int(rows * train / weight_sum)          # floor keeps indices integral
    end_val = end_train + int(rows * val / weight_sum)  # test absorbs the remainder

    parts = (
        shuffled.iloc[:end_train],
        shuffled.iloc[end_train:end_val],
        shuffled.iloc[end_val:],
    )
    train_df, val_df, test_df = (p.reset_index(drop=True) for p in parts)
    return train_df, val_df, test_df
|
||||||
|
|
||||||
|
# usage: split the task-1/2 dataset into 80/10/10 hold-out files.
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"

train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)

# index=False: without it, to_csv prepends the positional index as an unnamed
# first column, which a later pd.read_csv would pick up as "Unnamed: 0".
train_df.to_csv(TRAIN, index=False)
val_df.to_csv(EVALUATION, index=False)
test_df.to_csv(TEST, index=False)
|
||||||
381
Scripts/DataCleaning/legacy/deprecated.py
Normal file
381
Scripts/DataCleaning/legacy/deprecated.py
Normal file
@ -0,0 +1,381 @@
|
|||||||
|
# This file deletes in the pipeline the unwanted relationship by different rules
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# SQL-FIRST VERSION
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# In the original (pandas) version this module:
|
||||||
|
# - stored frequency filters in DataFrames,
|
||||||
|
# - filtered/cleaned DataFrames in-memory,
|
||||||
|
# - added special tokens via string ops,
|
||||||
|
# - rebuilt one row per movie using groupby/aggregation.
|
||||||
|
#
|
||||||
|
# In this rewrite:
|
||||||
|
# - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
|
||||||
|
# - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
|
||||||
|
# composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
|
||||||
|
# - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - We keep the same CLASS and METHOD NAMES to preserve call sites.
|
||||||
|
# - Method comments/docstrings from your original file are carried over and updated
|
||||||
|
# to reflect Select-based behavior and return types.
|
||||||
|
# - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
|
||||||
|
# - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
|
||||||
|
# swap with an equivalent string-agg function.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from sqlalchemy import select, func, literal
|
||||||
|
from sqlalchemy.sql import Select
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier():
    """
    SQL-first pipeline applier.

    In the pandas version, frequency filters were stored as DataFrames
    (self.MOVIE_FILTER / self.REL_FILTER) and every method worked with and
    returned pandas.DataFrame. In this SQLAlchemy rewrite:

    - self.MOVIE_FILTER and self.REL_FILTER become *subselects* (Select objects)
      yielding a single column each (MovieID or RelationshipURI), applied via
      `WHERE IN (subquery)`.
    - Every method that previously returned a DataFrame now returns a *Select*
      representing the same logical transformation, pushed into the database
      engine. Nothing executes until `session.execute(...)` is called.
    """

    def __init__(self):
        # In the pandas version these were DataFrames storing allowed keys.
        # Here they are Select objects (single-column subselects) or None.
        # Expected column names:
        #   - self.MOVIE_FILTER: "MovieID"
        #   - self.REL_FILTER:   "RelationshipURI"
        self.MOVIE_FILTER: Optional[Select] = None
        self.REL_FILTER: Optional[Select] = None

    # -------------------------------------------------------------------------
    # Relationship deletion
    # -------------------------------------------------------------------------
    def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
        """
        Return a Select where rows having the given relationship URI are removed
        (WHERE RelationshipURI != <uri>).

        Args:
            RDF (Select): selectable with columns MovieID, SubjectURI,
                RelationshipURI, ObjectURI, Abstract
            uri (str): RelationshipURI to exclude

        Returns:
            Select: filtered selectable (no execution yet)
        """
        sc = RDF.selected_columns
        return RDF.where(sc.RelationshipURI != literal(uri))

    # -------------------------------------------------------------------------
    # Frequency filter: MOVIE
    # -------------------------------------------------------------------------
    def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering by movie frequency
        [filter_by_frequency_movie_id()], since this method creates such filter.

        Builds and stores (in self.MOVIE_FILTER) a single-column subselect of
        the MovieIDs whose Count lies in [min_treshold, max_treshold), to be
        used via WHERE IN. No query is executed here.

        Args:
            MOVIE_COUNT (Select): yields columns MovieID, Count
            min_treshold (int): inclusive lower bound
            max_treshold (int): exclusive upper bound
        """
        sc = MOVIE_COUNT.selected_columns
        filtered = MOVIE_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
        # Keep only the key column so it can be used in an IN (subquery)
        self.MOVIE_FILTER = select(filtered.selected_columns.MovieID)

    # -------------------------------------------------------------------------
    # Frequency filter: RELATIONSHIP
    # -------------------------------------------------------------------------
    def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
        """
        Same as generate_frequency_movie_filter, keyed on RelationshipURI:
        builds and stores (in self.REL_FILTER) a single-column subselect of the
        RelationshipURIs whose Count lies in [min_treshold, max_treshold).
        No query is executed here.

        Args:
            REL_COUNT (Select): yields columns RelationshipURI, Count
            min_treshold (int): inclusive lower bound
            max_treshold (int): exclusive upper bound
        """
        sc = REL_COUNT.selected_columns
        filtered = REL_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
        self.REL_FILTER = select(filtered.selected_columns.RelationshipURI)

    # -------------------------------------------------------------------------
    # Apply frequency filters
    # -------------------------------------------------------------------------
    def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
        """
        Pandas equivalent: RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

        If self.MOVIE_FILTER is present, applies WHERE MovieID IN (<subselect>);
        otherwise returns RDF unchanged.

        Args:
            RDF (Select): current dataset

        Returns:
            Select: filtered dataset (or unchanged if no filter exists)
        """
        if self.MOVIE_FILTER is None:
            return RDF
        sc = RDF.selected_columns
        return RDF.where(sc.MovieID.in_(self.MOVIE_FILTER))

    def filter_by_frequency_relationship(self, RDF: Select) -> Select:
        """
        Pandas equivalent: RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

        If self.REL_FILTER is present, applies WHERE RelationshipURI IN (<subselect>);
        otherwise returns RDF unchanged.

        Args:
            RDF (Select): current dataset

        Returns:
            Select: filtered dataset (or unchanged if no filter exists)
        """
        if self.REL_FILTER is None:
            return RDF
        sc = RDF.selected_columns
        return RDF.where(sc.RelationshipURI.in_(self.REL_FILTER))

    # -------------------------------------------------------------------------
    # Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
    # -------------------------------------------------------------------------
    def rdf_add_special_token(self, RDF: Select) -> Select:
        """
        Adds the RDF special token to each element of the tuple: SUBJ to
        SubjectURI, REL to RelationshipURI, OBJ to ObjectURI (see
        Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date
        tokens). Only the three RDF-element tokens are added; no other special
        token.

        Built with SQL string concatenation; returns a new Select keeping the
        output column names
        ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"].

        Args:
            RDF (Select): current dataset

        Returns:
            Select: projected dataset with tokenized SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        subj_tok = literal(SpecialToken.SUBJECT.value) + sc.SubjectURI
        rel_tok = literal(SpecialToken.RELATIONSHIP.value) + sc.RelationshipURI
        obj_tok = literal(SpecialToken.OBJECT.value) + sc.ObjectURI

        return RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            subj_tok.label("SubjectURI"),
            rel_tok.label("RelationshipURI"),
            obj_tok.label("ObjectURI"),
            sc.Abstract.label("Abstract"),
        )

    # -------------------------------------------------------------------------
    # NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
    # -------------------------------------------------------------------------
    def drop_na_from_dataset(self, RDF: Select) -> Select:
        """
        Drop rows where any of SubjectURI, RelationshipURI, ObjectURI is empty
        or NULL. The pandas version replaced '' with NaN and used dropna; here
        the check is pushed into WHERE clauses (NOT NULL and != '').

        Args:
            RDF (Select): current dataset

        Returns:
            Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        return RDF.where(
            (sc.SubjectURI.is_not(None)) & (sc.SubjectURI != "") &
            (sc.RelationshipURI.is_not(None)) & (sc.RelationshipURI != "") &
            (sc.ObjectURI.is_not(None)) & (sc.ObjectURI != "")
        )

    # -------------------------------------------------------------------------
    # Rebuild by movie (one row per movie)
    # -------------------------------------------------------------------------
    def rebuild_by_movie(self, RDF: Select) -> Select:
        """
        Collapse the dataset to one row per movie (by design the end result has
        a single row per MovieID).

        Steps (all in SQL):
        - Build per-row "Triple" as SubjectURI + RelationshipURI + ObjectURI,
          wrapped with START_TRIPLE/END_TRIPLE.
        - GROUP_CONCAT with empty separator aggregates the triples per
          (MovieID, Abstract). SQLite syntax — swap for the engine's
          string-aggregation function elsewhere.
        - Prefix the list with START_TRIPLE_LIST and the Abstract with ABSTRACT.

        Args:
            RDF (Select): current dataset with columns
                MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract

        Returns:
            Select: aggregated dataset with columns ["MovieID","Triple","Abstract"]
        """
        sc = RDF.selected_columns

        # Per-row triple with START/END_TRIPLE tokens
        row_triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")

        # Prefixed abstract
        abstract_tok = (literal(SpecialToken.ABSTRACT.value) + sc.Abstract).label("Abstract")

        # Subquery of per-row triples / abstracts
        row_view = RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            row_triple,
            abstract_tok,
        ).subquery()

        # Concatenate all triples for each movie (SQLite syntax; adjust for other DBs)
        triple_concat = (
            literal(SpecialToken.START_TRIPLE_LIST.value) +
            func.group_concat(row_view.c.Triple, literal(""))
        ).label("Triple")

        return (
            select(
                row_view.c.MovieID.label("MovieID"),
                triple_concat,
                row_view.c.Abstract.label("Abstract"),
            )
            .group_by(row_view.c.MovieID, row_view.c.Abstract)
        )

    # -------------------------------------------------------------------------
    # Build triple(s) projection
    # -------------------------------------------------------------------------
    @staticmethod
    def build_triple(RDF: Select) -> Select:
        """
        Obtains the joined RDF triple in one element, together with the START
        and END special tokens, built with SQL string concatenation.

        Args:
            RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: a projection containing one column named "Triple"
        """
        sc = RDF.selected_columns
        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)

    @staticmethod
    def build_incomplete_triple(RDF: Select) -> Select:
        """
        Helper for the third task: "Predicting a masked component within an RDF
        triple". Obtains the joined RDF triple in one element with START/END
        tokens; the MISSING element is replaced by the special token <MASK>
        via COALESCE directly in SQL (an absent component is expected to
        surface as NULL — TODO confirm against the callers).

        Args:
            RDF (Select): 2 of the following columns present
                ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: projection with column "Triple"
        """
        sc = RDF.selected_columns
        mask = literal(SpecialToken.MASK.value)

        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (func.coalesce(sc.SubjectURI, mask) +
             func.coalesce(sc.RelationshipURI, mask) +
             func.coalesce(sc.ObjectURI, mask)) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)

    @staticmethod
    def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
        """
        Currently not used; stub kept for API parity.

        Original intention: given one incomplete-RDF input and another with
        just the missing component, apply special tokens accordingly. If
        needed in the future, implement as a Select-building helper that
        merges/COALESCEs columns from different selects.
        """
        return None
|
||||||
148
Scripts/DataCleaning/legacy/fast_filter.py
Normal file
148
Scripts/DataCleaning/legacy/fast_filter.py
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
# This file deletes in the pipeline the unwanted relationship by different rules
|
||||||
|
import pandas as pd
|
||||||
|
import sqlite3 # kept for compatibility
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier:
    """Pandas/NumPy implementation of the cleaning pipeline (fast path).

    Same method names as the legacy filter version, but frequency filters are
    cached as Python sets for O(1) membership checks and string building is
    vectorized with numpy.char.
    """

    def __init__(self):
        # Fast internal caches for O(1) membership checks
        self._MOVIE_FILTER_SET = set()
        self._REL_FILTER_SET = set()

    # ------------------------------
    # Filters
    # ------------------------------
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Drop every row whose RelationshipURI equals *uri*."""
        # Vectorized boolean mask
        return RDF.loc[RDF["RelationshipURI"] != uri]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency
        [filter_by_frequency_movie_id()], since this method creates such filter.

        Keeps the MovieIDs whose Count lies in [min_threshold, max_threshold).

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
        """
        sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
        self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        # Same as the movie filter, keyed on RelationshipURI.
        sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
        self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin is the fastest path
        return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin, as in filter_by_frequency_movie_id.
        return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]

    # ------------------------------
    # Cleaning & preprocessing
    # ------------------------------
    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Adds RDF special token to SubjectURI / RelationshipURI / ObjectURI.
        Returns a new DataFrame (no inplace modification of the caller's object).
        """
        subj = np.char.add(SpecialToken.SUBJECT.value, RDF["SubjectURI"].to_numpy(dtype=object))
        rel = np.char.add(SpecialToken.RELATIONSHIP.value, RDF["RelationshipURI"].to_numpy(dtype=object))
        obj = np.char.add(SpecialToken.OBJECT.value, RDF["ObjectURI"].to_numpy(dtype=object))
        return RDF.assign(SubjectURI=subj, RelationshipURI=rel, ObjectURI=obj)

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Replace '' with NaN only on key columns, then drop rows missing any of them.
        """
        cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
        rdf = RDF.copy()
        for c in cols:
            m = rdf[c] == ""
            if m.any():
                rdf.loc[m, c] = np.nan
        return rdf.dropna(subset=cols)

    # ------------------------------
    # Building triples
    # ------------------------------
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains joined RDF triple in one element, together with START and END special token.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value

        subj = RDF["SubjectURI"].to_numpy(dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object)

        # Single vectorized concatenation: start + subj + rel + obj + end.
        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Accepts any subset of ["SubjectURI","RelationshipURI","ObjectURI"] (typically 2 of 3).
        Missing components are replaced by <MASK>.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value
        maskv = SpecialToken.MASK.value
        n = len(RDF.index)

        # An absent column is replaced by a full column of <MASK> tokens.
        subj = RDF["SubjectURI"].to_numpy(dtype=object) if "SubjectURI" in RDF else np.full(n, maskv, dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object) if "RelationshipURI" in RDF else np.full(n, maskv, dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object) if "ObjectURI" in RDF else np.full(n, maskv, dtype=object)

        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    def rebuild_by_movie(self, RDF: pd.DataFrame):
        """
        Collapse triples + abstract into a single row per movie.
        Returns: ["MovieID","Triple","Abstract"]
        """
        # Build triples once (vectorized); method also sets RDF["Triple"]
        triples = self.build_triple(RDF)

        # Minimal frame for grouping (avoid carrying extra columns)
        tmp = pd.DataFrame({
            "MovieID": RDF["MovieID"].to_numpy(),
            "Abstract": RDF["Abstract"].to_numpy(),
            "Triple": triples.to_numpy(),
        })

        # Factorize high-cardinality keys to fast integer codes, group on codes,
        # then map back to labels; sum concatenates strings for object dtype.
        mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
        abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)

        tmp["_mid"] = mid_codes
        tmp["_abs"] = abs_codes

        grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()

        grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
        grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])

        # Final tokens
        grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
        grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"]

        return grouped[["MovieID", "Triple", "Abstract"]]
|
||||||
191
Scripts/DataCleaning/legacy/filter.py
Normal file
191
Scripts/DataCleaning/legacy/filter.py
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
# This file removes unwanted relationships from the pipeline's data according to different rules
|
||||||
|
import pandas as pd
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier():
    """Filters and decorates RDF triple DataFrames for the cleaning pipeline.

    Holds frequency-based filters (movies / relationships) plus an explicit
    RelationshipURI ban list, and knows how to add the special tokens and
    collapse triples into one row per movie.
    """

    def __init__(self):
        # Populated by the generate_frequency_* methods before the
        # corresponding filter_by_frequency_* methods may be used.
        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()

    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Drop every row whose RelationshipURI equals `uri`."""
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store the RelationshipURI ban list as a set (O(1) membership)."""
        self.relationship_filter_list: set[str] = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove rows whose RelationshipURI is in the stored ban list.

        Call generate_list_relationship_filter() first to create the filter.
        """
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Create the movie-frequency filter used by filter_by_frequency_movie_id().

        Keeps movies whose Count lies in [min_treshold, max_treshold).

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int): inclusive lower bound on Count.
            max_treshold (int): exclusive upper bound on Count.
        """
        in_range = (MOVIE_COUNT["Count"] >= min_treshold) & (MOVIE_COUNT["Count"] < max_treshold)
        self.MOVIE_FILTER = MOVIE_COUNT[in_range]

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Create the relationship-frequency filter used by filter_by_frequency_relationship().

        Keeps relationships whose Count lies in [min_treshold, max_treshold).
        """
        in_range = (REL_COUNT["Count"] >= min_treshold) & (REL_COUNT["Count"] < max_treshold)
        self.REL_FILTER = REL_COUNT[in_range]

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose MovieID passed the movie-frequency filter."""
        return RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose RelationshipURI passed the relationship-frequency filter."""
        return RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """Prefix SubjectURI / RelationshipURI / ObjectURI with their special tokens.

        Check Scripts/Libs/CleaningPipeline/special_token.py for the
        up-to-date special tokens.  Only the three RDF components are
        decorated here; no other special token is added.

        Args:
            RDF (pd.DataFrame): must contain the three *URI columns.

        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # If an earlier filter sliced the RDF and produced a view, copying
        # here avoids pandas' SettingWithCopyWarning.
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows where SubjectURI, RelationshipURI or ObjectURI is empty or NaN."""
        # Empty strings count as missing values.
        RDF = RDF.replace('', np.nan)
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Collapse per-triple rows into a single row per movie.

        MovieID and Abstract are assumed to map 1 <-> 1.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # Work on a copy so the caller's DataFrame is not mutated
        # (consistent with rdf_add_special_token, avoids SettingWithCopyWarning).
        RDF = RDF.copy()
        # Join each triple into one string, wrapped in START/END tokens.
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # Concatenate all of a movie's triples into one row.
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # Add the list-start and abstract-start tokens.
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Group prebuilt triples into one row per movie.

        Same collapsing step as rebuild_by_movie(), but for frames whose
        "Triple" column was already built (e.g. via build_triple()).

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # MovieID and Abstract are assumed to map 1 <-> 1.
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """Join each RDF row into one "Triple" string, together with the
        START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (the input frame is left untouched).
        """
        triple = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        triple = SpecialToken.START_TRIPLE.value + triple + SpecialToken.END_TRIPLE.value
        return triple.rename("Triple")

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """Join a partial RDF into a "Triple", masking the missing component.

        Helper for the third task: "Predicting a masked component within an
        RDF triple".  Whichever of the three columns is absent is replaced
        by the <MASK> special token.

        Args:
            RDF (pd.DataFrame): 2 of ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (NOT a DataFrame).
        """
        # A full column of MASK tokens; not strictly needed when exactly one
        # component is missing, but more robust (and slower).
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)

        triple = (
            RDF.get("SubjectURI", MISSING) +
            RDF.get("RelationshipURI", MISSING) +
            RDF.get("ObjectURI", MISSING))
        triple = SpecialToken.START_TRIPLE.value + triple + SpecialToken.END_TRIPLE.value
        return triple.rename("Triple")

    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        """Apply special tokens to an incomplete RDF plus its missing component.

        NOTE(review): currently unused and not implemented — kept as a
        placeholder for the mask-prediction task; always returns None.
        """
        return None

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Normalise ObjectURI text: newlines become ", ", asterisks are removed."""
        # Copy to avoid mutating the caller's frame.
        RDF = RDF.copy()
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks
        return RDF
|
||||||
145
Scripts/DataCleaning/legacy/pipeline.py
Normal file
145
Scripts/DataCleaning/legacy/pipeline.py
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
import re
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
||||||
|
# tasks dataset builder
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class Pipeline():
    """Orchestrates the legacy cleaning pipeline.

    Streams RDF rows per movie from the SQL endpoint, cleans/filters them,
    and writes the dataset files for each training task.
    """

    def __init__(self):
        # NOTE(review): constructing the endpoint and running the two COUNT
        # queries touches the database already at construction time.
        self.sql_endpoint = SqlEndpoint()
        # classes to manage the tasks' datasets (output writers)
        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # prepare the filter
        # the filter applier needs to know the frequency of Movies and
        # Relationships over the whole dataset
        self.filter_applier = PipelineApplier()
        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
        REL_COUNT = self.sql_endpoint.get_relationship_count()
        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069
        # prepare the filter on the RelationshipURIs you want to delete:
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
            "dbp-dbo:soundRecording"
        ]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

    def execute_task_bpe_corpus(self):
        """Write the BPE corpus: tokenised triple list + abstract per movie."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self.filter_applier.rebuild_by_movie(RDF)
            RDF = RDF[["Triple","Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()

    def execute_task_rdf_mask(self):
        """Write the mask-prediction dataset (cleaned per-triple rows)."""
        for RDF in self._get_cleaned_movie_rows():
            self.task_rdf_mask.write(RDF)
        self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write the RDF -> text dataset (one collapsed row per movie)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self.filter_applier.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write the triple-completion dataset (MovieID + joined Triple)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self.filter_applier.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        self._end_file_handler()

    def execute_all_task(self):
        """Produce all four task datasets in a single pass over the data."""
        for RDF in self._get_cleaned_movie_rows():
            self.task_rdf_mask.write(RDF)

            RDF["Triple"] = self.filter_applier.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])

            RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])

        self._end_file_handler()

    def _end_file_handler(self):
        # Close every task writer, even those the caller did not use.
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned DataFrame per movie (generator).

        Applies, in order: NaN/empty drop, movie- and relationship-frequency
        filters, the banned-URI list filter, the ObjectURI regex cleanup,
        and finally the special tokens.
        """
        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            RDF = self.filter_applier.drop_na_from_dataset(RDF)
            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
            # other filters
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
            # regex on ObjectURI
            RDF = self.filter_applier.regex_on_objects(RDF)
            # skip movies whose rows were all filtered away
            if RDF.empty:
                continue
            RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
            yield RDF

    def use_toy_dataset(self):
        """Restrict the SQL endpoint to ten hand-picked movies (for debugging)."""
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

    def generate_csv_debug_file(self, debug_path:str):
        """Run the cleaning pipeline and dump every cleaned row to a debug CSV.

        Args:
            debug_path (str): destination path of the debug CSV file.
        """
        debug_csv = Debug_csv(debug_path)

        for RDF in self._get_cleaned_movie_rows():
            debug_csv.write(RDF)

        debug_csv.close()
|
||||||
|
|
||||||
|
|
||||||
|
# There are a lot of settings to manage.
# You only need to change settings:
# - in __init__: file paths, frequency-filter limits, banned RelationshipURIs
# - in use_toy_dataset: the toy dataset's movie list
# - in _get_cleaned_movie_rows: how the pipeline behaves
||||||
|
|
||||||
|
# Guarded so that importing this module does not run the whole pipeline
# (Pipeline() already touches the database in its constructor).
if __name__ == "__main__":
    pipeline = Pipeline()

    pipeline.use_toy_dataset()
    # pipeline.execute_task_bpe_corpus()
    # pipeline.execute_task_rdf_mask()
    # pipeline.execute_tasks_rdf_text()
    # pipeline.execute_task_rdf_completation()
    # pipeline.execute_all_task()
    pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||||
161
Scripts/DataCleaning/path_splitter_tree.py
Normal file
161
Scripts/DataCleaning/path_splitter_tree.py
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramArgs:
    """Typed container for this script's command-line arguments."""

    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
        """
        Args:
            file (str): path of the input CSV file.
            csv_uri_header (str): the name of the column of the csv file from
                which the program will get the URIs.
            output (str): path of the output text file.
            treshold (int): minimum node quantity for a node to be printed.
        """
        self.file = file
        self.csv_uri_header = csv_uri_header
        self.output = output
        self.treshold = treshold
|
||||||
|
|
||||||
|
|
||||||
|
class Node:
    """One URI path segment in the tree; children are keyed by segment name."""

    def __init__(self, name: str, quantity: int = 0):
        self.name = name
        # Number of paths that passed through this node (leaves stay at 0).
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        # A node with no children is a leaf.
        return not self.children

    def append_child(self, child: list[str]):
        """Insert the segment path `child` below this node.

        Walks the tree iteratively, creating missing nodes on the way and
        incrementing the quantity of every node traversed (the final segment's
        node itself is not incremented).
        """
        current = self
        for segment in child:
            if segment not in current.children:
                # First time we traverse this branch: create the node.
                current.children[segment] = Node(segment, 0)
            current.quantity += 1
            current = current.children[segment]

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the command-line flags into a ProgramArgs.

    Unknown tokens are ignored (parse_known_args), so passing the full
    sys.argv — script name included — is tolerated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--header-name", "-c", required=True, type=str)  # c stands for column
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    known, _unknown = parser.parse_known_args(args)

    return ProgramArgs(
        known.input_file,
        known.header_name,
        known.output_file,
        known.treshold,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_debug_args() -> ProgramArgs:
    """Return hard-coded arguments for debugging without a command line.

    Equivalent CLI:
    -i ./Assets/Dataset/1-hop/movies.csv -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
    """
    return ProgramArgs(
        "./Assets/Dataset/1-hop/movies.csv",
        "subject",
        "./Assets/Dataset/Tmp/prova.csv",
        1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def tree_like(file: str, csv_uri_header: str, out: str, treshold: int | None = None):
    """Write an indented tree of URI path segments found in a CSV column.

    URIs are split on "/"; those starting with a scheme go under the "uri"
    root, the rest under the "pure" root.  Nodes whose quantity is below the
    threshold are pruned together with their whole subtree.

    Args:
        file (str): input CSV path (must have a header row).
        csv_uri_header (str): name of the CSV column holding the URIs.
        out (str): output text file path.
        treshold (int | None): minimum node quantity to print.  When None,
            falls back to the module-level ARGS.treshold — this keeps the
            original behaviour, where the function read the global directly.
    """
    INDENTATION = " "

    # Two artificial roots: URIs with an http(s) scheme vs. bare ("pure") paths.
    properties: dict[str, Node] = {}
    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)

    # It is needed the header-name, hence DictReader.
    with open(file, "r", encoding="utf-8") as csv_file:
        for row in csv.DictReader(csv_file):
            uri_element = row[csv_uri_header]
            sections = [s for s in uri_element.split("/") if s != ""]

            if sections[0] != "http:" and sections[0] != "https:":
                properties["pure"].append_child(sections)
                continue

            properties["URI"].append_child(sections)

    # Backward-compatible fallback to the global ARGS (legacy behaviour).
    min_quantity = ARGS.treshold if treshold is None else treshold

    # Depth-first traversal with an explicit (node, depth) stack.
    stack: list[tuple[Node, int]] = []
    for _, item in properties.items():
        stack.append((item, 0))

    with open(out, mode="w", encoding="utf-8") as out_file:
        while len(stack) > 0:
            node, depth = stack.pop()
            indent = INDENTATION * depth

            # Leaf nodes have quantity 0, so for them to appear the
            # threshold has to be 0.
            if min_quantity > node.quantity:
                continue

            out_file.write(f"{indent}- {node}\n")

            if node.is_leaf:
                continue

            stack.extend((child, depth + 1) for child in node.children.values())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # NOTE(review): tree_like also reads the module-level ARGS for the
    # treshold, so ARGS must be assigned before calling it.
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file,ARGS.csv_uri_header, ARGS.output)
|
||||||
86
Scripts/DataCleaning/pipeline/cleaner.py
Normal file
86
Scripts/DataCleaning/pipeline/cleaner.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
# This file removes unwanted relationships from the pipeline's data according to different rules
|
||||||
|
import pandas as pd
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier():
    """Cleans RDF DataFrames and assembles the tokenised per-movie rows."""

    def __init__(self):
        # Stateless: every method operates on the DataFrame it receives.
        pass

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """Prefix SubjectURI / RelationshipURI / ObjectURI with their special tokens.

        Check Scripts/Libs/CleaningPipeline/special_token.py for the
        up-to-date special tokens.  Only the three RDF components are
        decorated here; no other special token is added.

        Args:
            RDF (pd.DataFrame): must contain the three *URI columns.

        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # If an earlier filter sliced the RDF and produced a view, copying
        # here avoids pandas' SettingWithCopyWarning.
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows where SubjectURI, RelationshipURI or ObjectURI is empty or NaN."""
        # Empty strings count as missing values.
        RDF = RDF.replace('', np.nan)
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Collapse per-triple rows into a single row per movie.

        MovieID and Abstract are assumed to map 1 <-> 1.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # Work on a copy so the caller's DataFrame is not mutated
        # (consistent with rdf_add_special_token, avoids SettingWithCopyWarning).
        RDF = RDF.copy()
        # Join each triple into one string, wrapped in START/END tokens.
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # Concatenate all of a movie's triples into one row.
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # Add the list-start / abstract-start tokens and close each sentence.
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + SpecialToken.END_OF_SENTENCE.value
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """Join each RDF row into one "Triple" string, together with the
        START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (the input frame is left untouched).
        """
        triple = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        triple = SpecialToken.START_TRIPLE.value + triple + SpecialToken.END_TRIPLE.value
        return triple.rename("Triple")

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Normalise ObjectURI text: newlines become ", ", asterisks are removed."""
        # Copy to avoid mutating the caller's frame.
        RDF = RDF.copy()
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks
        return RDF
|
||||||
103
Scripts/DataCleaning/pipeline/movie_filter.py
Normal file
103
Scripts/DataCleaning/pipeline/movie_filter.py
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
class MovieFilter:
    """Narrows a working set of MovieIDs by running successive SQL filters.

    Each *filter* method replaces self.MOVIE_FILTER with the subset of
    movies that survive the corresponding query, so filters compose by
    calling them one after another.
    """

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all movie_id; the filters below progressively shrink this set
        movie_query = "SELECT MovieID FROM Movies"
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)

    def frequency_filter(self, min_treshold:int, max_treshold:int):
        """Keep movies whose RDF row count is BETWEEN the two thresholds (SQL BETWEEN is inclusive)."""
        # One "?" placeholder per currently-selected movie.
        movie_list_placeholder = ",".join(["?"] * len(self.MOVIE_FILTER))

        # NOTE(review): the thresholds are interpolated into the SQL via the
        # f-string rather than bound as parameters; acceptable only because
        # they are ints supplied by our own code.
        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
        """
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.MOVIE_FILTER["MovieID"].to_list()))

    def get_movie_id(self):
        # The current filtered DataFrame of MovieIDs.
        return self.MOVIE_FILTER

    def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
        """Keep movies whose count of rows with `parsed_rel_uri` is between the thresholds."""
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        # NOTE(review): parsed_rel_uri is f-string-interpolated into the SQL;
        # fine for the internal constants used here, but not injection-safe.
        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = '{parsed_rel_uri}' THEN 1 ELSE 0 END)
        BETWEEN {min_treshold} AND {max_treshold};
        """

        params = tuple(movie_ids) # + (parsed_rel_uri, min_treshold, max_treshold)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def filter_by_director(self):
        """Keep movies that have at least one director relationship."""
        director_list = ['dbp-dbo:director','dbp-dbp:director']

        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        # NOTE(review): `IN {tuple(director_list)}` relies on Python's tuple
        # repr to form the SQL IN-list; it would break for a single-element
        # list (trailing comma) — fine for the two fixed URIs above.
        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        JOIN ParsedRelationships USING (RelationshipID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
        AND ParsedRelationships.RelationshipURI IN {tuple(director_list)};
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def filter_by_english_movies(self):
        """Keep movies whose language triples are all English.

        Two candidate queries are built, but only `other_query` (at least one
        English language value AND no non-English one) is executed;
        `filter_query` (any English value) is currently dead code kept for
        reference.
        """
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        relationship = ["dbp-dbp:language"]
        objects_list = ["English", "dbp-dbr:English_language"]

        # NOTE(review): unused — superseded by other_query below.
        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
        AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
        AND ParsedObjects.ObjectURI in {tuple(objects_list)};
        """

        other_query = f"""
        SELECT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
        AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
        GROUP BY RDFs.MovieID
        HAVING
        SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
        AND
        SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# movie_filter = MovieFilter()
|
||||||
|
# movie_filter.frequency_filter(5,10)
|
||||||
155
Scripts/DataCleaning/pipeline/pipeline.py
Normal file
155
Scripts/DataCleaning/pipeline/pipeline.py
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
from movie_filter import MovieFilter
|
||||||
|
from relationship_filter import RelationshipFilter
|
||||||
|
from rdf_filter import RdfFilter
|
||||||
|
from cleaner import PipelineApplier
|
||||||
|
|
||||||
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Relationship URIs that carry no useful signal for the tasks (layout/meta
# predicates, wiki bookkeeping, images, coordinates, ...). Every relationship
# in this list is dropped by RelationshipFilter.delete_relationship_uri_by_list.
#
# FIX: a missing comma after "dbp-dbp:infoa" used to implicitly concatenate it
# with "dbp-dbp:infob" into the single bogus entry "dbp-dbp:infoadbp-dbp:infob",
# so neither URI was actually filtered.
RELATIONSHIP_FILTER_LIST = [
    "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
    "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
    "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
    "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
    "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
    "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
    "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
    "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
    "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
    "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
    "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
    "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
    "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
    "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
    "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
    "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
    "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
    "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
    "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
    "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
    "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
    "dbp-dbp:website"
]
|
||||||
|
|
||||||
|
# Minimal allow-list of predicates, usable as an alternative to the big
# block-list above (see the commented call in Pipeline._get_cleaned_movie_rows).
RELATIONSHIP_WHITE_LIST = [
    "dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
]

# FIX: a stray, no-op triple-quoted string expression used to follow the list:
#   SELECT DISTINCT field3
#   FROM debug
# It was dead code (an unassigned expression statement) and has been removed;
# the text is preserved here as a comment for reference.
|
||||||
|
|
||||||
|
class Pipeline():
    """End-to-end data-cleaning pipeline.

    Wires the movie/relationship/RDF filters to the cleaning steps and
    materializes the surviving rows into the three task outputs:
    BPE corpus, RDF->text and RDF completation.

    NOTE(review): constructing a Pipeline immediately runs the frequency
    filters below (side effects in __init__) -- confirm this is intended.
    """

    def __init__(self) -> None:
        # Filtering / cleaning collaborators.
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()

        # One writer per downstream task.
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # Baseline filtering applied on every run.
        self._movie_filter.frequency_filter(50,3000)
        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)

    def other_filter(self):
        """Apply the optional, stricter movie filters (shrinks the movie set further)."""
        self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
        self._movie_filter.filter_by_director()
        self._movie_filter.filter_by_english_movies()
        self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget
        self._movie_filter.relation_filter("dbp-dbp:released",1,100) # to cut to 2000 :(

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned, non-empty RDF DataFrame per surviving movie."""
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()
        # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)

        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
            RDF = self._pipeline.drop_na_from_dataset(RDF)
            RDF = self._pipeline.regex_on_objects(RDF)
            RDF = self._pipeline.rdf_add_special_token(RDF)

            # A movie can lose all of its rows during cleaning.
            if RDF.empty:
                continue
            yield RDF

    def execute_task_bpe_corpus(self):
        """Write the BPE tokenizer training corpus (Triple + Abstract columns)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            RDF = RDF[["Triple","Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write the RDF -> text task dataset."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write the RDF completation task dataset (MovieID + built Triple)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self._pipeline.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        self._end_file_handler()

    def _end_file_handler(self):
        """Close every task writer (safe to call after any single task)."""
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def execute_all_task(self):
        """Produce all three task outputs in a single pass over the data."""
        for RDF in self._get_cleaned_movie_rows():
            # Completation needs the per-row frame, so copy before rebuilding.
            completation_RDF = RDF.copy()
            completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
            self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])

            RDF = self._pipeline.rebuild_by_movie(RDF)

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])

        self._end_file_handler()

    def use_toy_dataset(self):
        """Replace the movie filter result with a small hand-picked movie set."""
        # CHOOSEN MOVIE:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})

    def generate_csv_debug_file(self, debug_path:str):
        """Dump every cleaned RDF row to a CSV for manual inspection."""
        debug_csv = Debug_csv(debug_path)

        for RDF in self._get_cleaned_movie_rows():
            debug_csv.write(RDF)

        debug_csv.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: building the Pipeline already applies the baseline
# frequency filters (see Pipeline.__init__).
pipe = Pipeline()
#pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||||
32
Scripts/DataCleaning/pipeline/rdf_filter.py
Normal file
32
Scripts/DataCleaning/pipeline/rdf_filter.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
class RdfFilter:
    """Streams the joined (abbreviated-URI) RDF rows of each movie from SQL."""

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()

    # def delete_hyperum_when_movie(self):
    #   purl:linguistics/gold/hypernym is almost always "dbp-dbr:Movie" or
    #   "dbp-dbr:Film", i.e. a banned triple.

    def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
        """Yield one DataFrame per movie in MOVIE_ID, restricted to the
        relationships listed in REL_ID (both one-column DataFrames)."""
        rel_id_values = tuple(REL_ID["RelationshipID"].to_list())
        placeholders = ",".join(["?"] * len(REL_ID))

        sql = f"""
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?) AND RelationshipID IN ({placeholders});
        """

        for current_movie_id in MOVIE_ID["MovieID"].to_list():
            bound = (current_movie_id,) + rel_id_values
            yield self.sql_endpoint.get_dataframe_from_query(sql, params=bound)
|
||||||
54
Scripts/DataCleaning/pipeline/relationship_filter.py
Normal file
54
Scripts/DataCleaning/pipeline/relationship_filter.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
class RelationshipFilter:
    """Maintains the surviving set of RelationshipIDs.

    RELATIONSHIP_FILTER is a one-column DataFrame of RelationshipID; each
    filter method narrows it (or, for the white-list getter, returns a subset).
    """

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all relationship_id
        relationship_query = "SELECT RelationshipID FROM Relationships"
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)

    def frequency_filter(self, min_treshold:int, max_treshold:int):
        """Keep only relationships whose usage count in RDFs lies in
        [min_treshold, max_treshold] (inclusive).

        FIX: the thresholds are now bound as SQL parameters instead of being
        interpolated into the query text with an f-string (safer, and robust
        against non-integer input).
        """
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))

        filter_query = f"""
        SELECT RelationshipID
        FROM RDFs
        WHERE RelationshipID IN ({ids_placeholder})
        GROUP BY RelationshipID
        HAVING COUNT(*) BETWEEN ? AND ?;
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + (min_treshold, max_treshold)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def get_relationship_id(self):
        """Return the current surviving RelationshipID DataFrame."""
        return self.RELATIONSHIP_FILTER

    def get_relationship_id_from_white_list(self, relationship_list: list[str]):
        """Return the subset of surviving ids whose URI is in relationship_list."""
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(relationship_list))
        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
        AND RelationshipURI IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list)
        return self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def delete_relationship_uri_by_list(self, filter_list: list[str]):
        """Drop every surviving relationship whose URI appears in filter_list."""
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(filter_list))

        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
        AND RelationshipURI NOT IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(filter_list)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
|
||||||
53
Scripts/DataGathering/analysis.py
Normal file
53
Scripts/DataGathering/analysis.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramArgs:
    """Parsed command-line options for the analysis script."""

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        # Plain value holder: one attribute per CLI option.
        self.input_file = input_file
        self.output_file = output_file
        self.count = count
        self.column = column
|
||||||
|
|
||||||
|
|
||||||
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the CLI flags into a ProgramArgs bundle; unknown flags are ignored."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "--input", "-i", required=True, type=str)
    parser.add_argument("--output-file", "--output", "-o", required=True, type=str)
    parser.add_argument("--column", "--col", required=True, type=str)
    # --count is a flag: present -> True, absent -> False.
    parser.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    known, _ = parser.parse_known_args(args)

    return ProgramArgs(
        known.input_file, known.column, known.output_file, known.count
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    ARGS = get_args(sys.argv)

    # Load the CSV
    df = pd.read_csv(ARGS.input_file)

    # Count occurrences of each unique value in the requested column
    item_counts = df[ARGS.column].value_counts()

    # FIX: the output file used to be opened at the top and never closed;
    # a context manager now guarantees it is flushed and closed.
    with open(ARGS.output_file, "w+", encoding="utf-8") as OUTPUT_FILE:
        # Write one line per distinct value, optionally with its count.
        for item, count in item_counts.items():
            if ARGS.count:
                OUTPUT_FILE.write(f"{item}: {count}\n")
            else:
                OUTPUT_FILE.write(f"{item}\n")
|
||||||
146
Scripts/DataGathering/fetchdata.py
Normal file
146
Scripts/DataGathering/fetchdata.py
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
import argparse
|
||||||
|
from math import floor
|
||||||
|
import sys
|
||||||
|
from time import sleep
|
||||||
|
import SPARQLWrapper
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramData:
    """Configuration bundle for the SPARQL fetcher, built from CLI flags."""

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        # local_url: output file the fetched pages are appended to.
        self.local_url = local_url
        # query_url: path of the file containing the SPARQL query text.
        self.query_url = query_url
        # sparql_url: endpoint to query (defaults to DBpedia).
        self.sparql_url = sparql_url
        # output_type: SPARQLWrapper return format (CSV as passed by gather_cli_args).
        self.output_type = output_type
        self.initial_offset = initial_offset
        # timeout: seconds slept between page requests.
        self.timeout = timeout
        # limit: page size (rows per request).
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        # NOTE(review): this returns `limit`, not an offset. fetch_data starts
        # at page -1, so offset + (page * limit) == 0 on the first iteration --
        # the property seems to exist to make that arithmetic work. Confirm
        # before renaming or "fixing" it.
        return self.limit

    @property
    def query(self):
        """Read and return the SPARQL query text from query_url (re-read per access)."""
        with open(self.query_url, "r") as file:
            return file.read()
|
||||||
|
|
||||||
|
|
||||||
|
# Default configuration; each value can be overridden by a CLI flag
# (see gather_cli_args).
DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5   # pause between page requests (be polite to the endpoint)
LIMIT = int(1E4)        # rows per page
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)    # effectively "no page cap"
|
||||||
|
|
||||||
|
|
||||||
|
def gather_cli_args(args: list[str]) -> ProgramData:
    """Parse CLI flags into a ProgramData bundle; unknown flags are ignored."""
    # TODO: Add argument for type
    parser = argparse.ArgumentParser("sparql data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str)
    parser.add_argument("--url", type=str, default=DBPEDIA_URL)
    parser.add_argument("--limit", type=int, default=LIMIT)
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES)
    parser.add_argument("--verbose", "-v", action="count", default=0)

    known, _ = parser.parse_known_args(args)

    return ProgramData(
        known.file_path,
        known.query_file,
        known.url,
        SPARQLWrapper.CSV,
        known.offset,
        known.timeout,
        known.limit,
        known.max_pages,
        known.verbose,
    )
|
||||||
|
# type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_data(DATA: ProgramData):
    """Page through the SPARQL endpoint and append each CSV page to DATA.local_url.

    FIX: this loop previously read the module-level constants LIMIT,
    TIMEOUT_SECONDS, MAX_PAGES and TYPE, silently ignoring the --limit,
    --timeout and --max-pages CLI flags; it now uses DATA's values.
    Also renamed the loop flag (`exit` shadowed the builtin).
    """
    # Take correction of page into account: DATA.offset equals DATA.limit,
    # so starting at page -1 makes the first CURRENT_OFFSET come out at 0.
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    done = False

    while not done:

        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)

        sparql.setReturnFormat(DATA.output_type)

        # Append paging clauses to the base query read from the query file.
        CURRENT_PAGE_QUERY = "\n".join([
            DATA.query,
            f"LIMIT {DATA.limit}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

            if type(res) == bytes:
                # Keep the CSV header line only on the first page.
                first_line = 0
                if page != 0:
                    first_line = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[first_line:])

            # An empty page means the result set is exhausted.
            if text == "":
                done = True
                continue

            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(
                    text
                )

        except Exception as ex:
            # Best-effort: log and retry the next page after the sleep below.
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {DATA.timeout}")

        page += 1

        if page == DATA.max_pages - 1:
            done = True

        sleep(DATA.timeout)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Parse CLI flags, then stream the endpoint's pages to disk.
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)
|
||||||
154
Scripts/DataGathering/wikipedia_gathering.py
Normal file
154
Scripts/DataGathering/wikipedia_gathering.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# 1-hop input (movie URI -> wikipedia page id) and the summary CSV this
# script appends to.
input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"


# Shared HTTP session so chunked API calls reuse the connection.
sess = requests.Session()

# Number of page ids requested per Wikipedia API call.
CHUNK = 20
|
||||||
|
|
||||||
|
|
||||||
|
# Function to get clean full text from Wikipedia PageID
|
||||||
|
def get_clean_text(pageIDS: list[str]):
    """Fetch the intro extract for each Wikipedia page id in *pageIDS*.

    Returns a dict mapping pageID -> cleaned (stripped, newline-free)
    extract text; ids with no page or no extract are skipped and counted.
    NOTE(review): the API caps how many pageids fit in one request --
    CHUNK=20 appears to stay under it, confirm against the API docs.
    """
    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0"
        ""
        " (https://example.org/coolbot/; coolbot@example.org)"
    }

    # The API takes multiple pageids separated by "|".
    ids = "|".join(pageIDS)

    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")

    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    # Normalize: trim and collapse the abstract to one line.
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts
|
||||||
|
|
||||||
|
|
||||||
|
def flush(movie_ids):
    """Fetch the abstracts for *movie_ids* and append them to output_csv."""

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        # One row per page id that actually had an extract.
        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def reconcile() -> int:
    """Return how many input_csv lines were already processed on a previous run.

    Scans input_csv for the page id matching the last id written to
    output_csv and returns the number of lines to skip (header included).

    FIXES: file handles are now closed via `with` even on error, and the
    first page id read is newline-stripped like the ones read inside the
    loop (previously the very first comparison could never match when the
    id was the line's last column).
    NOTE(review): still loops until StopIteration-style errors if
    LAST_CHECKED never appears in input_csv -- consider a bounded scan.
    """
    start = time.time()
    with open(input_csv, "r", newline="", encoding="utf-8") as input_file, \
         open(output_csv, "r", newline="", encoding="utf-8") as output_file:

        next(input_file)  # skip the input header
        # Last page id successfully written to the output file.
        LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
        current_check = input_file.readline().split(",")[1].replace("\n", "")

        index = 1

        while current_check != LAST_CHECKED:
            current_check = input_file.readline().split(",")[1].replace("\n", "")
            index += 1

    end = time.time()


    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index
|
||||||
|
|
||||||
|
|
||||||
|
if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


# NOTE(review): reconcile() also runs right after a fresh, header-only
# output file was created above -- confirm it behaves in that case.
SKIP = reconcile()


# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        # First remaining line is the header (when nothing was skipped).
        if index == 0:
            continue

        # Save movies in map
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush movies
            flush(movie_ids)
            movie_ids = []

# NOTE(review): ids left in movie_ids after the loop (row count not a
# multiple of CHUNK) are never flushed -- confirm this is intended.
||||||
26
Scripts/DatasetMerging/datasetInfo.md
Normal file
26
Scripts/DatasetMerging/datasetInfo.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# HOW THE DATASET IS BUILT AND POPULATED
|
||||||
|
|
||||||
|
Note: the data are taken from CSV files in 1-hop
|
||||||
|
|
||||||
|
## CSV files composition
|
||||||
|
|
||||||
|
| CSV files | Original structure | Saved AS |
|
||||||
|
|--------------------|---------------------------------------|-------------------------------------|
|
||||||
|
| Wikipedia-summary   | PageId / abstract                     | subject, text                       |
|
||||||
|
| Movies | Movie URI | "subject" |
|
||||||
|
| Dataset | Movie URI / Relationship / Object [RDF] | subject, relationship, object |
|
||||||
|
| Movies-PageId | Movie URI / PageId (wiki) | "subject", "object" |
|
||||||
|
| Reverse | Subject / Relationship / Movie URI | "subject", "relationship", "object" |
|
||||||
|
|
||||||
|
## Wanted tables schema
|
||||||
|
|
||||||
|
| Table | Columns |
|
||||||
|
|---------------|-------------------------------------------------------------------------|
|
||||||
|
| Movies | MovieID [PK], Movie URI |
|
||||||
|
| WikiPageIDs | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)* |
|
||||||
|
| Abstracts | MovieID [PK, FK], abstract |
|
||||||
|
| Subjects | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
|
||||||
|
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
|
||||||
|
| Objects | ObjectID [PK], RDF Object, OriginID [FK] |
|
||||||
|
| Origins | OriginID [PK], Origin Name |
|
||||||
|
| RDFs | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |
|
||||||
633
Scripts/DatasetMerging/datawarehouse.py
Normal file
633
Scripts/DatasetMerging/datawarehouse.py
Normal file
@ -0,0 +1,633 @@
|
|||||||
|
import sqlite3
|
||||||
|
import csv
|
||||||
|
|
||||||
|
#####################################################################
# This file builds DatawareHouse/dataset.db from 1-hop csv files    #
# Its schema is in ./SQL_Queries/db_creation.sql                    #
# The sql query used to populate it in ./SQL_Queries/query.sql      #
#####################################################################

# sometimes you may need to build a new db file, here a little snippet for you
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
# Target database and the 1-hop CSV sources used to populate it.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"

# NOTE(review): these handles stay open for the whole run and are never
# explicitly closed -- tolerable for a one-shot script, but worth a cleanup.
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")


# Single shared connection/cursor used by the module-level helpers below.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()

# MARK: SQL Definitions
# Insert MovieURI
|
||||||
|
|
||||||
|
|
||||||
|
def insertOrigin(curs: sqlite3.Cursor) -> bool:
    """Seed the Origins table with the two known CSV origins.

    Returns True on success, False when the rows already exist
    (unique-constraint violation)."""
    sql = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(sql)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [originName])
|
||||||
|
originId = curs.fetchone()
|
||||||
|
if not originId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return originId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    """Insert *movieUri* into Movies; False when it already exists."""
    try:
        curs.execute("INSERT INTO Movies (MovieURI) VALUES (?);", [movieUri])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [movieUri])
|
||||||
|
movieId = curs.fetchone()
|
||||||
|
if not movieId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return movieId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Link *movieId* to its Wikipedia *pageId*; False on constraint violation."""
    try:
        curs.execute("INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);", [movieId, pageId])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [pageId])
|
||||||
|
movieId = curs.fetchone()
|
||||||
|
if not movieId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return movieId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store the Wikipedia *abstract* of *movieId*; False on constraint violation."""
    try:
        curs.execute("INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);", [movieId, abstract])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Insert an RDF subject with its source origin; False when duplicated."""
    try:
        curs.execute("INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);", [subjectURI, originID])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Insert an RDF relationship URI; False when it already exists."""
    try:
        curs.execute("INSERT INTO Relationships (RelationshipURI) VALUES (?);", [relationshipURI])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Insert an RDF object with its source origin.

    Returns True on success, False on a unique-constraint violation.

    FIX: table name normalized from "objects" to "Objects" for consistency
    with every other query in this module (SQLite treats table names
    case-insensitively, so behavior is unchanged).
    """
    QUERY = "INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [subjectURI])
|
||||||
|
subjectId = curs.fetchone()
|
||||||
|
if not subjectId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return subjectId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [relationshipURI])
|
||||||
|
relationshipId = curs.fetchone()
|
||||||
|
if not relationshipId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return relationshipId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [objectURI])
|
||||||
|
objectId = curs.fetchone()
|
||||||
|
if not objectId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return objectId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
) -> bool:
    """Insert one fully-resolved RDF triple; False on a constraint violation."""
    statement = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
        curs.execute(statement, [movieId, subjectId, relationshipId, objectId])
    except sqlite3.IntegrityError:
        # duplicate triple or broken foreign key
        return False
    return True
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_abbreviation(uri, abbreviation) -> bool:
|
||||||
|
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [uri, abbreviation])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [object_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [relationship_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = (
|
||||||
|
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [subject_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def select_abbreviation_id(uri) -> int | None:
|
||||||
|
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
|
||||||
|
CURS.execute(QUERY, [uri])
|
||||||
|
abbreviation_id = CURS.fetchone()
|
||||||
|
if not abbreviation_id:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return abbreviation_id[0]
|
||||||
|
|
||||||
|
|
||||||
|
# MARK: Parsing
|
||||||
|
def parseMovies():
    """Load every movie URI from the movies CSV into the Movies table."""
    reader = csv.reader(MOVIES_CSV_HANDLER)
    next(reader)  # drop the header row
    for record in reader:
        # first column holds the movie URI
        insertMovie(CURS, record[0])
|
||||||
|
|
||||||
|
|
||||||
|
def parseWikiPageId():
    """Attach each Wikipedia page id to its movie row, skipping unknown movies."""
    reader = csv.DictReader(PAGEID_CSV_HANDLER)
    for record in reader:
        movie_uri = record["subject"]
        page_id = int(record["object"])
        movie_id = selectMovieId(CURS, movie_uri)
        # guard: the movie URI must already exist in the Movies table
        if movie_id is None:
            print(f"The MovieUri: {movie_uri} has not a MovieId ")
            continue
        insertWikiPageId(CURS, movie_id, page_id)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbstract():
    """Store each Wikipedia abstract against the movie found via its page id."""
    reader = csv.DictReader(SUMMARY_CSV_HANDLER)
    for record in reader:
        page_id = int(record["subject"])
        abstract = record["text"]
        movie_id = selectMovieIdFromWikiPageId(CURS, page_id)
        # guard: the page id must map back to a known movie
        if movie_id is None:
            print(f"The WikiPageId: {page_id} has not a MovieId ")
            continue
        insertWikiAbstract(CURS, movie_id, abstract)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbreviations():
    """Load the URI -> abbreviation table from its CSV source."""
    for record in csv.DictReader(URI_ABBR_CSV_HANDLER):
        insert_abbreviation(record["uri"], record["abbreviation"])
|
||||||
|
|
||||||
|
|
||||||
|
def parseRDF_Reverse():
    """Ingest reverse.csv triples and link them into the RDFs table.

    In the reverse file the movie sits on the *object* side of each triple.
    Prints the running total of successfully linked triples at the end.
    """
    reader = csv.DictReader(REVERSE_CSV_HANDLER)
    origin_id = selectOrigin(CURS, "reverse.csv")
    if origin_id is None:
        return

    total = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]
        print(f"RDF triplets:\n\t{subject} - {relationship} - {obj}")

        # duplicates are silently rejected by the insert helpers
        insertSubject(CURS, subject, origin_id)
        insertRelationship(CURS, relationship)
        insertObject(CURS, obj, origin_id)

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)
        # reverse triples: the movie URI is the object
        movie_id = selectMovieId(CURS, obj)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if movie_id is None:
            print(f"No MovieId for {obj}")
            missing = True
        if missing:
            continue

        if insertRDF(CURS, movie_id, subject_id, relationship_id, object_id):  # type: ignore
            total += 1

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseRDF_Dataset():
    """Ingest dataset.csv triples and link them into the RDFs table.

    In the dataset file the movie sits on the *subject* side of each triple.
    Logs progress every 100k rows and prints the linked-triple total at the end.
    """
    reader = csv.DictReader(DATASET_CSV_HANDLER)
    origin_id = selectOrigin(CURS, "dataset.csv")
    if origin_id is None:
        return

    total = 0
    row_count = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]

        row_count += 1
        if row_count % 100000 == 0:
            print(f"RDF number {row_count}:\n\t{subject} - {relationship} - {obj}")

        # duplicates are silently rejected by the insert helpers
        insertSubject(CURS, subject, origin_id)
        insertRelationship(CURS, relationship)
        insertObject(CURS, obj, origin_id)

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)
        # dataset triples: the movie URI is the subject
        movie_id = selectMovieId(CURS, subject)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if movie_id is None:
            print(f"No MovieId for {subject}")
            missing = True
        if missing:
            continue

        if insertRDF(CURS, movie_id, subject_id, relationship_id, object_id):  # type: ignore
            total += 1

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbr_Reverse():
    """Link every reverse.csv entity to its longest matching abbreviation.

    For each triple, URI path prefixes are probed from longest (7 segments)
    down to a per-kind floor; the first hit is linked and the probe stops.
    Prints the number of links created.
    """

    def _link(sections, entity_id, insert_fn, floor):
        # Probe ever-shorter "/"-joined prefixes against the Abbreviations
        # table; return 1 when a new link row is inserted, else 0.
        if len(sections) <= 4:
            return 0
        idx = min(len(sections), 7)
        while idx > floor:
            pattern = "/".join(sections[0:idx]) + "%"
            abbr_id = select_abbreviation_id(pattern)
            if abbr_id is not None:
                # stop at the first (longest) matching prefix
                return 1 if insert_fn(entity_id, abbr_id) else 0
            idx -= 1
        return 0

    reader = csv.DictReader(REVERSE_CSV_HANDLER)
    if selectOrigin(CURS, "reverse.csv") is None:
        return

    total = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if missing:
            continue

        # relationships use a lower floor (shorter URIs) than subjects/objects
        total += _link(subject.split("/"), subject_id, insert_subject_abbreviation, 3)
        total += _link(relationship.split("/"), relationship_id, insert_relationship_abbreviation, 2)
        total += _link(obj.split("/"), object_id, insert_object_abbreviation, 3)

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbr_Dataset():
    """Link every dataset.csv entity to its longest matching abbreviation.

    Same prefix-probing scheme as the reverse pass, with a progress line
    every 100k rows. Prints the number of links created.
    """

    def _link(sections, entity_id, insert_fn, floor):
        # Probe ever-shorter "/"-joined prefixes against the Abbreviations
        # table; return 1 when a new link row is inserted, else 0.
        if len(sections) <= 4:
            return 0
        idx = min(len(sections), 7)
        while idx > floor:
            pattern = "/".join(sections[0:idx]) + "%"
            abbr_id = select_abbreviation_id(pattern)
            if abbr_id is not None:
                # stop at the first (longest) matching prefix
                return 1 if insert_fn(entity_id, abbr_id) else 0
            idx -= 1
        return 0

    reader = csv.DictReader(DATASET_CSV_HANDLER)
    if selectOrigin(CURS, "dataset.csv") is None:
        return

    total = 0
    row_count = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]

        row_count += 1
        if row_count % 100000 == 0:
            print(f"RDF number {row_count}:\n\t{subject} - {relationship} - {obj}")

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if missing:
            continue

        # relationships use a lower floor (shorter URIs) than subjects/objects
        total += _link(subject.split("/"), subject_id, insert_subject_abbreviation, 3)
        total += _link(relationship.split("/"), relationship_id, insert_relationship_abbreviation, 2)
        total += _link(obj.split("/"), object_id, insert_object_abbreviation, 3)

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
# MARK: Actual Code
# The pipeline is run one stage at a time by hand; each stage stays commented
# out once its data has been loaded into the database.
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseAbbreviations()
# parseRDF_Reverse()
# parseRDF_Dataset()
# parseAbbr_Reverse()
parseAbbr_Dataset()


# Persist all pending writes and release the database connection.
CONN.commit()
CONN.close()


# Close every CSV file handle opened at module level.
MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
URI_ABBR_CSV_HANDLER.close()
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
|
The WikiPageId: 10068850 has not a MovieId
|
||||||
|
The WikiPageId: 55069615 has not a MovieId
|
||||||
|
The WikiPageId: 49510056 has not a MovieId
|
||||||
|
The WikiPageId: 4049786 has not a MovieId
|
||||||
|
The WikiPageId: 55510238 has not a MovieId
|
||||||
|
The WikiPageId: 31239628 has not a MovieId
|
||||||
|
The WikiPageId: 34757217 has not a MovieId
|
||||||
|
The WikiPageId: 64311757 has not a MovieId
|
||||||
|
The WikiPageId: 8326198 has not a MovieId
|
||||||
|
The WikiPageId: 42162164 has not a MovieId
|
||||||
|
The WikiPageId: 18502369 has not a MovieId
|
||||||
|
The WikiPageId: 58092358 has not a MovieId
|
||||||
|
The WikiPageId: 40710250 has not a MovieId
|
||||||
|
"""
|
||||||
0
Scripts/Experiments/.gitkeep
Normal file
0
Scripts/Experiments/.gitkeep
Normal file
0
Scripts/Experiments/Queries/.gitkeep
Normal file
0
Scripts/Experiments/Queries/.gitkeep
Normal file
0
Scripts/Experiments/Tmp/.gitkeep
Normal file
0
Scripts/Experiments/Tmp/.gitkeep
Normal file
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
22
Scripts/Libs/CleaningPipeline/special_token.py
Normal file
22
Scripts/Libs/CleaningPipeline/special_token.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
class SpecialToken(str, Enum):
    """Special vocabulary tokens that structure the RDF/text training corpus.

    Subclassing ``str`` first lets each member be used directly wherever a
    plain string is expected (concatenation, joins, tokenizer vocab lists).
    """
    # (Enum, str) -> throws an error
    # --- corpus / triple structure markers ---
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    END_OF_SENTENCE = "<EOS>"
    CORPUS_END = "<END>"

    ## Tasks' Token
    # --- task-selector tokens prepended to a training example ---
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"
|
||||||
|
|
||||||
|
#BPE Training:
|
||||||
|
|
||||||
149
Scripts/Libs/CleaningPipeline/sql_endpoint.py
Normal file
149
Scripts/Libs/CleaningPipeline/sql_endpoint.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
#######################################################
|
||||||
|
# This file stand as endpoint to interact with DB #
|
||||||
|
#######################################################
|
||||||
|
|
||||||
|
# import sqlite3
|
||||||
|
import pandas as pd
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
|
||||||
|
|
||||||
|
class SqlEndpoint():
    """Read-side endpoint for the movie datawarehouse (SQLite).

    Wraps an SQLAlchemy engine and exposes the dataset as pandas DataFrames,
    either whole, chunked by row count, or chunked per movie.
    """

    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
        # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
        # sqlite URL: /// (3 slashes) -> relative path, //// (4 slashes) -> absolute
        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
        # SQLite does not support server-side streamed cursors, so chunking is
        # done client-side through pandas' `chunksize` parameter instead.
        # (PRAGMA tuning helps writes more than reads, so none is applied.)
        self.chunk_size_row = chunk_size_row  # not used now, since each chunk is a movie
        # Movie id column, materialized once; drives
        # get_abbreviated_dataset_by_movie_id(). (The redundant
        # `movie_ids = ...` double assignment was removed.)
        self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]

    def get_RDF(self) -> pd.DataFrame:
        """Return every RDF triple joined with its raw (un-abbreviated) URIs.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI
        """
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
        FROM RDFs
        INNER JOIN Subjects USING (SubjectID)
        INNER JOIN Relationships USING (RelationshipID)
        INNER JOIN Objects USING (ObjectID);
        """
        # BUG FIX: this previously queried self.CONN, which is never created
        # (the sqlite3 connection in __init__ is commented out) and therefore
        # raised AttributeError on every call; the engine is the live handle.
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame:
        """Stream the abbreviated dataset in fixed-size row chunks.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
            (an iterator of frames, one per `chunk_size_row` rows, because
            `chunksize` is passed to pandas)
        """
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_chunked_abbreviated_dataset_with_start_token(self) -> pd.DataFrame:
        # DEPRECATED !
        # NOTE(review): this method was already broken when deprecated —
        # SpecialToken is an Enum and cannot be instantiated bare, and the
        # three `?` placeholders below are never bound (no `params=`).
        # Kept verbatim for reference; do not call.
        start_token = SpecialToken()
        QUERY = """
        SELECT
            MovieID,
            ? || SubjectURI AS SubjectURI,
            ? || RelationshipURI AS RelationshipURI,
            ? || ObjectURI AS ObjectURI,
            Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_abbreviated_dataset_by_movie_id(self):  # -> iter[pd.DataFrame]:
        """Yield one DataFrame per movie (all of that movie's dataset rows).

        The retrieved RDFs are already abbreviated by the sql parser.
        Chunking by MovieID keeps each movie's abstract together so
        per-movie logic can run on a complete frame.

        Yields:
            pd.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
        """
        # Hand-picked sample ids, kept for quick experiments:
        #   The Dark Knight: 117248, Inception: 147074, The Avengers: 113621,
        #   Cast Away: 1123, The Departed: 117586, American Psycho: 90177,
        #   Avatar: 71587, Django Unchained: 138952, Spirited Away: 144137,
        #   Knives Out: 148025
        # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        # movie_ids = movie_list
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?);
        """
        for movie_id in self.movie_ids:
            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))

    def get_movies_id_count(self) -> pd.DataFrame:
        """Gets the count of each Movie in the Dataset.

        Returns:
            pd.DataFrame: [MovieID, Count]
        """
        QUERY = """
        SELECT MovieID, COUNT(*) AS Count
        FROM RDFs
        GROUP BY MovieID;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_relationship_count(self) -> pd.DataFrame:
        """Gets the count of each Relationship in the Dataset.

        Returns:
            pd.DataFrame: [RelationshipURI, Count]
        """
        QUERY = """
        SELECT RelationshipURI, COUNT(*) AS Count
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        GROUP BY RelationshipURI;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_dataframe_from_query(self, query: str, params=None):
        """Run an arbitrary SELECT and return the result as a DataFrame.

        Args:
            query: SQL text, optionally with bound-parameter placeholders.
            params: parameter sequence/dict forwarded to pandas, or None.
        """
        if params is None:
            return pd.read_sql_query(query, self.sql_engine)
        return pd.read_sql_query(query, self.sql_engine, params=params)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__" :
    # Smoke test: stream the abbreviated dataset movie-by-movie and print
    # each per-movie frame.
    sql_endpoint = SqlEndpoint()
    for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
        print(pandas_row)
    # sql_endpoint.get_RDF()
    print("done")
|
||||||
0
Scripts/Libs/Utils/.gitkeep
Normal file
0
Scripts/Libs/Utils/.gitkeep
Normal file
9
Scripts/Libs/Utils/dataframe_interaction.py
Normal file
9
Scripts/Libs/Utils/dataframe_interaction.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
    """Concatenate every cell of *DF* into one flat string, row by row.

    Args:
        DF: frame whose cells are stringified with ``str()``.

    Returns:
        All cell values joined with no separator, in row-major order;
        "" for an empty frame.
    """
    # PERF FIX: a single str.join is O(n) in total output length; the
    # original ``output +=`` loop re-allocated the accumulator per row
    # (quadratic worst case).
    return "".join(
        str(cell)
        for row in DF.itertuples(index=False, name=None)
        for cell in row
    )
|
||||||
897
Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
Normal file
897
Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
Normal file
@ -0,0 +1,897 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "3zbCui3XtIGozHXTVAGRp",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 316.5,
|
||||||
|
"y": 123,
|
||||||
|
"width": 436.5,
|
||||||
|
"height": 145.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a0",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1698427950,
|
||||||
|
"version": 35,
|
||||||
|
"versionNonce": 601575602,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "wD66RDbG05HfvRhAtMb0J",
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gus_rxauKJ6T2L_F59PfN",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818588814,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "wD66RDbG05HfvRhAtMb0J",
|
||||||
|
"type": "text",
|
||||||
|
"x": 480.98004150390625,
|
||||||
|
"y": 183.25,
|
||||||
|
"width": 107.5399169921875,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a1",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 910769774,
|
||||||
|
"version": 31,
|
||||||
|
"versionNonce": 1120989938,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818416720,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "dataset.db",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "3zbCui3XtIGozHXTVAGRp",
|
||||||
|
"originalText": "dataset.db",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "87-MeaiZGT1wln0nggYPZ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 339.5,
|
||||||
|
"y": 309.5,
|
||||||
|
"width": 392,
|
||||||
|
"height": 156,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a2",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 655550318,
|
||||||
|
"version": 77,
|
||||||
|
"versionNonce": 1103939826,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818339000,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "EjUxEhZqEBzwvlw0VE9eJ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 355.5,
|
||||||
|
"y": 327,
|
||||||
|
"width": 162,
|
||||||
|
"height": 125.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1739846638,
|
||||||
|
"version": 64,
|
||||||
|
"versionNonce": 1594290034,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "ogRkV0neHrhEKTE6zlggl"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818391415,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ogRkV0neHrhEKTE6zlggl",
|
||||||
|
"type": "text",
|
||||||
|
"x": 378.7100524902344,
|
||||||
|
"y": 377.25,
|
||||||
|
"width": 115.57989501953125,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3V",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2037675630,
|
||||||
|
"version": 12,
|
||||||
|
"versionNonce": 1286472046,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818399222,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "RDF_String",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "EjUxEhZqEBzwvlw0VE9eJ",
|
||||||
|
"originalText": "RDF_String",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 542.5,
|
||||||
|
"y": 327,
|
||||||
|
"width": 173,
|
||||||
|
"height": 125.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a4",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1189796530,
|
||||||
|
"version": 99,
|
||||||
|
"versionNonce": 1071057006,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "rsapATFAT5YSBCXzLupgZ"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gus_rxauKJ6T2L_F59PfN",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Wk1bJbbtC31FqObEL5xWt",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818593647,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "rsapATFAT5YSBCXzLupgZ",
|
||||||
|
"type": "text",
|
||||||
|
"x": 585.6800384521484,
|
||||||
|
"y": 377.25,
|
||||||
|
"width": 86.63992309570312,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a5",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 829619694,
|
||||||
|
"version": 12,
|
||||||
|
"versionNonce": 713902318,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818405150,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Abstract",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"originalText": "Abstract",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 316.5,
|
||||||
|
"y": 511,
|
||||||
|
"width": 436.5,
|
||||||
|
"height": 145.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a6",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 492582894,
|
||||||
|
"version": 132,
|
||||||
|
"versionNonce": 893797614,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "6E23g-rgowNqHsBxX-LuM"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hyFKqXwet_F79QM71atgI",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "x_DP1FcQ7jraGz0gBuDi3",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1IGbCps2EHnzKgJUWM5nq",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Wk1bJbbtC31FqObEL5xWt",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818593647,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "6E23g-rgowNqHsBxX-LuM",
|
||||||
|
"type": "text",
|
||||||
|
"x": 499.9100341796875,
|
||||||
|
"y": 571.25,
|
||||||
|
"width": 69.679931640625,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a7",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 267696178,
|
||||||
|
"version": 132,
|
||||||
|
"versionNonce": 1668243186,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818543211,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Pandas",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"originalText": "Pandas",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ohj18N4AOTDz5lJNcV9gi",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 261,
|
||||||
|
"y": 765.5,
|
||||||
|
"width": 157,
|
||||||
|
"height": 87,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a8",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1446207150,
|
||||||
|
"version": 279,
|
||||||
|
"versionNonce": 317375026,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hyFKqXwet_F79QM71atgI",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
|
||||||
|
"type": "text",
|
||||||
|
"x": 297.0800323486328,
|
||||||
|
"y": 796.5,
|
||||||
|
"width": 84.83993530273438,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a9",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 435116270,
|
||||||
|
"version": 199,
|
||||||
|
"versionNonce": 1282911218,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "train.txt",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "ohj18N4AOTDz5lJNcV9gi",
|
||||||
|
"originalText": "train.txt",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "A4Y54Y26fe257U_QU9lxX",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 464,
|
||||||
|
"y": 765.5,
|
||||||
|
"width": 157,
|
||||||
|
"height": 87,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aA",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 186148850,
|
||||||
|
"version": 232,
|
||||||
|
"versionNonce": 997119858,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "v4TvUlDEjH7EvPDmtbOn2",
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1IGbCps2EHnzKgJUWM5nq",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "v4TvUlDEjH7EvPDmtbOn2",
|
||||||
|
"type": "text",
|
||||||
|
"x": 476.3500442504883,
|
||||||
|
"y": 796.5,
|
||||||
|
"width": 132.29991149902344,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1131059634,
|
||||||
|
"version": 171,
|
||||||
|
"versionNonce": 239540530,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "validation.txt",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "A4Y54Y26fe257U_QU9lxX",
|
||||||
|
"originalText": "validation.txt",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "mPaYpJ9Xn7tlJPmKPqJKJ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 674.5,
|
||||||
|
"y": 765.5,
|
||||||
|
"width": 157,
|
||||||
|
"height": 87,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aC",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1049323314,
|
||||||
|
"version": 235,
|
||||||
|
"versionNonce": 330560690,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "kg9nm2rpud6cax5aNPSnu"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "x_DP1FcQ7jraGz0gBuDi3",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "kg9nm2rpud6cax5aNPSnu",
|
||||||
|
"type": "text",
|
||||||
|
"x": 711.4300231933594,
|
||||||
|
"y": 796.5,
|
||||||
|
"width": 83.13995361328125,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aD",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 522572142,
|
||||||
|
"version": 193,
|
||||||
|
"versionNonce": 1920372338,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "test.txt",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
|
||||||
|
"originalText": "test.txt",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hyFKqXwet_F79QM71atgI",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 534.65,
|
||||||
|
"y": 661.5,
|
||||||
|
"width": 195.25,
|
||||||
|
"height": 99,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aG",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 873266098,
|
||||||
|
"version": 71,
|
||||||
|
"versionNonce": 541154738,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-195.25,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-195.25,
|
||||||
|
99
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.49977090492554405,
|
||||||
|
1.034364261168385
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "ohj18N4AOTDz5lJNcV9gi",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.4993630573248406,
|
||||||
|
-0.05747126436781609
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": true,
|
||||||
|
"fixedSegments": null,
|
||||||
|
"startIsSpecial": null,
|
||||||
|
"endIsSpecial": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "x_DP1FcQ7jraGz0gBuDi3",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 534.65,
|
||||||
|
"y": 661.5,
|
||||||
|
"width": 218.25,
|
||||||
|
"height": 99,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1210817582,
|
||||||
|
"version": 77,
|
||||||
|
"versionNonce": 1483392370,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818580594,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
218.25,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
218.25,
|
||||||
|
99
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.49977090492554405,
|
||||||
|
1.034364261168385
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.4993630573248406,
|
||||||
|
-0.05747126436781609
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": true,
|
||||||
|
"fixedSegments": null,
|
||||||
|
"startIsSpecial": null,
|
||||||
|
"endIsSpecial": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1IGbCps2EHnzKgJUWM5nq",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 534.65,
|
||||||
|
"y": 661.5,
|
||||||
|
"width": 0.5719232650604908,
|
||||||
|
"height": 99.07394122590165,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aK",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1205316658,
|
||||||
|
"version": 96,
|
||||||
|
"versionNonce": 1748050674,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-0.5719232650604908,
|
||||||
|
99.07394122590165
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.49977090492554405,
|
||||||
|
1.034364261168385
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "A4Y54Y26fe257U_QU9lxX",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.44635717665566554,
|
||||||
|
-0.056621365219521276
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": true,
|
||||||
|
"fixedSegments": null,
|
||||||
|
"startIsSpecial": null,
|
||||||
|
"endIsSpecial": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gus_rxauKJ6T2L_F59PfN",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 539,
|
||||||
|
"y": 271.5,
|
||||||
|
"width": 0,
|
||||||
|
"height": 33.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 763990258,
|
||||||
|
"version": 17,
|
||||||
|
"versionNonce": 1028811378,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818588814,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
33.5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "3zbCui3XtIGozHXTVAGRp",
|
||||||
|
"focus": -0.019473081328751418,
|
||||||
|
"gap": 3
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"focus": -1.0404624277456647,
|
||||||
|
"gap": 30.7545797799829
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Wk1bJbbtC31FqObEL5xWt",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 536.5,
|
||||||
|
"y": 468.5,
|
||||||
|
"width": 0,
|
||||||
|
"height": 39,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1489771054,
|
||||||
|
"version": 33,
|
||||||
|
"versionNonce": 1828178606,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818593647,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
39
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"focus": 1.0693641618497107,
|
||||||
|
"gap": 27.157190169432425
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"focus": 0.008018327605956525,
|
||||||
|
"gap": 3.5
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
@ -0,0 +1,826 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 4622,
|
||||||
|
"versionNonce": 1623045672,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "twu_PiAvEuQ4l1YYtZLET",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 289.8504963515835,
|
||||||
|
"y": 91.87474806402287,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.09201683999922,
|
||||||
|
"height": 99.49948667804088,
|
||||||
|
"seed": 1975340120,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.2542098813493443,
|
||||||
|
75.20117273657175
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.011896425679918422,
|
||||||
|
83.76249969444815
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.970409367559332,
|
||||||
|
87.46174320643391
|
||||||
|
],
|
||||||
|
[
|
||||||
|
17.75573317066317,
|
||||||
|
90.59250103325854
|
||||||
|
],
|
||||||
|
[
|
||||||
|
41.05683533152865,
|
||||||
|
91.56737225214069
|
||||||
|
],
|
||||||
|
[
|
||||||
|
63.319497586673116,
|
||||||
|
90.01084754868091
|
||||||
|
],
|
||||||
|
[
|
||||||
|
75.14781395923075,
|
||||||
|
86.28844687220405
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.81603792670788,
|
||||||
|
83.15042405259751
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.05033394391478,
|
||||||
|
76.25776215104557
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.86643881413028,
|
||||||
|
6.3089586511537865
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.45188016352971,
|
||||||
|
-0.2999144698665015
|
||||||
|
],
|
||||||
|
[
|
||||||
|
71.50179495549581,
|
||||||
|
-3.9936571317850627
|
||||||
|
],
|
||||||
|
[
|
||||||
|
61.077971898861186,
|
||||||
|
-6.132877429442784
|
||||||
|
],
|
||||||
|
[
|
||||||
|
37.32348754161154,
|
||||||
|
-7.932114425900202
|
||||||
|
],
|
||||||
|
[
|
||||||
|
18.278415656797975,
|
||||||
|
-6.859225353587373
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.2995959613238286,
|
||||||
|
-3.2201165291205287
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-0.04168289608444441,
|
||||||
|
-0.045185660461322996
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a1",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2327,
|
||||||
|
"versionNonce": 1593094440,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "hmJk4dH9VpOsfkrCTkhvh",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 290.3744257898585,
|
||||||
|
"y": 149.00103172175278,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 637665624,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a2",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2413,
|
||||||
|
"versionNonce": 311708712,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "X1ldVIXm4DfBal5N2Pwn9",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 289.3425684673547,
|
||||||
|
"y": 120.03697638652972,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 904402520,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a3",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 5410,
|
||||||
|
"versionNonce": 92833576,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "CFhp5ZxSVwHYzGUj4hEn1",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 288.28461948527263,
|
||||||
|
"y": 84.74247943834126,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 76.59753601865496,
|
||||||
|
"height": 15.49127539284798,
|
||||||
|
"seed": 1782811480,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [
|
||||||
|
"bxuMGTzXLn7H-uBCptINx"
|
||||||
|
],
|
||||||
|
"index": "a4",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 820,
|
||||||
|
"versionNonce": 608002600,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "B43R7rWwK2_vdiRHBSSPk",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 324.77660659049513,
|
||||||
|
"y": 109.21914711824485,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1298686040,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "a5",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1108,
|
||||||
|
"versionNonce": 1839127848,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "CkKMb9wkJfVk04T217zSs",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 325.12774837442873,
|
||||||
|
"y": 135.43576140530996,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 2133497176,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "a6",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 991,
|
||||||
|
"versionNonce": 588838952,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "SHJdKeQPkfpvzSoNH--3o",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 6.239590202363168,
|
||||||
|
"x": 325.77660659049513,
|
||||||
|
"y": 164.20448797661635,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 81668696,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "a7",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"version": 489,
|
||||||
|
"versionNonce": 2023207720,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "vUSyMBPup0jZ71CYXKyGb",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 280.1846389770508,
|
||||||
|
"y": 185.79462957545917,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 95.63072204589844,
|
||||||
|
"height": 23.595161071904883,
|
||||||
|
"seed": 425140056,
|
||||||
|
"groupIds": [
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"fontSize": 17.4778970902999,
|
||||||
|
"fontFamily": 1,
|
||||||
|
"text": "dataset.db",
|
||||||
|
"baseline": 16.595161071904883,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"index": "a8",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "dataset.db",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.350000000000001
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "R7pU0VP6CFKCAwuvt0xsr",
|
||||||
|
"type": "text",
|
||||||
|
"x": 295.5,
|
||||||
|
"y": 342,
|
||||||
|
"width": 374,
|
||||||
|
"height": 225,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a9",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 705463336,
|
||||||
|
"version": 1130,
|
||||||
|
"versionNonce": 72522328,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648226024,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "G1xIRcJgm34_NMEWQFFlW",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1419.5,
|
||||||
|
"y": 110,
|
||||||
|
"width": 253,
|
||||||
|
"height": 75,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aA",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 651981400,
|
||||||
|
"version": 256,
|
||||||
|
"versionNonce": 138082856,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758646570344,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Pipeline\n - actions: [Action]\n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Pipeline\n - actions: [Action]\n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TBVy3JbJCkbA9kjVEJ8lv",
|
||||||
|
"type": "text",
|
||||||
|
"x": 694,
|
||||||
|
"y": 100,
|
||||||
|
"width": 495,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 680960040,
|
||||||
|
"version": 560,
|
||||||
|
"versionNonce": 85012520,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649442239,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "an7KRTzWpCytKNKgHftKC",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1528.5,
|
||||||
|
"y": 365.5,
|
||||||
|
"width": 187,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aC",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1974317656,
|
||||||
|
"version": 306,
|
||||||
|
"versionNonce": 1574962264,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648154009,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "2pQ5EULirrWs_QZPbClhh",
|
||||||
|
"type": "text",
|
||||||
|
"x": 785,
|
||||||
|
"y": 332.5,
|
||||||
|
"width": 418,
|
||||||
|
"height": 375,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1402251560,
|
||||||
|
"version": 742,
|
||||||
|
"versionNonce": 680432168,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649532881,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "O0fso8DJqFfwJEzmpUikM",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1289,
|
||||||
|
"y": 195,
|
||||||
|
"width": 594,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aI",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1582329944,
|
||||||
|
"version": 459,
|
||||||
|
"versionNonce": 1080077144,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758647067031,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "v0kzO6vlBWOdJCV3yoG69",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1379.5,
|
||||||
|
"y": 718.5,
|
||||||
|
"width": 286,
|
||||||
|
"height": 175,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1462407976,
|
||||||
|
"version": 635,
|
||||||
|
"versionNonce": 1012998696,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649495598,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "WK34n9xeVxntypCtrlK6p",
|
||||||
|
"type": "text",
|
||||||
|
"x": 256.5,
|
||||||
|
"y": 787.5,
|
||||||
|
"width": 517,
|
||||||
|
"height": 175,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1166526296,
|
||||||
|
"version": 318,
|
||||||
|
"versionNonce": 1042162520,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649002604,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "NY9jyUFLFFCNPE2sh00SX",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1639,
|
||||||
|
"y": 606.5,
|
||||||
|
"width": 407,
|
||||||
|
"height": 200,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aP",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 20345896,
|
||||||
|
"version": 168,
|
||||||
|
"versionNonce": 627282472,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649426380,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "SkhaoW-3TTKDZzEii3Lf6",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1457.5,
|
||||||
|
"y": 955.5,
|
||||||
|
"width": 121,
|
||||||
|
"height": 50,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aQ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2071523672,
|
||||||
|
"version": 37,
|
||||||
|
"versionNonce": 105260376,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648834435,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Dump:\n -",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Dump:\n -",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
634
Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
Normal file
634
Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
Normal file
@ -0,0 +1,634 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "JNB9z-PeqZ4s8KDfWaoXe",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 106,
|
||||||
|
"y": 27,
|
||||||
|
"width": 653,
|
||||||
|
"height": 263,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a2",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 710740889,
|
||||||
|
"version": 326,
|
||||||
|
"versionNonce": 1107631703,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759156408059,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "e13wNTgUpn2flMpmMttqx",
|
||||||
|
"type": "text",
|
||||||
|
"x": 200.5943407656526,
|
||||||
|
"y": 44.07937975075269,
|
||||||
|
"width": 307.2781467269385,
|
||||||
|
"height": 23.3097531902191,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1012740663,
|
||||||
|
"version": 444,
|
||||||
|
"versionNonce": 589551257,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759156408059,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Libs/CleaningPipeline/sql_endpoint",
|
||||||
|
"fontSize": 18.64780255217528,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Libs/CleaningPipeline/sql_endpoint",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "CgxCElJkKBtIHv-5WQrbo",
|
||||||
|
"type": "text",
|
||||||
|
"x": 195,
|
||||||
|
"y": 80.44259472749451,
|
||||||
|
"width": 403.64997665852184,
|
||||||
|
"height": 186.4780255217528,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a4",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1261951799,
|
||||||
|
"version": 507,
|
||||||
|
"versionNonce": 1922906999,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759156408059,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
|
||||||
|
"fontSize": 18.64780255217528,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 4979,
|
||||||
|
"versionNonce": 1473849177,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "sYReMTdYblr-oJtYYJALU",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -67.14432426259049,
|
||||||
|
"y": 87.19293561900287,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.09201683999922,
|
||||||
|
"height": 99.49948667804088,
|
||||||
|
"seed": 1263944119,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.2542098813493443,
|
||||||
|
75.20117273657175
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.011896425679918422,
|
||||||
|
83.76249969444815
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.970409367559332,
|
||||||
|
87.46174320643391
|
||||||
|
],
|
||||||
|
[
|
||||||
|
17.75573317066317,
|
||||||
|
90.59250103325854
|
||||||
|
],
|
||||||
|
[
|
||||||
|
41.05683533152865,
|
||||||
|
91.56737225214069
|
||||||
|
],
|
||||||
|
[
|
||||||
|
63.319497586673116,
|
||||||
|
90.01084754868091
|
||||||
|
],
|
||||||
|
[
|
||||||
|
75.14781395923075,
|
||||||
|
86.28844687220405
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.81603792670788,
|
||||||
|
83.15042405259751
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.05033394391478,
|
||||||
|
76.25776215104557
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.86643881413028,
|
||||||
|
6.3089586511537865
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.45188016352971,
|
||||||
|
-0.2999144698665015
|
||||||
|
],
|
||||||
|
[
|
||||||
|
71.50179495549581,
|
||||||
|
-3.9936571317850627
|
||||||
|
],
|
||||||
|
[
|
||||||
|
61.077971898861186,
|
||||||
|
-6.132877429442784
|
||||||
|
],
|
||||||
|
[
|
||||||
|
37.32348754161154,
|
||||||
|
-7.932114425900202
|
||||||
|
],
|
||||||
|
[
|
||||||
|
18.278415656797975,
|
||||||
|
-6.859225353587373
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.2995959613238286,
|
||||||
|
-3.2201165291205287
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-0.04168289608444441,
|
||||||
|
-0.045185660461322996
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a6",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2684,
|
||||||
|
"versionNonce": 952947769,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "0S6dEWQVqKUVkP6Z5IX1l",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -66.6203948243155,
|
||||||
|
"y": 144.31921927673278,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 817033943,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a7",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2770,
|
||||||
|
"versionNonce": 477619481,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "szGLND7J0nVOvRkNXX9AS",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -67.65225214681931,
|
||||||
|
"y": 115.35516394150972,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 1704755191,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a8",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 5767,
|
||||||
|
"versionNonce": 2119031289,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "O3t2uGktJlDd1_OX_bpV4",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -68.71020112890136,
|
||||||
|
"y": 80.06066699332126,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 76.59753601865496,
|
||||||
|
"height": 15.49127539284798,
|
||||||
|
"seed": 471296279,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [
|
||||||
|
"bxuMGTzXLn7H-uBCptINx"
|
||||||
|
],
|
||||||
|
"index": "a9",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1177,
|
||||||
|
"versionNonce": 525480665,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "_SzKlOBOvJgBg7FX0JTTM",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -32.218214023678854,
|
||||||
|
"y": 104.53733467322485,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1368927799,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "aA",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1465,
|
||||||
|
"versionNonce": 1410887609,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "oJMl2Kxa3SPaiAY0kxo7A",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -31.867072239745255,
|
||||||
|
"y": 130.75394896028996,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1627606871,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "aB",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1348,
|
||||||
|
"versionNonce": 314839193,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "fB6pJBSMA-pRHrpgYKaLL",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 6.239590202363168,
|
||||||
|
"x": -31.218214023678854,
|
||||||
|
"y": 159.52267553159635,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1420643447,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "aC",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"version": 846,
|
||||||
|
"versionNonce": 1091081593,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "9gZ3Yy1MeP9kEOTLODqLG",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -76.81018163712321,
|
||||||
|
"y": 181.11281713043917,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 95.63072204589844,
|
||||||
|
"height": 23.595161071904883,
|
||||||
|
"seed": 2019206551,
|
||||||
|
"groupIds": [
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"fontSize": 17.4778970902999,
|
||||||
|
"fontFamily": 1,
|
||||||
|
"text": "dataset.db",
|
||||||
|
"baseline": 16.595161071904883,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"index": "aD",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "dataset.db",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.350000000000001
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "3eOw20xMhpB5jf_RMG24P",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1131.3333333333335,
|
||||||
|
"y": 31.333333333333428,
|
||||||
|
"width": 508.3333333333333,
|
||||||
|
"height": 550,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aE",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1535658041,
|
||||||
|
"version": 821,
|
||||||
|
"versionNonce": 1630266809,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759157181677,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
|
||||||
|
"autoResize": false,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Fbl1gpb5r7QrdRauGUWm2",
|
||||||
|
"type": "text",
|
||||||
|
"x": 158.23809523809535,
|
||||||
|
"y": 502.52380952380935,
|
||||||
|
"width": 484.2857142857143,
|
||||||
|
"height": 500,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aF",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2066618807,
|
||||||
|
"version": 552,
|
||||||
|
"versionNonce": 1269344823,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759158199532,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
|
||||||
|
"autoResize": false,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
215
docs/DBPEDIA.md
Normal file
215
docs/DBPEDIA.md
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
# DBPedia
|
||||||
|
|
||||||
|
## GraphIRI
|
||||||
|
|
||||||
|
This is the graph identifier (URI):
|
||||||
|
|
||||||
|
`http://dbpedia.org`
|
||||||
|
|
||||||
|
## History of queries
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
{
|
||||||
|
SELECT ?object
|
||||||
|
WHERE {
|
||||||
|
?m rdf:type dbo:Film .
|
||||||
|
?object ?r ?m
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2 Hops
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
||||||
|
{
|
||||||
|
SELECT ?object
|
||||||
|
WHERE {
|
||||||
|
?m rdf:type dbo:Film .
|
||||||
|
?object ?r ?m
|
||||||
|
FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LIMIT 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1 Hop
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?object rdf:type dbo:Film .
|
||||||
|
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
||||||
|
}
|
||||||
|
LIMIT 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?subject
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject
|
||||||
|
WHERE {
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject
|
||||||
|
WHERE {
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?subject
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?subject
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink,
|
||||||
|
foaf:primaryTopic
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Wikipedia-movie
|
||||||
|
|
||||||
|
a.k.a. the file with the Wikipedia abstract
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject , ?object
|
||||||
|
WHERE {
|
||||||
|
?subject foaf:primaryTopic ?object .
|
||||||
|
?object rdf:type dbo:Film
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Reverse
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?object rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?object
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink,
|
||||||
|
foaf:primaryTopic
|
||||||
|
))
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?object rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?object
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink,
|
||||||
|
foaf:primaryTopic
|
||||||
|
))
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Film / wiki page ID
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
|
||||||
|
|
||||||
|
SELECT ?subject ?pageID
|
||||||
|
WHERE {
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?subject dbo:wikiPageID ?pageID .
|
||||||
|
?subject rdfs:label ?label .
|
||||||
|
FILTER (lang(?label) = "en")
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
3
docs/DEVELOPMENT.md
Normal file
3
docs/DEVELOPMENT.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Development
|
||||||
|
|
||||||
|
## Data Gathering
|
||||||
108
docs/RESOURCES.md
Normal file
108
docs/RESOURCES.md
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
# Resources
|
||||||
|
|
||||||
|
## Byte-Pair Encoding (BPE)
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
|
||||||
|
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Key Idea
|
||||||
|
|
||||||
|
BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
|
||||||
|
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Algorithm Steps
|
||||||
|
|
||||||
|
1. **Initialization**
|
||||||
|
- Treat each character of the input text as a token.
|
||||||
|
|
||||||
|
2. **Find Frequent Pairs**
|
||||||
|
- Count all adjacent token pairs in the sequence.
|
||||||
|
|
||||||
|
3. **Merge Most Frequent Pair**
|
||||||
|
- Replace the most frequent pair with a new symbol not used in the text.
|
||||||
|
|
||||||
|
4. **Repeat**
|
||||||
|
- Continue until no frequent pairs remain or a desired vocabulary size is reached.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
Suppose the data to be encoded is:
|
||||||
|
|
||||||
|
```text
|
||||||
|
aaabdaaabac
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 1: Merge `"aa"`
|
||||||
|
|
||||||
|
Most frequent pair: `"aa"` → replace with `"Z"`
|
||||||
|
|
||||||
|
```text
|
||||||
|
ZabdZabac
|
||||||
|
Z = aa
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Step 2: Merge `"ab"`
|
||||||
|
|
||||||
|
Most frequent pair: `"ab"` → replace with `"Y"`
|
||||||
|
|
||||||
|
```text
|
||||||
|
ZYdZYac
|
||||||
|
Y = ab
|
||||||
|
Z = aa
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Step 3: Merge `"ZY"`
|
||||||
|
|
||||||
|
Most frequent pair: `"ZY"` → replace with `"X"`
|
||||||
|
|
||||||
|
```text
|
||||||
|
XdXac
|
||||||
|
X = ZY
|
||||||
|
Y = ab
|
||||||
|
Z = aa
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
At this point, no pairs occur more than once, so the process stops.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Decompression
|
||||||
|
|
||||||
|
To recover the original data, replacements are applied in **reverse order**:
|
||||||
|
|
||||||
|
```text
|
||||||
|
XdXac
|
||||||
|
→ ZYdZYac
|
||||||
|
→ ZabdZabac
|
||||||
|
→ aaabdaaabac
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Advantages
|
||||||
|
|
||||||
|
- **Efficient vocabulary building**: reduces the need for massive word lists.
|
||||||
|
- **Handles rare words**: breaks them into meaningful subword units.
|
||||||
|
- **Balances character- and word-level tokenization**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Limitations
|
||||||
|
|
||||||
|
- Does not consider linguistic meaning—merges are frequency-based.
|
||||||
|
- May create tokens that are not linguistically natural.
|
||||||
|
- Vocabulary is fixed after training.
|
||||||
67
docs/SPARQL.md
Normal file
67
docs/SPARQL.md
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
# SparQL
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Resources taken from [this website](https://sparql.dev/)
|
||||||
|
|
||||||
|
## SPARQL Queries
|
||||||
|
|
||||||
|
### SELECT
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
SELECT ?var1 ?var2 ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### WHERE
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
WHERE {
|
||||||
|
pattern1 .
|
||||||
|
pattern2 .
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### FILTER
|
||||||
|
|
||||||
|
It's used to restrict [`WHERE`](#where) clauses
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
WHERE {
|
||||||
|
?person <http://example.com/hasCar> ?car .
|
||||||
|
FILTER (?car = <http://example.com/Car1>)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### OPTIONAL
|
||||||
|
|
||||||
|
It's used to fetch optional content if it exists
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
SELECT ?person ?car
|
||||||
|
WHERE {
|
||||||
|
?person <http://example.com/hasCar> ?car .
|
||||||
|
OPTIONAL {
|
||||||
|
?car <http://example.com/hasColor> ?color .
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### LIMIT
|
||||||
|
|
||||||
|
Limits the number of results returned
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
LIMIT 10 # Take only 10 results
|
||||||
|
```
|
||||||
|
|
||||||
|
## SparQL functions
|
||||||
|
|
||||||
|
### COUNT
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
SELECT (COUNT(?person) AS ?count)
|
||||||
|
WHERE {
|
||||||
|
?person <http://example.com/hasCar> ?car .
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
BIN
environment.yaml
Normal file
BIN
environment.yaml
Normal file
Binary file not shown.
18
requirements.txt
Normal file
18
requirements.txt
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
certifi==2025.8.3
|
||||||
|
charset-normalizer==3.4.3
|
||||||
|
idna==3.10
|
||||||
|
numpy==2.3.3
|
||||||
|
pandas==2.3.2
|
||||||
|
pyparsing==3.2.4
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
pytz==2025.2
|
||||||
|
rdflib==7.1.4
|
||||||
|
requests==2.32.5
|
||||||
|
setuptools==78.1.1
|
||||||
|
six==1.17.0
|
||||||
|
SPARQLWrapper==2.0.0
|
||||||
|
tzdata==2025.2
|
||||||
|
urllib3==2.5.0
|
||||||
|
wheel==0.45.1
|
||||||
|
Wikipedia-API==0.8.1
|
||||||
|
SQLAlchemy
|
||||||
Loading…
x
Reference in New Issue
Block a user