Added possibility to whitelist relationships

add divide method to create hold out dataset
Added EOS token
2025-10-12 12:26:26 +02:00 · 2025-10-11 16:49:36 +02:00 · 2025-10-07 22:47:59 +02:00 · 2025-10-07 15:49:25 +02:00 · 2025-10-07 00:54:00 +02:00 · 2025-10-06 10:57:50 +02:00
23 changed files with 3212 additions and 1 deletions
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -0,0 +1,24 @@
 {
  // Always treat the project root as the working dir for Jupyter
  "jupyter.notebookFileRoot": "${workspaceFolder}",
  // When you click "Run Python File in Terminal", DON'T cd into the file's folder
  "python.terminal.executeInFileDir": false,
  // Start new integrated terminals at the project root
  "terminal.integrated.cwd": "${workspaceFolder}",
  // Ensure Python can import from the project root no matter which file you run
  // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
  "terminal.integrated.env.linux": {
    "PYTHONPATH": "${workspaceFolder}"
  },
  // Make pytest run from the root without needing a pytest.ini
  "python.testing.pytestEnabled": true,
  "python.testing.cwd": "${workspaceFolder}",
  "python.testing.pytestArgs": ["src/test"],
  // Help Pylance resolve imports like `from src...` without red squiggles
  "python.analysis.extraPaths": ["${workspaceFolder}"]
 }
--- a/Scripts/DataCleaning/data_output_models/bpe_corpus.py
+++ b/Scripts/DataCleaning/data_output_models/bpe_corpus.py
@@ -0,0 +1,21 @@
 from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 import pandas as pd
 class BPE_corpus():
    """
        Writer for the BPE corpus text file.
        Text is appended chunk by chunk; close() appends SpecialToken.CORPUS_END
        and then releases the file. The object can also be used in a `with` block.
    """
    def __init__(self, output_path :str):
        """
        Args:
            output_path (str): file that will be created (or truncated) and filled with the corpus
        """
        # explicit UTF-8, otherwise the encoding depends on the platform default
        self.output_handler = open(output_path, "w", encoding="utf-8")
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # never swallow the exception raised inside the with block
        return False
    def close(self):
        """Append the corpus end token and close the file. Calling it twice is harmless."""
        if self.output_handler.closed:
            return
        try:
            # add corpus end before closing
            self.output_handler.write(SpecialToken.CORPUS_END.value)
        finally:
            # release the handle even if the last write fails
            self.output_handler.close()
    def write_from_str(self, output: str):
        """
        Append `output` to the corpus as it is. Empty strings are skipped.
        Args:
            output (str): text to append
        """
        if output == '':
            return
        self.output_handler.write(output)
    def write_from_df(self, df: pd.DataFrame):
        """
        Append the raw text obtained from `df` through get_raw_from_dataframe().
        Args:
            df (pd.DataFrame): frame handed to get_raw_from_dataframe()
        """
        self.write_from_str(get_raw_from_dataframe(df))
--- a/Scripts/DataCleaning/data_output_models/debug_csv.py
+++ b/Scripts/DataCleaning/data_output_models/debug_csv.py
@@ -0,0 +1,21 @@
 import pandas as pd
 class Debug_csv():
    """
        Write the RDF rows to a CSV, as they are, for debugging.
        CSV Composition: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        The object can also be used in a `with` block.
    """
    def __init__(self, output_path:str):
        """
        Args:
            output_path (str): file that will be created (or truncated)
        """
        # pandas asks for newline="" when to_csv() receives an already open file,
        # and explicit UTF-8 keeps the output independent from the platform default
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header (written by pandas, so that it uses
        # the same line terminator as the rows appended by write())
        header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        pd.DataFrame(columns=header).to_csv(self.output, index=False)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False
    def close(self):
        """Close the underlying file."""
        self.output.close()
    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of RDF. Values are written by position, without index and header.
        Args:
            RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
+++ b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
@@ -0,0 +1,26 @@
 import pandas as pd
 class RDF_completation_task_dataset():
    """
        Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
        Each RDF is saved as str
        CSV Composition: ["MovieID","RDF"]
        The object can also be used in a `with` block.
    """
    def __init__(self, output_path:str):
        """
        Args:
            output_path (str): file that will be created (or truncated)
        """
        # pandas asks for newline="" when to_csv() receives an already open file,
        # and explicit UTF-8 keeps the output independent from the platform default
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header (written by pandas, so that it uses
        # the same line terminator as the rows appended by write())
        header = ["MovieID","RDF"]
        pd.DataFrame(columns=header).to_csv(self.output, index=False)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False
    def close(self):
        """Close the underlying file."""
        self.output.close()
    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of RDF. Values are written by position, without index and header.
        Args:
            RDF (pd.DataFrame): ["MovieID","RDF"]
        """
        RDF.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
+++ b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
@@ -0,0 +1,58 @@
 import pandas as pd
 # do not worry about circular dependencies, this class will never call something else
 from Scripts.DataCleaning.legacy.filter import PipelineApplier
 class RDF_mask_task_dataset():
    """
        Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
        The CSV is like: for each RDF there will be 3 rows, where every time one of the components is missing.
        CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
        The object can also be used in a `with` block.
    """
    def __init__(self, output_path:str):
        """
        Args:
            output_path (str): file that will be created (or truncated)
        """
        # these methods will only be used by this class, but they belong in a lower level
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
        # pandas asks for newline="" when to_csv() receives an already open file,
        # and explicit UTF-8 keeps the output independent from the platform default
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header (written by pandas, so that it uses
        # the same line terminator as the rows appended by write())
        header = ["MovieID","IncompleteRDF","Missing","RDF"]
        pd.DataFrame(columns=header).to_csv(self.output, index=False)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False
    def close(self):
        """Close the underlying file."""
        self.output.close()
    def write(self, RDF: pd.DataFrame):
        """
        Append three rows for every row of RDF: one with the subject missing,
        one with the relationship missing and one with the object missing.
        All the "subject" rows come first, then the "relationship" ones, then the "object" ones.
        Args:
            RDF (pd.DataFrame): needs at least ["MovieID","SubjectURI","RelationshipURI","ObjectURI"]
        """
        rdf_complete = self._build_triple(RDF)
        frames = []
        for missing_column in ("SubjectURI", "RelationshipURI", "ObjectURI"):
            # the triple is built from a copy that lacks the column to hide
            incomplete = self._build_incomplete_triple(RDF.drop(columns=[missing_column]))
            frames.append(pd.DataFrame({
                "MovieID": RDF["MovieID"],
                "IncompleteRDF": incomplete,
                "Missing": RDF[missing_column],
                "RDF": rdf_complete,
            }))
        output_df = pd.concat(frames, ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
+++ b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
@@ -0,0 +1,26 @@
 import pandas as pd
 class RDF_text_task_dataset():
    """
        Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and reverse.
        In the CSV the RDFs will be saved together as a string.
        CSV Composition: ["MovieID","RDFs","Abstract"]
        The object can also be used in a `with` block.
    """
    def __init__(self, output_path:str):
        """
        Args:
            output_path (str): file that will be created (or truncated)
        """
        # pandas asks for newline="" when to_csv() receives an already open file,
        # and explicit UTF-8 keeps the output independent from the platform default
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header (written by pandas, so that it uses
        # the same line terminator as the rows appended by write())
        header = ["MovieID","RDFs","Abstract"]
        pd.DataFrame(columns=header).to_csv(self.output, index=False)
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False
    def close(self):
        """Close the underlying file."""
        self.output.close()
    def write(self, RDF: pd.DataFrame):
        """
        Append the rows of RDF. Values are written by position, without index and header,
        so the second column ends up under "RDFs" whatever its name in the frame is.
        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
--- a/Scripts/DataCleaning/hold_out/divide.py
+++ b/Scripts/DataCleaning/hold_out/divide.py
@@ -0,0 +1,29 @@
 import pandas as pd
 def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """
    Shuffle the rows of a CSV and divide them into train / validation / test frames.
    The three shares are relative weights: they are normalised by their sum,
    so (70, 15, 15) and (14, 3, 3) give the same split.
    Args:
        csv_path: path of the CSV to read
        train: weight of the train split
        val: weight of the validation split
        test: weight of the test split
        seed: random_state used for the shuffle, fixed for reproducibility
    Returns:
        tuple: (train_df, val_df, test_df), each with a fresh 0..n-1 index
    Raises:
        ValueError: if a weight is negative or all the weights are zero
    """
    shares = (train, val, test)
    # negative weights would silently produce meaningless slices
    if any(share < 0 for share in shares):
        raise ValueError("train, val and test must be non-negative")
    # the inputs do not have to sum to 100, only to something positive
    total = sum(shares)
    if total <= 0:
        raise ValueError("train, val and test must not all be zero")
    # 1) Read and shuffle rows with a fixed seed for reproducibility
    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)
    # 2) Turn the inputs into row counts, proportionally to their sum
    n = len(df)
    n_train = int(n * train / total)   # floor to keep indices integral
    n_val   = int(n * val   / total)
    # 3) Give the remainder to test to ensure every row is assigned
    #    (this naturally absorbs any rounding loss)
    val_end = n_train + n_val
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df   = df.iloc[n_train:val_end].reset_index(drop=True)
    test_df  = df.iloc[val_end:].reset_index(drop=True)
    return train_df, val_df, test_df
 # usage: read the whole dataset and write the three hold-out splits next to it
 DATASET =       "Assets/Dataset/Tmp/rdf_text.csv"
 TRAIN =         "Assets/Dataset/Tmp/hold_out/train.csv"
 TEST =          "Assets/Dataset/Tmp/hold_out/test.csv"
 EVALUATION =    "Assets/Dataset/Tmp/hold_out/evaluation.csv"
 train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)
 # index=False: the splits keep exactly the columns of DATASET, without the
 # extra unnamed index column that to_csv() writes by default
 train_df.to_csv(TRAIN, index=False)
 val_df.to_csv(EVALUATION, index=False)
 test_df.to_csv(TEST, index=False)
--- a/Scripts/DataCleaning/legacy/deprecated.py
+++ b/Scripts/DataCleaning/legacy/deprecated.py
@@ -0,0 +1,381 @@
 # This file removes unwanted relationships from the pipeline according to different rules
 # -----------------------------------------------------------------------------
 # SQL-FIRST VERSION
 # -----------------------------------------------------------------------------
 # In the original (pandas) version this module:
 #   - stored frequency filters in DataFrames,
 #   - filtered/cleaned DataFrames in-memory,
 #   - added special tokens via string ops,
 #   - rebuilt one row per movie using groupby/aggregation.
 #
 # In this rewrite:
 #   - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
 #   - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
 #     composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
 #   - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
 #
 # Notes:
 #   - We keep the same CLASS and METHOD NAMES to preserve call sites.
 #   - Method comments/docstrings from your original file are carried over and updated
 #     to reflect Select-based behavior and return types.
 #   - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
 #   - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
 #     swap with an equivalent string-agg function.
 # -----------------------------------------------------------------------------
 from __future__ import annotations
 from typing import Optional
 from sqlalchemy import select, func, literal
 from sqlalchemy.sql import Select
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 class PipelineApplier():
    """
    SQL-first pipeline applier.
    In the pandas version, frequency filters were stored as DataFrames (self.MOVIE_FILTER / self.REL_FILTER)
    and every method worked with/returned pandas.DataFrame. In this SQLAlchemy rewrite:
      - self.MOVIE_FILTER and self.REL_FILTER become *subselects* (Select objects) that yield a single
        column each (MovieID or RelationshipURI). These subselects can be applied via `WHERE IN (subquery)`.
      - Every method that previously returned a DataFrame now returns a *Select* that represents the same
        logical transformation, but pushed into the database engine.
      - Comments and docstrings are updated to reflect SQL semantics while preserving your original intent.
    """
    def __init__(self):
        # In the pandas version these were DataFrames storing allowed keys.
        # Here they are Select objects (single-column subselects) or None.
        # Expected column names:
        #   - self.MOVIE_FILTER:      "MovieID"
        #   - self.REL_FILTER:        "RelationshipURI"
        self.MOVIE_FILTER: Optional[Select] = None
        self.REL_FILTER: Optional[Select] = None
    # -------------------------------------------------------------------------
    # Relationship deletion
    # -------------------------------------------------------------------------
    def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
        """
        Exclude every row whose RelationshipURI is equal to `uri`.
        Nothing is executed here: the method only adds the condition
        RelationshipURI != <uri> to the WHERE clause of the received Select.
        Args:
            RDF (Select): selectable exposing a RelationshipURI column
            uri (str): RelationshipURI to exclude
        Returns:
            Select: RDF with the additional WHERE condition
        """
        relationship_column = RDF.selected_columns.RelationshipURI
        unwanted = literal(uri)
        return RDF.where(relationship_column != unwanted)
    # -------------------------------------------------------------------------
    # Frequency filter: MOVIE
    # -------------------------------------------------------------------------
    def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering by movie frequency [filter_by_frequency_movie_id()],
        since this method creates such filter.
        The filter is a single-column Select (MovieID) keeping the movies whose
        Count is in [min_treshold, max_treshold). It is stored in self.MOVIE_FILTER
        and meant to be used as `WHERE MovieID IN (<filter>)`.
        No query is executed here; we only create a new Select.
        Args:
            MOVIE_COUNT (Select): yields columns MovieID, Count
            min_treshold (int): lower bound, included
            max_treshold (int): upper bound, excluded
        """
        # Filter over MOVIE_COUNT taken as a subquery: re-selecting a column taken
        # from `selected_columns` would start a brand new SELECT and lose the WHERE
        # conditions, and a WHERE placed directly on MOVIE_COUNT is not valid when
        # Count is an aggregate.
        counts = MOVIE_COUNT.subquery()
        self.MOVIE_FILTER = (
            select(counts.c.MovieID)
            .where(counts.c.Count >= min_treshold)
            .where(counts.c.Count < max_treshold)
        )
    # -------------------------------------------------------------------------
    # Frequency filter: RELATIONSHIP
    # -------------------------------------------------------------------------
    def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering by relationship frequency
        [filter_by_frequency_relationship()], since this method creates such filter.
        The filter is a single-column Select (RelationshipURI) keeping the relationships
        whose Count is in [min_treshold, max_treshold). It is stored in self.REL_FILTER
        and meant to be used as `WHERE RelationshipURI IN (<filter>)`.
        No query is executed here; we only create a new Select.
        Args:
            REL_COUNT (Select): yields columns RelationshipURI, Count
            min_treshold (int): lower bound, included
            max_treshold (int): upper bound, excluded
        """
        # Filter over REL_COUNT taken as a subquery: re-selecting a column taken
        # from `selected_columns` would start a brand new SELECT and lose the WHERE
        # conditions, and a WHERE placed directly on REL_COUNT is not valid when
        # Count is an aggregate.
        counts = REL_COUNT.subquery()
        self.REL_FILTER = (
            select(counts.c.RelationshipURI)
            .where(counts.c.Count >= min_treshold)
            .where(counts.c.Count < max_treshold)
        )
    # -------------------------------------------------------------------------
    # Apply frequency filters
    # -------------------------------------------------------------------------
    def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
        """
        Keep only the rows whose MovieID is yielded by self.MOVIE_FILTER.
        The condition is `WHERE MovieID IN ( <subselect> )`; when no movie filter
        has been stored, RDF is handed back untouched.
        Args:
            RDF (Select): current dataset
        Returns:
            Select: filtered dataset (or unchanged if no filter exists)
        """
        movie_filter = self.MOVIE_FILTER
        if movie_filter is not None:
            movie_id = RDF.selected_columns.MovieID
            return RDF.where(movie_id.in_(movie_filter))
        return RDF
    def filter_by_frequency_relationship(self, RDF: Select) -> Select:
        """
        Keep only the rows whose RelationshipURI is yielded by self.REL_FILTER.
        The condition is `WHERE RelationshipURI IN ( <subselect> )`; when no
        relationship filter has been stored, RDF is handed back untouched.
        Args:
            RDF (Select): current dataset
        Returns:
            Select: filtered dataset (or unchanged if no filter exists)
        """
        relationship_filter = self.REL_FILTER
        if relationship_filter is not None:
            relationship = RDF.selected_columns.RelationshipURI
            return RDF.where(relationship.in_(relationship_filter))
        return RDF
    # -------------------------------------------------------------------------
    # Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
    # -------------------------------------------------------------------------
    def rdf_add_special_token(self, RDF: Select) -> Select:
        """
        Prefix the three components of the RDF with their special token:
        SUBJECT for SubjectURI, RELATIONSHIP for RelationshipURI, OBJECT for ObjectURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
        MovieID and Abstract are projected as they are; no other special token is added.
        The concatenation is expressed in SQL, nothing is executed here.
        Args:
            RDF (Select): current dataset with columns
                          MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        Returns:
            Select: projection with the same five column names
        """
        columns = RDF.selected_columns
        def prefixed(token, column, name):
            # <token> followed by the column value, exposed under the original name
            return (literal(token.value) + column).label(name)
        return RDF.with_only_columns(
            columns.MovieID.label("MovieID"),
            prefixed(SpecialToken.SUBJECT, columns.SubjectURI, "SubjectURI"),
            prefixed(SpecialToken.RELATIONSHIP, columns.RelationshipURI, "RelationshipURI"),
            prefixed(SpecialToken.OBJECT, columns.ObjectURI, "ObjectURI"),
            columns.Abstract.label("Abstract"),
        )
    # -------------------------------------------------------------------------
    # NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
    # -------------------------------------------------------------------------
    def drop_na_from_dataset(self, RDF: Select) -> Select:
        """
        Drop the rows where SubjectURI, RelationshipURI or ObjectURI is NULL or
        an empty string. The checks are WHERE conditions, all of them ANDed.
        Args:
            RDF (Select): current dataset
        Returns:
            Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
        """
        columns = RDF.selected_columns
        filtered = RDF
        # successive where() calls are joined with AND
        for column in (columns.SubjectURI, columns.RelationshipURI, columns.ObjectURI):
            filtered = filtered.where(column.is_not(None)).where(column != "")
        return filtered
    # -------------------------------------------------------------------------
    # Rebuild by movie (one row per movie)
    # -------------------------------------------------------------------------
    def rebuild_by_movie(self, RDF: Select) -> Select:
        """
        Aggregate the dataset so that every (MovieID, Abstract) pair gives one row.
        Steps, all expressed in SQL (nothing is executed here):
            - each row becomes START_TRIPLE + SubjectURI + RelationshipURI + ObjectURI + END_TRIPLE
              and its Abstract is prefixed with ABSTRACT;
            - the triples of a group are joined by GROUP_CONCAT with an empty separator
              (SQLite syntax; adjust for other DBs);
            - the joined string is prefixed with START_TRIPLE_LIST.
        Args:
            RDF (Select): current dataset with columns
                          MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        Returns:
            Select: columns ["MovieID","Triple","Abstract"]
        """
        source = RDF.selected_columns
        start_triple = literal(SpecialToken.START_TRIPLE.value)
        end_triple = literal(SpecialToken.END_TRIPLE.value)
        components = source.SubjectURI + source.RelationshipURI + source.ObjectURI
        # One row per triple, already wrapped and with the prefixed abstract
        per_row = RDF.with_only_columns(
            source.MovieID.label("MovieID"),
            (start_triple + components + end_triple).label("Triple"),
            (literal(SpecialToken.ABSTRACT.value) + source.Abstract).label("Abstract"),
        ).subquery()
        movie_id = per_row.c.MovieID
        abstract = per_row.c.Abstract
        # All the triples of the group, one after the other
        joined_triples = func.group_concat(per_row.c.Triple, literal(""))
        triple_list = (literal(SpecialToken.START_TRIPLE_LIST.value) + joined_triples).label("Triple")
        aggregated = select(
            movie_id.label("MovieID"),
            triple_list,
            abstract.label("Abstract"),
        )
        return aggregated.group_by(movie_id, abstract)
    # -------------------------------------------------------------------------
    # Build triple(s) projection
    # -------------------------------------------------------------------------
    @staticmethod
    def build_triple(RDF: Select) -> Select:
        """
        Project RDF onto a single column "Triple":
        START_TRIPLE + SubjectURI + RelationshipURI + ObjectURI + END_TRIPLE.
        The string is built in SQL, nothing is executed here.
        Args:
            RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            Select: a projection containing one column named "Triple"
        """
        columns = RDF.selected_columns
        opening = literal(SpecialToken.START_TRIPLE.value)
        closing = literal(SpecialToken.END_TRIPLE.value)
        components = columns.SubjectURI + columns.RelationshipURI + columns.ObjectURI
        wrapped = opening + components + closing
        return RDF.with_only_columns(wrapped.label("Triple"))
    @staticmethod
    def build_incomplete_triple(RDF: Select) -> Select:
        """
        Method helper used for the third task: "Predicting a masked component within an RDF triple".
        Obtains joined RDF triple in one element, together with START and END special tokens.
        The MISSING element will be replaced by the special token <MASK>:
            - a component whose column is not selected by RDF becomes <MASK>;
            - a component whose column is selected goes through COALESCE, so a NULL
              value becomes <MASK> as well.
        The string is built in SQL, nothing is executed here.
        Args:
            RDF (Select): 2 of the following columns present ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            Select: projection with column "Triple"
        """
        sc = RDF.selected_columns
        mask = literal(SpecialToken.MASK.value)
        available = set(sc.keys())
        def component(name):
            # attribute access on a column that is not selected raises AttributeError,
            # so the absent component has to be detected before touching it
            if name not in available:
                return mask
            return func.coalesce(sc[name], mask)
        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (component("SubjectURI") +
             component("RelationshipURI") +
             component("ObjectURI")) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)
    @staticmethod
    def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
        """
        Currently not used.
        Original intention:
            Given two DataFrames (one incomplete RDF and another with just the missing component),
            apply special tokens accordingly.
        Updated note:
            This stub remains for API parity. If needed in the future, it can be implemented
            as a Select-building helper that merges/COALESCEs columns from different selects.
        """
        return None
--- a/Scripts/DataCleaning/legacy/fast_filter.py
+++ b/Scripts/DataCleaning/legacy/fast_filter.py
@@ -0,0 +1,148 @@
 # This file removes unwanted relationships from the pipeline according to different rules
 import pandas as pd
 import sqlite3  # kept for compatibility
 import numpy as np
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 class PipelineApplier:
    def __init__(self):
        # Fast internal caches for O(1) membership checks
        self._MOVIE_FILTER_SET = set()
        self._REL_FILTER_SET = set()
    # ------------------------------
    # Filters
    # ------------------------------
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        # Vectorized boolean mask
        return RDF.loc[RDF["RelationshipURI"] != uri]
    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filter the dataset by movie frequency [filter_by_frequency_movie_id()],
        since this method creates such filter.
        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
        """
        sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
        self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())
    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
        self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())
    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin is the fastest path
        return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]
    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]
    # ------------------------------
    # Cleaning & preprocessing
    # ------------------------------
    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Prefix SubjectURI / RelationshipURI / ObjectURI with their special token
        (SUBJECT, RELATIONSHIP and OBJECT respectively).
        The caller's object is not modified: the result comes from RDF.assign().
        """
        tokens = (
            ("SubjectURI", SpecialToken.SUBJECT.value),
            ("RelationshipURI", SpecialToken.RELATIONSHIP.value),
            ("ObjectURI", SpecialToken.OBJECT.value),
        )
        prefixed = {}
        for column, token in tokens:
            values = RDF[column].to_numpy(dtype=object)
            prefixed[column] = np.char.add(token, values)
        return RDF.assign(**prefixed)
    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Replace '' with NaN only on key columns, then drop rows missing any of them.
        """
        cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
        rdf = RDF.copy()
        for c in cols:
            m = rdf[c] == ""
            if m.any():
                rdf.loc[m, c] = np.nan
        return rdf.dropna(subset=cols)
    # ------------------------------
    # Building triples
    # ------------------------------
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Join SubjectURI, RelationshipURI and ObjectURI of every row in one string,
        wrapped by the START_TRIPLE and END_TRIPLE special tokens.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        subject = RDF["SubjectURI"].to_numpy(dtype=object)
        relationship = RDF["RelationshipURI"].to_numpy(dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object)
        # <start><subject> on one side, <relationship><object> on the other, then <end>
        head = np.char.add(SpecialToken.START_TRIPLE.value, subject)
        tail = np.char.add(relationship, obj)
        joined = np.char.add(np.char.add(head, tail), SpecialToken.END_TRIPLE.value)
        RDF["Triple"] = pd.Series(joined, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]
    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Works like build_triple(), but every component among
        ["SubjectURI","RelationshipURI","ObjectURI"] whose column is not in RDF
        is replaced by <MASK> (typically 2 of the 3 columns are present).
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        mask_token = SpecialToken.MASK.value
        row_count = len(RDF.index)
        def component(name):
            # values of the column when present, otherwise one <MASK> per row
            if name in RDF:
                return RDF[name].to_numpy(dtype=object)
            return np.full(row_count, mask_token, dtype=object)
        subject = component("SubjectURI")
        relationship = component("RelationshipURI")
        obj = component("ObjectURI")
        head = np.char.add(SpecialToken.START_TRIPLE.value, subject)
        tail = np.char.add(relationship, obj)
        joined = np.char.add(np.char.add(head, tail), SpecialToken.END_TRIPLE.value)
        RDF["Triple"] = pd.Series(joined, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]
    def rebuild_by_movie(self, RDF: pd.DataFrame):
        """
        Collapse triples + abstract into a single row per (MovieID, Abstract) pair.
        The triples of a pair are concatenated in row order and prefixed with
        START_TRIPLE_LIST; the abstract is prefixed with ABSTRACT.
        Rows with a missing MovieID or Abstract are grouped under the missing value
        itself (dropna=False) instead of being attached to another movie/abstract.
        Side-effect: build_triple() sets RDF["Triple"].
        Args:
            RDF (pd.DataFrame): needs ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        Returns: ["MovieID","Triple","Abstract"]
        """
        # Build triples once (vectorized); method also sets RDF["Triple"]
        triples = self.build_triple(RDF)
        # Minimal frame for grouping (avoid carrying extra columns)
        tmp = pd.DataFrame({
            "MovieID":  RDF["MovieID"].to_numpy(),
            "Abstract": RDF["Abstract"].to_numpy(),
            "Triple":   triples.to_numpy(),
        })
        # Group directly on the keys: mapping factorize() codes back by hand sent
        # the NaN code (-1) to the LAST unique label. sort=False keeps the groups
        # in order of first appearance; sum concatenates strings for object dtype.
        grouped = tmp.groupby(
            ["MovieID", "Abstract"], sort=False, as_index=False, dropna=False
        )["Triple"].sum()
        # Final tokens
        grouped["Triple"]   = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
        grouped["Abstract"] = SpecialToken.ABSTRACT.value         + grouped["Abstract"]
        return grouped[["MovieID", "Triple", "Abstract"]]
--- a/Scripts/DataCleaning/legacy/filter.py
+++ b/Scripts/DataCleaning/legacy/filter.py
@@ -0,0 +1,191 @@
 # Pipeline helpers that remove the unwanted relationships according to different rules
 import pandas as pd
 import sqlite3
 import numpy as np
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 class PipelineApplier():
    """
    Legacy cleaning helpers: relationship / movie filters and builders of the
    triples decorated with the special tokens.
    Unless stated otherwise the methods return a new pandas object and leave
    the DataFrame received as argument untouched.
    """
    def __init__(self):
        # Frequency filters, empty until the generate_frequency_* methods are called
        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()
        # Banned RelationshipURI values: empty (nothing is banned) until
        # generate_list_relationship_filter is called
        self.relationship_filter_list: set[str] = set()
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Return the rows of RDF whose RelationshipURI differs from uri."""
        return RDF[RDF["RelationshipURI"] != uri]
    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store the RelationshipURI values to ban as a set."""
        self.relationship_filter_list = set(filter_list)
    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Remove the rows whose RelationshipURI is in the stored ban list.
        Call generate_list_relationship_filter first; with the initial empty
        list every row is kept.
        """
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
    # def filter_movie_by_rel_uri_frequence()
    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency
        [filter_by_frequency_movie_id()], since this method creates that filter.
        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int): lowest Count kept (inclusive)
            max_treshold (int): upper bound of Count (exclusive)
        """
        counts = MOVIE_COUNT["Count"]
        self.MOVIE_FILTER = MOVIE_COUNT[(counts >= min_treshold) & (counts < max_treshold)]
    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
        """
        Same as generate_frequency_movie_filter, for relationships.
        Args:
            REL_COUNT (pd.DataFrame): needs "Count"; "RelationshipURI" is read later
                by filter_by_frequency_relationship
            min_treshold (int): lowest Count kept (inclusive)
            max_treshold (int): upper bound of Count (exclusive)
        """
        counts = REL_COUNT["Count"]
        self.REL_FILTER = REL_COUNT[(counts >= min_treshold) & (counts < max_treshold)]
    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows whose MovieID appears in the generated movie filter."""
        return RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep the rows whose RelationshipURI appears in the generated relationship filter."""
        return RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Prefix each element of the triple with its special token: SUBJECT to
        SubjectURI, OBJECT to ObjectURI, RELATIONSHIP to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date tokens.
        No other special token is added.
        Args:
            RDF (pd.DataFrame): needs ["SubjectURI","RelationshipURI","ObjectURI"]
        Returns:
            pd.DataFrame: a copy of RDF with the three columns prefixed
        """
        # The filters that ran before may have handed over a slice: working on a
        # copy avoids SettingWithCopyWarning and keeps the input untouched
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF
    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop the rows where SubjectURI, RelationshipURI or ObjectURI is empty ('') or missing."""
        # Empty strings become NaN so that dropna catches them too
        RDF = RDF.replace('', np.nan)
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF
    @staticmethod
    def _compose_triple(RDF: pd.DataFrame) -> pd.Series:
        """Return START_TRIPLE + subject + relationship + object + END_TRIPLE per row; RDF is not modified."""
        return (SpecialToken.START_TRIPLE.value
                + RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
                + SpecialToken.END_TRIPLE.value)
    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Collapse the rows of each movie into one row holding all its triples.
        The input frame is not modified.
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # By design the result has one row per movie: MovieID and Abstract are 1 <-> 1
        per_triple = RDF[["MovieID", "Abstract"]].copy()
        per_triple["Triple"] = self._compose_triple(RDF)
        return self.group_by_movie_from_triple(per_triple)
    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Join the already built triples of each movie and add the list / abstract tokens.
        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # combine rows into one
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # special tokens: start of the triple list and start of the abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID","Triple","Abstract"]]
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Join the three elements of each RDF row in one string, wrapped by the
        START_TRIPLE and END_TRIPLE special tokens.
        The result is also stored in RDF["Triple"] (historical behaviour, kept
        for the callers that may rely on it).
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: RDF["Triple"] (just this column)
        """
        RDF["Triple"] = PipelineApplier._compose_triple(RDF)
        return RDF["Triple"]
    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper for the third task: "Predicting a masked component within an RDF triple".
        Join the RDF elements in one string, wrapped by the START_TRIPLE and
        END_TRIPLE special tokens; every column missing from RDF is replaced by
        the MASK special token.
        The result is also stored in RDF["Triple"].
        Args:
            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: RDF["Triple"] (just this column, NOT A DATAFRAME)
        """
        # One MASK token per row, aligned on the index of RDF, used in place of
        # whichever column is absent
        MISSING = pd.Series(SpecialToken.MASK.value, index=RDF.index)
        RDF["Triple"] = (
                    RDF.get("SubjectURI", MISSING) +
                    RDF.get("RelationshipURI", MISSING) +
                    RDF.get("ObjectURI", MISSING))
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]
    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        # currently not used
        """
        Helper for the third task: "Predicting a masked component within an RDF triple".
        Meant to apply the special tokens given the incomplete RDF and the frame
        with the missing component. NOT IMPLEMENTED: it always returns None.
        Args:
            RDF_incomplete (pd.DataFrame): the incomplete RDF (ignored for now)
            MISSING (pd.DataFrame): the missing component (ignored for now)
        Returns:
            None
        """
        return None
    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Clean ObjectURI: newlines become ", " and asterisks are removed.
        Returns a cleaned copy; the input frame (possibly a slice produced by the
        previous filters) is left untouched.
        """
        cleaned = RDF.copy()
        cleaned["ObjectURI"] = (cleaned["ObjectURI"].astype("string")
                   .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
                   .str.replace(r"\*", "", regex=True))        # delete all asterisks
        return cleaned
--- a/Scripts/DataCleaning/legacy/pipeline.py
+++ b/Scripts/DataCleaning/legacy/pipeline.py
@@ -0,0 +1,145 @@
 import re
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 from Scripts.DataCleaning.legacy.filter import PipelineApplier
 # tasks dataset builder
 from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
 from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
 from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
 from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
 from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
 import pandas as pd
 class Pipeline():
    """
    Legacy cleaning pipeline: reads the RDF rows movie by movie from the SQL
    endpoint, filters / cleans them and writes the datasets of the tasks.
    The output files are opened in __init__ and closed by every execute_*
    method (also when the task fails), so run one execute_* per instance.
    """
    def __init__(self):
        self.sql_endpoint = SqlEndpoint()
        # classes that manage the datasets of the tasks
        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
        # prepare the filter
        # the filter applier needs the frequency of movies and relationships over the whole dataset
        self.filter_applier = PipelineApplier()
        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
        REL_COUNT = self.sql_endpoint.get_relationship_count()
        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069
        # RelationshipURI values to delete
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
            "dbp-dbo:soundRecording"
            ]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)
    def execute_task_bpe_corpus(self):
        """Write the BPE corpus (triples + abstract of every movie), then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self.filter_applier.rebuild_by_movie(RDF)
                RDF = RDF[["Triple","Abstract"]]
                self.task_bpe_corpus.write_from_df(RDF)
        finally:
            # close the files even when a movie fails, so nothing stays open
            self._end_file_handler()
    def execute_task_rdf_mask(self):
        """Write the dataset of the mask task, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(RDF)
        finally:
            self._end_file_handler()
    def execute_tasks_rdf_text(self):
        """Write the dataset of the rdf <-> text tasks, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self.filter_applier.rebuild_by_movie(RDF)
                self.task_rdf_text.write(RDF)
        finally:
            self._end_file_handler()
    def execute_task_rdf_completation(self):
        """Write the dataset of the completion task, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF["Triple"] = self.filter_applier.build_triple(RDF)
                self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        finally:
            self._end_file_handler()
    def execute_all_task(self):
        """Write every dataset in a single pass over the movies, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(RDF)
                RDF["Triple"] = self.filter_applier.build_triple(RDF)
                self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
                RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])
                self.task_rdf_text.write(RDF)
                self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
        finally:
            self._end_file_handler()
    def _end_file_handler(self):
        """Close the four output files opened in __init__."""
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()
    def _get_cleaned_movie_rows(self):
        """Yield, movie by movie, the filtered and cleaned RDF frame; movies left empty are skipped."""
        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            RDF = self.filter_applier.drop_na_from_dataset(RDF)
            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
            # other filter
            #
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
            # regex on ObjectURI
            RDF = self.filter_applier.regex_on_objects(RDF)
            if RDF.empty:
                continue
            RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
            yield RDF
    def use_toy_dataset(self):
        """Restrict the SQL endpoint to a fixed list of ten movies."""
        # CHOSEN MOVIES:
        # The Dark Knight   : 117248
        # Inception         : 147074
        # The Avengers      : 113621
        # Cast Away         : 1123
        # The Departed      : 117586
        # American Psycho   : 90177
        # Avatar            : 71587
        # Django Unchained  : 138952
        # Spirited Away     : 144137
        # Knives Out        : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list
    def generate_csv_debug_file(self, debug_path:str):
        """Dump the cleaned rows of every movie to a CSV file at debug_path."""
        debug_csv = Debug_csv(debug_path)
        try:
            for RDF in self._get_cleaned_movie_rows():
                debug_csv.write(RDF)
        finally:
            # the debug file is closed even when a movie fails
            debug_csv.close()
 # Script entry point of the legacy pipeline.
 # NOTE: there is no __main__ guard, so these statements also run on import.
 # The settings to tune live in three places:
 #   - Pipeline.__init__: file paths, frequency filter limits, banned RelationshipURI list
 #   - Pipeline.use_toy_dataset: the movies of the toy dataset
 #   - Pipeline._get_cleaned_movie_rows: which cleaning steps the pipeline applies
 pipeline = Pipeline()
 pipeline.use_toy_dataset()
 # Uncomment the task(s) to run; each of them closes the output files when done
 # pipeline.execute_task_bpe_corpus()
 # pipeline.execute_task_rdf_mask()
 # pipeline.execute_tasks_rdf_text()
 # pipeline.execute_task_rdf_completation()
 # pipeline.execute_all_task()
 pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
--- a/Scripts/DataCleaning/path_splitter_tree.py
+++ b/Scripts/DataCleaning/path_splitter_tree.py
@@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str):
    FILE = open(file, "r", encoding="utf-8")
    # TODO: Change here so it takes single URI from a CSV file
    # It is needed the header-name
    for row in csv.DictReader(FILE):
--- a/Scripts/DataCleaning/pipeline/cleaner.py
+++ b/Scripts/DataCleaning/pipeline/cleaner.py
@@ -0,0 +1,86 @@
 # This file deletes in the pipeline the unwanted relationship by different rules
 import pandas as pd
 import sqlite3
 import numpy as np
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 class PipelineApplier():
    """
    Cleaning steps applied to the RDF rows of a movie: empty values removal,
    ObjectURI clean-up, special tokens and triple building.
    Unless stated otherwise the methods return a new pandas object and leave
    the DataFrame received as argument untouched.
    """
    def __init__(self):
        pass
    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Prefix each element of the triple with its special token: SUBJECT to
        SubjectURI, OBJECT to ObjectURI, RELATIONSHIP to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date tokens.
        No other special token is added.
        Args:
            RDF (pd.DataFrame): needs ["SubjectURI","RelationshipURI","ObjectURI"]
        Returns:
            pd.DataFrame: a copy of RDF with the three columns prefixed
        """
        # A previous step may have handed over a slice: working on a copy avoids
        # SettingWithCopyWarning and keeps the input untouched
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF
    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop the rows where SubjectURI, RelationshipURI or ObjectURI is empty ('') or missing."""
        # Empty strings become NaN so that dropna catches them too
        RDF = RDF.replace('', np.nan)
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF
    @staticmethod
    def _compose_triple(RDF: pd.DataFrame) -> pd.Series:
        """Return START_TRIPLE + subject + relationship + object + END_TRIPLE per row; RDF is not modified."""
        return (SpecialToken.START_TRIPLE.value
                + RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
                + SpecialToken.END_TRIPLE.value)
    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Collapse the rows of each movie into one row holding all its triples.
        The input frame is not modified.
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # By design the result has one row per movie: MovieID and Abstract are 1 <-> 1
        per_triple = RDF[["MovieID", "Abstract"]].copy()
        per_triple["Triple"] = self._compose_triple(RDF)
        # combine the rows of a movie into one
        grouped = per_triple.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # special tokens: start of the triple list, start of the abstract, end of sentence
        grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]+SpecialToken.END_OF_SENTENCE.value
        grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return grouped[["MovieID","Triple","Abstract"]]
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Join the three elements of each RDF row in one string, wrapped by the
        START_TRIPLE and END_TRIPLE special tokens.
        The result is also stored in RDF["Triple"] (historical behaviour, kept
        for the callers that may rely on it).
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: RDF["Triple"] (just this column)
        """
        RDF["Triple"] = PipelineApplier._compose_triple(RDF)
        return RDF["Triple"]
    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Clean ObjectURI: newlines become ", " and asterisks are removed.
        Returns a cleaned copy; the input frame is left untouched.
        """
        cleaned = RDF.copy()
        cleaned["ObjectURI"] = (cleaned["ObjectURI"].astype("string")
                   .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
                   .str.replace(r"\*", "", regex=True))        # delete all asterisks
        return cleaned
--- a/Scripts/DataCleaning/pipeline/movie_filter.py
+++ b/Scripts/DataCleaning/pipeline/movie_filter.py
@@ -0,0 +1,103 @@
 import pandas as pd
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 class MovieFilter:
    """
    Progressive filter over the movies of the dataset.
    MOVIE_FILTER is a DataFrame with a "MovieID" column. Every filter method
    queries the database restricted to the ids currently in MOVIE_FILTER and
    stores the surviving ids back, so the filters can be chained.
    Every value (ids, URIs, thresholds) is bound as an SQL parameter instead of
    being formatted into the query text.
    NOTE(review): one "?" is bound per movie id; SQLite caps the number of host
    parameters of a statement, so confirm the limit of the SQLite build in use.
    """
    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # start from every movie id
        movie_query = "SELECT MovieID FROM Movies"
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)
    @staticmethod
    def _placeholders(count: int) -> str:
        """Return count comma-separated "?" placeholders for an SQL IN (...) list."""
        return ",".join(["?"] * count)
    def frequency_filter(self, min_treshold:int, max_treshold:int):
        """
        Keep the movies whose number of RDF rows is between min_treshold and
        max_treshold (both inclusive).
        """
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        filter_query = f"""
            SELECT MovieID
            FROM RDFs
            WHERE MovieID IN ({self._placeholders(len(movie_ids))})
            GROUP BY MovieID
            HAVING COUNT(*) BETWEEN ? AND ?;
        """
        # parameters follow the order of the "?" in the query text
        params = tuple(movie_ids) + (int(min_treshold), int(max_treshold))
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
    def get_movie_id(self):
        """Return the DataFrame ["MovieID"] of the movies that passed the filters applied so far."""
        return self.MOVIE_FILTER
    def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
        """
        Keep the movies that have between min_treshold and max_treshold (both
        inclusive) RDF rows whose RelationshipURI equals parsed_rel_uri.
        """
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        filter_query = f"""
            SELECT MovieID
            FROM RDFs
            JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
            WHERE MovieID IN ({self._placeholders(len(movie_ids))})
            GROUP BY MovieID
            HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = ? THEN 1 ELSE 0 END)
                BETWEEN ? AND ?;
        """
        params = tuple(movie_ids) + (parsed_rel_uri, int(min_treshold), int(max_treshold))
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
    def filter_by_director(self):
        """Keep the movies that have at least one director relationship."""
        director_list = ['dbp-dbo:director','dbp-dbp:director']
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        filter_query = f"""
            SELECT DISTINCT RDFs.MovieID
            FROM RDFs
            JOIN ParsedRelationships USING (RelationshipID)
            WHERE RDFs.MovieID IN ({self._placeholders(len(movie_ids))})
            AND ParsedRelationships.RelationshipURI IN ({self._placeholders(len(director_list))});
        """
        params = tuple(movie_ids) + tuple(director_list)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
    def filter_by_english_movies(self):
        """
        Keep the movies whose "dbp-dbp:language" objects are all English: at
        least one language row in objects_list and none outside of it.
        (A looser variant, keeping any movie with at least one English language
        row, used to be built here but was never executed.)
        """
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        relationship = "dbp-dbp:language"
        objects_list = ["English", "dbp-dbr:English_language"]
        objects_placeholder = self._placeholders(len(objects_list))
        filter_query = f"""
            SELECT RDFs.MovieID
            FROM RDFs
            INNER JOIN ParsedRelationships USING (RelationshipID)
            INNER JOIN ParsedObjects USING (ObjectID)
            WHERE RDFs.MovieID IN ({self._placeholders(len(movie_ids))})
            AND ParsedRelationships.RelationshipURI IN (?)
            GROUP BY RDFs.MovieID
            HAVING
            SUM(CASE WHEN ParsedObjects.ObjectURI IN ({objects_placeholder}) THEN 1 ELSE 0 END) >= 1
            AND
            SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN ({objects_placeholder}) THEN 1 ELSE 0 END) = 0;
        """
        # objects_list is bound twice: once for IN and once for NOT IN
        params = tuple(movie_ids) + (relationship,) + tuple(objects_list) * 2
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
 # movie_filter = MovieFilter()
 # movie_filter.frequency_filter(5,10)
--- a/Scripts/DataCleaning/pipeline/pipeline.py
+++ b/Scripts/DataCleaning/pipeline/pipeline.py
@@ -0,0 +1,155 @@
 from movie_filter import MovieFilter
 from relationship_filter import RelationshipFilter
 from rdf_filter import RdfFilter
 from cleaner import PipelineApplier
 from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
 from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
 from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
 from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
 import pandas as pd
 # RelationshipURI values removed from the dataset (ban list).
 # Keep a comma after EVERY entry: two adjacent string literals are silently
 # concatenated by Python into one (wrong) URI.
 RELATIONSHIP_FILTER_LIST = [
            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type", 
            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
            "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format", 
            "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
            "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
            "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle", 
            "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text", 
            "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
            "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point", 
            "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt", 
            "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
            "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
            "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
            "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
            "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
            "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list", 
            "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
            "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
            "dbp-dbp:website"
            ]
 # RelationshipURI values of the white list (currently only referenced by a
 # commented-out call in Pipeline._get_cleaned_movie_rows)
 RELATIONSHIP_WHITE_LIST = [
    "dbp-dbp:director",
    "dbp-dbo:starring",
    "dbp-dbo:writer",
    "dbp-dbp:name",
    "dbp-dbp:genre",
    "purl:dc/terms/subject",
 ]
 # Debug query kept for reference (this module never executes it):
 #   SELECT DISTINCT field3
 #   FROM debug
 class Pipeline():
    """
    Cleaning pipeline: selects movies and relationships through the SQL
    filters, cleans the RDF rows movie by movie and writes the datasets of the tasks.
    The output files are opened in __init__ and closed by every execute_*
    method (also when the task fails), so run one execute_* per instance.
    """
    def __init__(self) -> None:
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
        # baseline filters, always applied
        self._movie_filter.frequency_filter(50,3000)
        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
    def other_filter(self):
        """Apply the optional, stricter movie filters (subject count, director, language, budget, release)."""
        self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
        self._movie_filter.filter_by_director()
        self._movie_filter.filter_by_english_movies()
        self._movie_filter.relation_filter("dbp-dbp:budget",1,100)      # the most important films have the budget relationship
        self._movie_filter.relation_filter("dbp-dbp:released",1,100)    # to cut to 2000 :(
    def _get_cleaned_movie_rows(self):
        """Yield, movie by movie, the cleaned RDF frame of the selected movies; empty frames are skipped."""
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()
        # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)
        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
            RDF = self._pipeline.drop_na_from_dataset(RDF)
            RDF = self._pipeline.regex_on_objects(RDF)
            RDF = self._pipeline.rdf_add_special_token(RDF)
            if RDF.empty:
                continue
            yield RDF
    def execute_task_bpe_corpus(self):
        """Write the BPE corpus (triples + abstract of every movie), then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self._pipeline.rebuild_by_movie(RDF)
                RDF = RDF[["Triple","Abstract"]]
                self.task_bpe_corpus.write_from_df(RDF)
        finally:
            # close the files even when a movie fails, so nothing stays open
            self._end_file_handler()
    def execute_tasks_rdf_text(self):
        """Write the dataset of the rdf <-> text tasks, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self._pipeline.rebuild_by_movie(RDF)
                self.task_rdf_text.write(RDF)
        finally:
            self._end_file_handler()
    def execute_task_rdf_completation(self):
        """Write the dataset of the completion task, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF["Triple"] = self._pipeline.build_triple(RDF)
                self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        finally:
            self._end_file_handler()
    def _end_file_handler(self):
        """Close the three output files opened in __init__."""
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()
    def execute_all_task(self):
        """Write every dataset in a single pass over the movies, then close the output files."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                # build_triple writes into the frame it receives: give it its own copy
                completation_RDF = RDF.copy()
                completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
                self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])
                RDF = self._pipeline.rebuild_by_movie(RDF)
                self.task_rdf_text.write(RDF)
                self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
        finally:
            self._end_file_handler()
    def use_toy_dataset(self):
        """Replace the movie selection with a fixed list of ten movies."""
        # CHOSEN MOVIES:
        # The Dark Knight   : 117248
        # Inception         : 147074
        # The Avengers      : 113621
        # Cast Away         : 1123
        # The Departed      : 117586
        # American Psycho   : 90177
        # Avatar            : 71587
        # Django Unchained  : 138952
        # Spirited Away     : 144137
        # Knives Out        : 148025
        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
    def generate_csv_debug_file(self, debug_path:str):
        """Dump the cleaned rows of every selected movie to a CSV file at debug_path."""
        debug_csv = Debug_csv(debug_path)
        try:
            for RDF in self._get_cleaned_movie_rows():
                debug_csv.write(RDF)
        finally:
            # the debug file is closed even when a movie fails
            debug_csv.close()
 # Script entry point.
 # NOTE: there is no __main__ guard, so these statements also run on import.
 pipe = Pipeline()
 # Uncomment to restrict the run to the ten movies of the toy dataset
 #pipe.use_toy_dataset()
 # Apply the optional, stricter movie filters on top of the ones set in __init__
 pipe.other_filter()
 # pipe.execute_all_task()
 # Dump the cleaned rows to a CSV file for inspection
 pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
--- a/Scripts/DataCleaning/pipeline/rdf_filter.py
+++ b/Scripts/DataCleaning/pipeline/rdf_filter.py
@@ -0,0 +1,32 @@
 import pandas as pd
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 class RdfFilter:
    """Reads from the database the abbreviated RDF rows of the selected movies."""
    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
    # Idea, not implemented (delete_hyperum_when_movie):
        # purl:linguistics/gold/hypernym
        # is almost always "dbp-dbr:Movie" or "dbp-dbr:Film"
        # -> candidate banned triple
    def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
        """
        Yield one DataFrame per movie with its RDF rows and abstract, limited to
        the given relationships.
        Args:
            MOVIE_ID (pd.DataFrame): needs "MovieID"; one query is run per row
            REL_ID (pd.DataFrame): needs "RelationshipID"; the relationships to keep
        Yields:
            whatever the SQL endpoint returns for the query, whose selected
            columns are MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        """
        # one "?" per relationship id, bound after the movie id
        relationship_ids = tuple(REL_ID["RelationshipID"].to_list())
        relationship_placeholder = ",".join("?" for _ in range(len(REL_ID)))
        QUERY = f"""
                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
                FROM RDFs
                INNER JOIN ParsedSubjects USING (SubjectID)
                INNER JOIN ParsedRelationships USING (RelationshipID)
                INNER JOIN ParsedObjects USING (ObjectID)
                INNER JOIN WikipediaAbstracts USING (MovieID)
                WHERE MovieID = (?) AND RelationshipID IN ({relationship_placeholder});
                """
        for movie_id in MOVIE_ID["MovieID"].to_list():
            bound_values = (movie_id, *relationship_ids)
            yield self.sql_endpoint.get_dataframe_from_query(QUERY, params=bound_values)
--- a/Scripts/DataCleaning/pipeline/relationship_filter.py
+++ b/Scripts/DataCleaning/pipeline/relationship_filter.py
@@ -0,0 +1,54 @@
 import pandas as pd
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
 class RelationshipFilter:
    """Keeps the set of RelationshipIDs that survive the relationship filters.

    `RELATIONSHIP_FILTER` is a DataFrame with a "RelationshipID" column. It is
    loaded with every relationship at construction and narrowed in place by
    `frequency_filter` and `delete_relationship_uri_by_list`.
    """

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # Start from every RelationshipID; each filter below narrows this frame.
        relationship_query = "SELECT RelationshipID FROM Relationships"
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)

    def _current_ids(self) -> tuple:
        """Return the RelationshipIDs currently kept, ready to be bound to a query."""
        return tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list())

    @staticmethod
    def _placeholders(count: int) -> str:
        """Return `count` comma-separated "?" markers for an SQL IN list."""
        return ",".join(["?"] * count)

    def _query_ids_by_uri(self, uri_list: list[str], keep_listed: bool):
        """Query the kept ids whose URI is (or is not) in `uri_list`."""
        current_ids = self._current_ids()
        membership = "IN" if keep_listed else "NOT IN"
        filter_query = f"""
            SELECT RelationshipID
            FROM ParsedRelationships
            WHERE RelationshipID IN ({self._placeholders(len(current_ids))})
            AND RelationshipURI {membership} ({self._placeholders(len(uri_list))});
        """
        params = current_ids + tuple(uri_list)
        return self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def frequency_filter(self, min_treshold:int, max_treshold:int):
        """Keep only relationships whose row count in RDFs lies in the given range.

        Args:
            min_treshold: lowest accepted number of RDF rows (inclusive).
            max_treshold: highest accepted number of RDF rows (inclusive).
        """
        current_ids = self._current_ids()
        # The thresholds are bound like the ids instead of being formatted
        # into the SQL text, so no value ever becomes part of the statement.
        filter_query = f"""
            SELECT RelationshipID
            FROM RDFs
            WHERE RelationshipID IN ({self._placeholders(len(current_ids))})
            GROUP BY RelationshipID
            HAVING COUNT(*) BETWEEN ? AND ?;
        """
        params = current_ids + (min_treshold, max_treshold)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def get_relationship_id(self):
        """Return the DataFrame of RelationshipIDs currently kept."""
        return self.RELATIONSHIP_FILTER

    def get_relationship_id_from_white_list(self, relationship_list: list[str]):
        """Return the kept ids whose URI is in `relationship_list`.

        The stored filter is left untouched; only the query result is returned.
        """
        return self._query_ids_by_uri(relationship_list, keep_listed=True)

    def delete_relationship_uri_by_list(self, filter_list: list[str]):
        """Drop from the stored filter every relationship whose URI is in `filter_list`."""
        self.RELATIONSHIP_FILTER = self._query_ids_by_uri(filter_list, keep_listed=False)
--- a/Scripts/Libs/CleaningPipeline/special_token.py
+++ b/Scripts/Libs/CleaningPipeline/special_token.py
@@ -0,0 +1,22 @@
 from enum import Enum
 class SpecialToken(str, Enum):
    """Special marker strings of the cleaning pipeline.

    Each member is also a `str`; use `.value` to get the raw marker text.
    """
    # The `str` mix-in must come before `Enum`: (Enum, str) -> throws an error
    # Triple structure markers
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    END_OF_SENTENCE = "<EOS>"
    CORPUS_END = "<END>"
    ## Tasks' Token
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"
    # BPE Training:
--- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -0,0 +1,149 @@
 #######################################################
 #  This file acts as the endpoint for DB interaction  #
 #######################################################
 # import sqlite3
 import pandas as pd
 from sqlalchemy import create_engine
 from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
 class SqlEndpoint():
    """Read access point to the SQLite dataset used by the cleaning pipeline.

    Every getter runs its query through pandas on `self.sql_engine`.
    """

    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
        """Open the database and preload the list of movie ids.

        Args:
            DB_PATH: path of the SQLite database file.
            chunk_size_row: number of rows per chunk for the `get_chunked_*` getters.
        """
        # "sqlite:///"  (3 slashes) -> relative path
        # "sqlite:////" (4 slashes) -> absolute path
        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
        # Author's note: streaming cursors (stream_results=True) did not seem
        # to be supported by sqlite, so no long-lived connection is kept.
        # Only the get_chunked_* getters read this; the per-movie generator
        # emits one frame per movie instead.
        self.chunk_size_row = chunk_size_row
        # Loaded once here and iterated by get_abbreviated_dataset_by_movie_id.
        self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]

    def get_RDF(self) -> pd.DataFrame :
        """Return every RDF row joined with its non-abbreviated URIs.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI
        """
        QUERY = """
                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
                FROM RDFs
                INNER JOIN Subjects USING (SubjectID)
                INNER JOIN Relationships USING (RelationshipID)
                INNER JOIN Objects USING (ObjectID);
                """
        # The sqlite3 connection (`self.CONN`) is no longer created, so the
        # query has to go through the engine like every other getter.
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
        """Read the whole abbreviated dataset in chunks of `chunk_size_row` rows.

        Because `chunksize` is passed, pandas hands back an iterator of
        DataFrames rather than a single frame.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        """
        QUERY = """
                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
                FROM RDFs
                INNER JOIN ParsedSubjects USING (SubjectID)
                INNER JOIN ParsedRelationships USING (RelationshipID)
                INNER JOIN ParsedObjects USING (ObjectID)
                INNER JOIN WikipediaAbstracts USING (MovieID);
                """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
        """DEPRECATED. Chunked abbreviated dataset with a token prefixed to each URI.

        Because `chunksize` is passed, pandas hands back an iterator of
        DataFrames rather than a single frame.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        """
        QUERY = """
                SELECT 
                    MovieID, 
                    ? || SubjectURI AS SubjectURI,
                    ? || RelationshipURI AS RelationshipURI, 
                    ? || ObjectURI AS ObjectURI, 
                    Abstract
                FROM RDFs
                INNER JOIN ParsedSubjects USING (SubjectID)
                INNER JOIN ParsedRelationships USING (RelationshipID)
                INNER JOIN ParsedObjects USING (ObjectID)
                INNER JOIN WikipediaAbstracts USING (MovieID);
                """
        # The three "?" markers need values, in column order. An Enum class
        # cannot be instantiated without a value, so the members are read directly.
        # NOTE(review): the token chosen for each column is inferred from the
        # column names - confirm before reviving this method.
        start_tokens = (
            SpecialToken.SUBJECT.value,
            SpecialToken.RELATIONSHIP.value,
            SpecialToken.OBJECT.value,
        )
        return pd.read_sql_query(
            QUERY, self.sql_engine, params=start_tokens, chunksize=self.chunk_size_row
        )

    def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
        """
        Gets each time a DataFrame per movie (with all its rows in the dataset).
        The retrieved RDFs are already abbreviated by the sql parser.

        Yields:
            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
        """
        # Chunking by MovieID: the abstract is the same within a chunk, which
        # makes some per-movie logic applicable downstream.
        # Toy subset used while debugging (assign it to self.movie_ids to use it):
        #   The Dark Knight 117248, Inception 147074, The Avengers 113621,
        #   Cast Away 1123, The Departed 117586, American Psycho 90177,
        #   Avatar 71587, Django Unchained 138952, Spirited Away 144137,
        #   Knives Out 148025
        QUERY = """
                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
                FROM RDFs
                INNER JOIN ParsedSubjects USING (SubjectID)
                INNER JOIN ParsedRelationships USING (RelationshipID)
                INNER JOIN ParsedObjects USING (ObjectID)
                INNER JOIN WikipediaAbstracts USING (MovieID)
                WHERE MovieID = (?);
                """
        for movie_id in self.movie_ids:
            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))

    def get_movies_id_count(self) -> pd.DataFrame:
        """
        Gets the number of RDF rows of each movie in the dataset.

        Returns:
            Pandas.DataFrame: [MovieID, Count]
        """
        QUERY = """
                SELECT MovieID, COUNT(*) AS Count
                FROM RDFs
                GROUP BY MovieID;
                """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_relationship_count(self) -> pd.DataFrame:
        """
        Gets the number of RDF rows of each relationship in the dataset.

        Returns:
            Pandas.DataFrame: [RelationshipURI, Count]
        """
        QUERY = """
                SELECT RelationshipURI, COUNT(*) AS Count
                FROM RDFs
                INNER JOIN ParsedRelationships USING (RelationshipID)
                GROUP BY RelationshipURI;
                """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_dataframe_from_query(self, query: str, params=None):
        """Run an arbitrary query and return its result as a DataFrame.

        Args:
            query: SQL text, using "?" markers for bound values.
            params: values for the markers, or None when the query has none.
        """
        if params is None:
            return pd.read_sql_query(query, self.sql_engine)
        return pd.read_sql_query(query, self.sql_engine, params=params)
 if __name__ == "__main__" :
    # Manual smoke run: print the DataFrame of every movie, one after another.
    endpoint = SqlEndpoint()
    per_movie_frames = endpoint.get_abbreviated_dataset_by_movie_id()
    for movie_frame in per_movie_frames:
        print(movie_frame)
    # endpoint.get_RDF()
    print("done")
--- a/Scripts/Libs/Utils/dataframe_interaction.py
+++ b/Scripts/Libs/Utils/dataframe_interaction.py
@@ -0,0 +1,9 @@
 import pandas as pd
 def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
    """Concatenate every cell of `DF` into one string, row by row.

    Cells are converted with `str` and joined with no separator, neither
    between cells nor between rows. An empty frame gives an empty string.
    """
    # A single join builds the result in one pass instead of growing a
    # string with `+=` on every row.
    return "".join(
        str(cell)
        for row in DF.itertuples(index=False, name=None)
        for cell in row
    )
--- a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
@@ -0,0 +1,897 @@
 {
  "type": "excalidraw",
  "version": 2,
  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
  "elements": [
    {
      "id": "3zbCui3XtIGozHXTVAGRp",
      "type": "rectangle",
      "x": 316.5,
      "y": 123,
      "width": 436.5,
      "height": 145.5,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a0",
      "roundness": {
        "type": 3
      },
      "seed": 1698427950,
      "version": 35,
      "versionNonce": 601575602,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "wD66RDbG05HfvRhAtMb0J",
          "type": "text"
        },
        {
          "id": "gus_rxauKJ6T2L_F59PfN",
          "type": "arrow"
        }
      ],
      "updated": 1758818588814,
      "link": null,
      "locked": false
    },
    {
      "id": "wD66RDbG05HfvRhAtMb0J",
      "type": "text",
      "x": 480.98004150390625,
      "y": 183.25,
      "width": 107.5399169921875,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a1",
      "roundness": null,
      "seed": 910769774,
      "version": 31,
      "versionNonce": 1120989938,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818416720,
      "link": null,
      "locked": false,
      "text": "dataset.db",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "3zbCui3XtIGozHXTVAGRp",
      "originalText": "dataset.db",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "87-MeaiZGT1wln0nggYPZ",
      "type": "rectangle",
      "x": 339.5,
      "y": 309.5,
      "width": 392,
      "height": 156,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a2",
      "roundness": {
        "type": 3
      },
      "seed": 655550318,
      "version": 77,
      "versionNonce": 1103939826,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818339000,
      "link": null,
      "locked": false
    },
    {
      "id": "EjUxEhZqEBzwvlw0VE9eJ",
      "type": "rectangle",
      "x": 355.5,
      "y": 327,
      "width": 162,
      "height": 125.5,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a3",
      "roundness": {
        "type": 3
      },
      "seed": 1739846638,
      "version": 64,
      "versionNonce": 1594290034,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "ogRkV0neHrhEKTE6zlggl"
        }
      ],
      "updated": 1758818391415,
      "link": null,
      "locked": false
    },
    {
      "id": "ogRkV0neHrhEKTE6zlggl",
      "type": "text",
      "x": 378.7100524902344,
      "y": 377.25,
      "width": 115.57989501953125,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a3V",
      "roundness": null,
      "seed": 2037675630,
      "version": 12,
      "versionNonce": 1286472046,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818399222,
      "link": null,
      "locked": false,
      "text": "RDF_String",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "EjUxEhZqEBzwvlw0VE9eJ",
      "originalText": "RDF_String",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "hoIRMNiMJZl4YDo-hovWy",
      "type": "rectangle",
      "x": 542.5,
      "y": 327,
      "width": 173,
      "height": 125.5,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a4",
      "roundness": {
        "type": 3
      },
      "seed": 1189796530,
      "version": 99,
      "versionNonce": 1071057006,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "rsapATFAT5YSBCXzLupgZ"
        },
        {
          "id": "gus_rxauKJ6T2L_F59PfN",
          "type": "arrow"
        },
        {
          "id": "Wk1bJbbtC31FqObEL5xWt",
          "type": "arrow"
        }
      ],
      "updated": 1758818593647,
      "link": null,
      "locked": false
    },
    {
      "id": "rsapATFAT5YSBCXzLupgZ",
      "type": "text",
      "x": 585.6800384521484,
      "y": 377.25,
      "width": 86.63992309570312,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a5",
      "roundness": null,
      "seed": 829619694,
      "version": 12,
      "versionNonce": 713902318,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818405150,
      "link": null,
      "locked": false,
      "text": "Abstract",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "hoIRMNiMJZl4YDo-hovWy",
      "originalText": "Abstract",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "jSx8ApfhtRs_nk37VvDMb",
      "type": "rectangle",
      "x": 316.5,
      "y": 511,
      "width": 436.5,
      "height": 145.5,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a6",
      "roundness": {
        "type": 3
      },
      "seed": 492582894,
      "version": 132,
      "versionNonce": 893797614,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "6E23g-rgowNqHsBxX-LuM"
        },
        {
          "id": "hyFKqXwet_F79QM71atgI",
          "type": "arrow"
        },
        {
          "id": "x_DP1FcQ7jraGz0gBuDi3",
          "type": "arrow"
        },
        {
          "id": "1IGbCps2EHnzKgJUWM5nq",
          "type": "arrow"
        },
        {
          "id": "Wk1bJbbtC31FqObEL5xWt",
          "type": "arrow"
        }
      ],
      "updated": 1758818593647,
      "link": null,
      "locked": false
    },
    {
      "id": "6E23g-rgowNqHsBxX-LuM",
      "type": "text",
      "x": 499.9100341796875,
      "y": 571.25,
      "width": 69.679931640625,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a7",
      "roundness": null,
      "seed": 267696178,
      "version": 132,
      "versionNonce": 1668243186,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818543211,
      "link": null,
      "locked": false,
      "text": "Pandas",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "jSx8ApfhtRs_nk37VvDMb",
      "originalText": "Pandas",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "ohj18N4AOTDz5lJNcV9gi",
      "type": "rectangle",
      "x": 261,
      "y": 765.5,
      "width": 157,
      "height": 87,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a8",
      "roundness": {
        "type": 3
      },
      "seed": 1446207150,
      "version": 279,
      "versionNonce": 317375026,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "Ea1_ke2wA0D8ZjVOUtvfY",
          "type": "text"
        },
        {
          "id": "hyFKqXwet_F79QM71atgI",
          "type": "arrow"
        }
      ],
      "updated": 1758818570993,
      "link": null,
      "locked": false
    },
    {
      "id": "Ea1_ke2wA0D8ZjVOUtvfY",
      "type": "text",
      "x": 297.0800323486328,
      "y": 796.5,
      "width": 84.83993530273438,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a9",
      "roundness": null,
      "seed": 435116270,
      "version": 199,
      "versionNonce": 1282911218,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818570993,
      "link": null,
      "locked": false,
      "text": "train.txt",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "ohj18N4AOTDz5lJNcV9gi",
      "originalText": "train.txt",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "A4Y54Y26fe257U_QU9lxX",
      "type": "rectangle",
      "x": 464,
      "y": 765.5,
      "width": 157,
      "height": 87,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aA",
      "roundness": {
        "type": 3
      },
      "seed": 186148850,
      "version": 232,
      "versionNonce": 997119858,
      "isDeleted": false,
      "boundElements": [
        {
          "id": "v4TvUlDEjH7EvPDmtbOn2",
          "type": "text"
        },
        {
          "id": "1IGbCps2EHnzKgJUWM5nq",
          "type": "arrow"
        }
      ],
      "updated": 1758818570993,
      "link": null,
      "locked": false
    },
    {
      "id": "v4TvUlDEjH7EvPDmtbOn2",
      "type": "text",
      "x": 476.3500442504883,
      "y": 796.5,
      "width": 132.29991149902344,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aB",
      "roundness": null,
      "seed": 1131059634,
      "version": 171,
      "versionNonce": 239540530,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818570993,
      "link": null,
      "locked": false,
      "text": "validation.txt",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "A4Y54Y26fe257U_QU9lxX",
      "originalText": "validation.txt",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "mPaYpJ9Xn7tlJPmKPqJKJ",
      "type": "rectangle",
      "x": 674.5,
      "y": 765.5,
      "width": 157,
      "height": 87,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aC",
      "roundness": {
        "type": 3
      },
      "seed": 1049323314,
      "version": 235,
      "versionNonce": 330560690,
      "isDeleted": false,
      "boundElements": [
        {
          "type": "text",
          "id": "kg9nm2rpud6cax5aNPSnu"
        },
        {
          "id": "x_DP1FcQ7jraGz0gBuDi3",
          "type": "arrow"
        }
      ],
      "updated": 1758818570993,
      "link": null,
      "locked": false
    },
    {
      "id": "kg9nm2rpud6cax5aNPSnu",
      "type": "text",
      "x": 711.4300231933594,
      "y": 796.5,
      "width": 83.13995361328125,
      "height": 25,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aD",
      "roundness": null,
      "seed": 522572142,
      "version": 193,
      "versionNonce": 1920372338,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818570993,
      "link": null,
      "locked": false,
      "text": "test.txt",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "center",
      "verticalAlign": "middle",
      "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
      "originalText": "test.txt",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "hyFKqXwet_F79QM71atgI",
      "type": "arrow",
      "x": 534.65,
      "y": 661.5,
      "width": 195.25,
      "height": 99,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aG",
      "roundness": null,
      "seed": 873266098,
      "version": 71,
      "versionNonce": 541154738,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818570993,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          0,
          49.5
        ],
        [
          -195.25,
          49.5
        ],
        [
          -195.25,
          99
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jSx8ApfhtRs_nk37VvDMb",
        "fixedPoint": [
          0.49977090492554405,
          1.034364261168385
        ],
        "focus": 0,
        "gap": 0
      },
      "endBinding": {
        "elementId": "ohj18N4AOTDz5lJNcV9gi",
        "fixedPoint": [
          0.4993630573248406,
          -0.05747126436781609
        ],
        "focus": 0,
        "gap": 0
      },
      "startArrowhead": null,
      "endArrowhead": "triangle",
      "elbowed": true,
      "fixedSegments": null,
      "startIsSpecial": null,
      "endIsSpecial": null
    },
    {
      "id": "x_DP1FcQ7jraGz0gBuDi3",
      "type": "arrow",
      "x": 534.65,
      "y": 661.5,
      "width": 218.25,
      "height": 99,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aH",
      "roundness": null,
      "seed": 1210817582,
      "version": 77,
      "versionNonce": 1483392370,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818580594,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          0,
          49.5
        ],
        [
          218.25,
          49.5
        ],
        [
          218.25,
          99
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jSx8ApfhtRs_nk37VvDMb",
        "fixedPoint": [
          0.49977090492554405,
          1.034364261168385
        ],
        "focus": 0,
        "gap": 0
      },
      "endBinding": {
        "elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
        "fixedPoint": [
          0.4993630573248406,
          -0.05747126436781609
        ],
        "focus": 0,
        "gap": 0
      },
      "startArrowhead": null,
      "endArrowhead": "triangle",
      "elbowed": true,
      "fixedSegments": null,
      "startIsSpecial": null,
      "endIsSpecial": null
    },
    {
      "id": "1IGbCps2EHnzKgJUWM5nq",
      "type": "arrow",
      "x": 534.65,
      "y": 661.5,
      "width": 0.5719232650604908,
      "height": 99.07394122590165,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aK",
      "roundness": null,
      "seed": 1205316658,
      "version": 96,
      "versionNonce": 1748050674,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818570993,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          -0.5719232650604908,
          99.07394122590165
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "jSx8ApfhtRs_nk37VvDMb",
        "fixedPoint": [
          0.49977090492554405,
          1.034364261168385
        ],
        "focus": 0,
        "gap": 0
      },
      "endBinding": {
        "elementId": "A4Y54Y26fe257U_QU9lxX",
        "fixedPoint": [
          0.44635717665566554,
          -0.056621365219521276
        ],
        "focus": 0,
        "gap": 0
      },
      "startArrowhead": null,
      "endArrowhead": "triangle",
      "elbowed": true,
      "fixedSegments": null,
      "startIsSpecial": null,
      "endIsSpecial": null
    },
    {
      "id": "gus_rxauKJ6T2L_F59PfN",
      "type": "arrow",
      "x": 539,
      "y": 271.5,
      "width": 0,
      "height": 33.5,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aL",
      "roundness": null,
      "seed": 763990258,
      "version": 17,
      "versionNonce": 1028811378,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818588814,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          0,
          33.5
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "3zbCui3XtIGozHXTVAGRp",
        "focus": -0.019473081328751418,
        "gap": 3
      },
      "endBinding": {
        "elementId": "hoIRMNiMJZl4YDo-hovWy",
        "focus": -1.0404624277456647,
        "gap": 30.7545797799829
      },
      "startArrowhead": null,
      "endArrowhead": "triangle",
      "elbowed": false
    },
    {
      "id": "Wk1bJbbtC31FqObEL5xWt",
      "type": "arrow",
      "x": 536.5,
      "y": 468.5,
      "width": 0,
      "height": 39,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aM",
      "roundness": null,
      "seed": 1489771054,
      "version": 33,
      "versionNonce": 1828178606,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1758818593647,
      "link": null,
      "locked": false,
      "points": [
        [
          0,
          0
        ],
        [
          0,
          39
        ]
      ],
      "lastCommittedPoint": null,
      "startBinding": {
        "elementId": "hoIRMNiMJZl4YDo-hovWy",
        "focus": 1.0693641618497107,
        "gap": 27.157190169432425
      },
      "endBinding": {
        "elementId": "jSx8ApfhtRs_nk37VvDMb",
        "focus": 0.008018327605956525,
        "gap": 3.5
      },
      "startArrowhead": null,
      "endArrowhead": "triangle",
      "elbowed": false
    }
  ],
  "appState": {
    "gridSize": 20,
    "gridStep": 5,
    "gridModeEnabled": false,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
 }
--- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
@@ -0,0 +1,634 @@
 {
  "type": "excalidraw",
  "version": 2,
  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
  "elements": [
    {
      "id": "JNB9z-PeqZ4s8KDfWaoXe",
      "type": "rectangle",
      "x": 106,
      "y": 27,
      "width": 653,
      "height": 263,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a2",
      "roundness": {
        "type": 3
      },
      "seed": 710740889,
      "version": 326,
      "versionNonce": 1107631703,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1759156408059,
      "link": null,
      "locked": false
    },
    {
      "id": "e13wNTgUpn2flMpmMttqx",
      "type": "text",
      "x": 200.5943407656526,
      "y": 44.07937975075269,
      "width": 307.2781467269385,
      "height": 23.3097531902191,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a3",
      "roundness": null,
      "seed": 1012740663,
      "version": 444,
      "versionNonce": 589551257,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1759156408059,
      "link": null,
      "locked": false,
      "text": "Libs/CleaningPipeline/sql_endpoint",
      "fontSize": 18.64780255217528,
      "fontFamily": 5,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "Libs/CleaningPipeline/sql_endpoint",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "id": "CgxCElJkKBtIHv-5WQrbo",
      "type": "text",
      "x": 195,
      "y": 80.44259472749451,
      "width": 403.64997665852184,
      "height": 186.4780255217528,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "a4",
      "roundness": null,
      "seed": 1261951799,
      "version": 507,
      "versionNonce": 1922906999,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1759156408059,
      "link": null,
      "locked": false,
      "text": "Class SqlEndpoint:\n    - sql_engine\n    + movie_ids: list[int]\n\n    #\n    + get_abbreviated_dataset_by_movie_id\n\n",
      "fontSize": 18.64780255217528,
      "fontFamily": 5,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "Class SqlEndpoint:\n    - sql_engine\n    + movie_ids: list[int]\n\n    #\n    + get_abbreviated_dataset_by_movie_id\n\n",
      "autoResize": true,
      "lineHeight": 1.25
    },
    {
      "type": "line",
      "version": 4979,
      "versionNonce": 1473849177,
      "isDeleted": false,
      "id": "sYReMTdYblr-oJtYYJALU",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -67.14432426259049,
      "y": 87.19293561900287,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 77.09201683999922,
      "height": 99.49948667804088,
      "seed": 1263944119,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "round",
      "boundElementIds": [],
      "startBinding": null,
      "endBinding": null,
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": null,
      "points": [
        [
          0,
          0
        ],
        [
          0.2542098813493443,
          75.20117273657175
        ],
        [
          0.011896425679918422,
          83.76249969444815
        ],
        [
          3.970409367559332,
          87.46174320643391
        ],
        [
          17.75573317066317,
          90.59250103325854
        ],
        [
          41.05683533152865,
          91.56737225214069
        ],
        [
          63.319497586673116,
          90.01084754868091
        ],
        [
          75.14781395923075,
          86.28844687220405
        ],
        [
          76.81603792670788,
          83.15042405259751
        ],
        [
          77.05033394391478,
          76.25776215104557
        ],
        [
          76.86643881413028,
          6.3089586511537865
        ],
        [
          76.45188016352971,
          -0.2999144698665015
        ],
        [
          71.50179495549581,
          -3.9936571317850627
        ],
        [
          61.077971898861186,
          -6.132877429442784
        ],
        [
          37.32348754161154,
          -7.932114425900202
        ],
        [
          18.278415656797975,
          -6.859225353587373
        ],
        [
          3.2995959613238286,
          -3.2201165291205287
        ],
        [
          -0.04168289608444441,
          -0.045185660461322996
        ],
        [
          0,
          0
        ]
      ],
      "index": "a6",
      "frameId": null,
      "roundness": {
        "type": 2
      },
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "line",
      "version": 2684,
      "versionNonce": 952947769,
      "isDeleted": false,
      "id": "0S6dEWQVqKUVkP6Z5IX1l",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -66.6203948243155,
      "y": 144.31921927673278,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 77.17198221193564,
      "height": 8.562348957853036,
      "seed": 817033943,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "round",
      "boundElementIds": [],
      "startBinding": null,
      "endBinding": null,
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": null,
      "points": [
        [
          0,
          0
        ],
        [
          2.033150371639873,
          3.413095389435587
        ],
        [
          10.801287372573954,
          6.276651055277943
        ],
        [
          22.468666942209353,
          8.010803051612635
        ],
        [
          40.747074201802775,
          8.168828515515864
        ],
        [
          62.077348233027564,
          7.0647721921469495
        ],
        [
          74.53446931782398,
          3.04824021069218
        ],
        [
          77.17198221193564,
          -0.3935204423371723
        ]
      ],
      "index": "a7",
      "frameId": null,
      "roundness": {
        "type": 2
      },
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "line",
      "version": 2770,
      "versionNonce": 477619481,
      "isDeleted": false,
      "id": "szGLND7J0nVOvRkNXX9AS",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -67.65225214681931,
      "y": 115.35516394150972,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 77.17198221193564,
      "height": 8.562348957853036,
      "seed": 1704755191,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "round",
      "boundElementIds": [],
      "startBinding": null,
      "endBinding": null,
      "lastCommittedPoint": null,
      "startArrowhead": null,
      "endArrowhead": null,
      "points": [
        [
          0,
          0
        ],
        [
          2.033150371639873,
          3.413095389435587
        ],
        [
          10.801287372573954,
          6.276651055277943
        ],
        [
          22.468666942209353,
          8.010803051612635
        ],
        [
          40.747074201802775,
          8.168828515515864
        ],
        [
          62.077348233027564,
          7.0647721921469495
        ],
        [
          74.53446931782398,
          3.04824021069218
        ],
        [
          77.17198221193564,
          -0.3935204423371723
        ]
      ],
      "index": "a8",
      "frameId": null,
      "roundness": {
        "type": 2
      },
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 5767,
      "versionNonce": 2119031289,
      "isDeleted": false,
      "id": "O3t2uGktJlDd1_OX_bpV4",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -68.71020112890136,
      "y": 80.06066699332126,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 76.59753601865496,
      "height": 15.49127539284798,
      "seed": 471296279,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [
        "bxuMGTzXLn7H-uBCptINx"
      ],
      "index": "a9",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 1177,
      "versionNonce": 525480665,
      "isDeleted": false,
      "id": "_SzKlOBOvJgBg7FX0JTTM",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -32.218214023678854,
      "y": 104.53733467322485,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "width": 11.226103154161754,
      "height": 12.183758484455605,
      "seed": 1368927799,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "index": "aA",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 1465,
      "versionNonce": 1410887609,
      "isDeleted": false,
      "id": "oJMl2Kxa3SPaiAY0kxo7A",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -31.867072239745255,
      "y": 130.75394896028996,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "width": 11.226103154161754,
      "height": 12.183758484455605,
      "seed": 1627606871,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "index": "aB",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "ellipse",
      "version": 1348,
      "versionNonce": 314839193,
      "isDeleted": false,
      "id": "fB6pJBSMA-pRHrpgYKaLL",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 6.239590202363168,
      "x": -31.218214023678854,
      "y": 159.52267553159635,
      "strokeColor": "#000000",
      "backgroundColor": "#228be6",
      "width": 11.226103154161754,
      "height": 12.183758484455605,
      "seed": 1420643447,
      "groupIds": [
        "9YkNe1yqnfZy9Z1JX2xr4",
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "index": "aC",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false
    },
    {
      "type": "text",
      "version": 846,
      "versionNonce": 1091081593,
      "isDeleted": false,
      "id": "9gZ3Yy1MeP9kEOTLODqLG",
      "fillStyle": "solid",
      "strokeWidth": 1,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "angle": 0,
      "x": -76.81018163712321,
      "y": 181.11281713043917,
      "strokeColor": "#000000",
      "backgroundColor": "#a5d8ff",
      "width": 95.63072204589844,
      "height": 23.595161071904883,
      "seed": 2019206551,
      "groupIds": [
        "BDBCTrrhjbJynRAyuf3xJ"
      ],
      "strokeSharpness": "sharp",
      "boundElementIds": [],
      "fontSize": 17.4778970902999,
      "fontFamily": 1,
      "text": "dataset.db",
      "baseline": 16.595161071904883,
      "textAlign": "center",
      "verticalAlign": "top",
      "index": "aD",
      "frameId": null,
      "roundness": null,
      "boundElements": [],
      "updated": 1759158252997,
      "link": null,
      "locked": false,
      "containerId": null,
      "originalText": "dataset.db",
      "autoResize": true,
      "lineHeight": 1.350000000000001
    },
    {
      "id": "3eOw20xMhpB5jf_RMG24P",
      "type": "text",
      "x": 1131.3333333333335,
      "y": 31.333333333333428,
      "width": 508.3333333333333,
      "height": 550,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aE",
      "roundness": null,
      "seed": 1535658041,
      "version": 821,
      "versionNonce": 1630266809,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1759157181677,
      "link": null,
      "locked": false,
      "text": "Class PipelineApplier\n    - movie_frequence_filter : pd.DataFrame()\n    - rel_Frequence_Filter : pd.DataFrame()\n    - rel_banned_list: list[str]\n\n    + generate_movie_frequency_filter()\n    + generate_rel_frequency_filter()\n    + generate_list_relationship_filter()\n    \n    + filter_by_movie_frequency()\n    + filter_by_relationship_frequency()\n    + delete_relationship_by_list_filter()\n    + delete_relationship_by_str()\n\n    + drop_na()    \n\n    + rdf_add_special_token()\n    + group_triple_by_movie()\n    + build_by_movie()\n    # static\n    + build_triple()\n    + build_incomplete_triple()",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "Class PipelineApplier\n    - movie_frequence_filter : pd.DataFrame()\n    - rel_Frequence_Filter : pd.DataFrame()\n    - rel_banned_list: list[str]\n\n    + generate_movie_frequency_filter()\n    + generate_rel_frequency_filter()\n    + generate_list_relationship_filter()\n    \n    + filter_by_movie_frequency()\n    + filter_by_relationship_frequency()\n    + delete_relationship_by_list_filter()\n    + delete_relationship_by_str()\n\n    + drop_na()    \n\n    + rdf_add_special_token()\n    + group_triple_by_movie()\n    + build_by_movie()\n    # static\n    + build_triple()\n    + build_incomplete_triple()",
      "autoResize": false,
      "lineHeight": 1.25
    },
    {
      "id": "Fbl1gpb5r7QrdRauGUWm2",
      "type": "text",
      "x": 158.23809523809535,
      "y": 502.52380952380935,
      "width": 484.2857142857143,
      "height": 500,
      "angle": 0,
      "strokeColor": "#1e1e1e",
      "backgroundColor": "transparent",
      "fillStyle": "solid",
      "strokeWidth": 2,
      "strokeStyle": "solid",
      "roughness": 1,
      "opacity": 100,
      "groupIds": [],
      "frameId": null,
      "index": "aF",
      "roundness": null,
      "seed": 2066618807,
      "version": 552,
      "versionNonce": 1269344823,
      "isDeleted": false,
      "boundElements": null,
      "updated": 1759158199532,
      "link": null,
      "locked": false,
      "text": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n    #\n    - get_cleaned_movie_rows()\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
      "fontSize": 20,
      "fontFamily": 5,
      "textAlign": "left",
      "verticalAlign": "top",
      "containerId": null,
      "originalText": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n    #\n    - get_cleaned_movie_rows()\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
      "autoResize": false,
      "lineHeight": 1.25
    }
  ],
  "appState": {
    "gridSize": 20,
    "gridStep": 5,
    "gridModeEnabled": false,
    "viewBackgroundColor": "#ffffff"
  },
  "files": {}
 }
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ tzdata==2025.2
 urllib3==2.5.0
 wheel==0.45.1
 Wikipedia-API==0.8.1
 SQLAlchemy
Author	SHA1	Message	Date
GassiGiuseppe	856c693650	Added possibility to whitelist relationships	2025-10-12 12:26:26 +02:00
GassiGiuseppe	e9d30b3cea	add divide method to create hold out dataset	2025-10-11 16:49:36 +02:00
GassiGiuseppe	ee12f53f12	Added EOS token	2025-10-07 22:47:59 +02:00
GassiGiuseppe	a04f4c7cb7	changes to shorten the dataset	2025-10-07 15:49:25 +02:00
GassiGiuseppe	a93e61b8c1	Update ETL	2025-10-07 00:54:00 +02:00
GassiGiuseppe	0373460105	Movie filters updated	2025-10-06 10:57:50 +02:00
GassiGiuseppe	7307916891	update sql_endpoint to work with the new pipeline	2025-10-05 14:58:03 +02:00
GassiGiuseppe	acb43fc899	new faster pipeline	2025-10-05 14:57:45 +02:00
GassiGiuseppe	255d801a80	updated the mask rdf_mask_task; however, since the model will build the mask itself, it is deprecated	2025-10-05 14:56:33 +02:00
GassiGiuseppe	2bd24ec278	Created legacy folder for old pipeline; this pipeline still works but is slower than the new one, and some of its methods can be used later	2025-10-05 14:54:32 +02:00
GassiGiuseppe	69fba7c3e9	new utility to generate a csv debug file of the output of the pipeline	2025-10-04 21:33:09 +02:00
GassiGiuseppe	64e355e80c	Added regex to delete new lines and * from ObjectURI	2025-09-30 15:00:07 +02:00
GassiGiuseppe	007f1e9554	minor updates	2025-09-29 18:53:33 +02:00
GassiGiuseppe	c319398ca0	little update to UML pipeline	2025-09-29 17:03:31 +02:00
GassiGiuseppe	255d8a072d	First implementation of the cleaning pipeline UML	2025-09-29 16:59:52 +02:00
GassiGiuseppe	8167c9d435	Added Toy Dataset entry point into the Pipeline class. Before, it was forced into the sql_endpoint; now the whole pipeline can be managed in the Pipeline class	2025-09-29 16:03:49 +02:00
GassiGiuseppe	bd72ad3571	Added file to execute the complete cleaning pipeline	2025-09-29 15:21:26 +02:00
GassiGiuseppe	6ddb7de9da	Added sqlAlchemy to requirements	2025-09-29 15:19:19 +02:00
GassiGiuseppe	650b37c586	Added vscode setting to execute jupyternotebook from root dir	2025-09-26 11:24:34 +02:00
GassiGiuseppe	e521b0704e	deleted TODO in path_splitter_tree, as it was already resolved	2025-09-25 19:19:11 +02:00
Christian Risi	0a698e9837	Added schema to extract from DB for BPE	2025-09-25 19:09:52 +02:00