From 0a698e9837367de4e42d5b7506ed2a84b4e8f440 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Thu, 25 Sep 2025 19:09:52 +0200
Subject: [PATCH 1/9] Added schema to extract from DB for BPE

---
 .../bpe-pipeline.excalidraw.json              | 897 ++++++++++++++++++
 1 file changed, 897 insertions(+)
 create mode 100644 Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json

diff --git a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
new file mode 100644
index 0000000..0edf3cf
--- /dev/null
+++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
@@ -0,0 +1,897 @@
+{
+  "type": "excalidraw",
+  "version": 2,
+  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
+  "elements": [
+    {
+      "id": "3zbCui3XtIGozHXTVAGRp",
+      "type": "rectangle",
+      "x": 316.5,
+      "y": 123,
+      "width": 436.5,
+      "height": 145.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a0",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1698427950,
+      "version": 35,
+      "versionNonce": 601575602,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "id": "wD66RDbG05HfvRhAtMb0J",
+          "type": "text"
+        },
+        {
+          "id": "gus_rxauKJ6T2L_F59PfN",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818588814,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "wD66RDbG05HfvRhAtMb0J",
+      "type": "text",
+      "x": 480.98004150390625,
+      "y": 183.25,
+      "width": 107.5399169921875,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a1",
+      "roundness": null,
+      "seed": 910769774,
+      "version": 31,
+      "versionNonce": 1120989938,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818416720,
+      "link": null,
+      "locked": false,
+      "text": "dataset.db",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "3zbCui3XtIGozHXTVAGRp",
+      "originalText": "dataset.db",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "87-MeaiZGT1wln0nggYPZ",
+      "type": "rectangle",
+      "x": 339.5,
+      "y": 309.5,
+      "width": 392,
+      "height": 156,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a2",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 655550318,
+      "version": 77,
+      "versionNonce": 1103939826,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818339000,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "EjUxEhZqEBzwvlw0VE9eJ",
+      "type": "rectangle",
+      "x": 355.5,
+      "y": 327,
+      "width": 162,
+      "height": 125.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a3",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1739846638,
+      "version": 64,
+      "versionNonce": 1594290034,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "ogRkV0neHrhEKTE6zlggl"
+        }
+      ],
+      "updated": 1758818391415,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "ogRkV0neHrhEKTE6zlggl",
+      "type": "text",
+      "x": 378.7100524902344,
+      "y": 377.25,
+      "width": 115.57989501953125,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a3V",
+      "roundness": null,
+      "seed": 2037675630,
+      "version": 12,
+      "versionNonce": 1286472046,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818399222,
+      "link": null,
+      "locked": false,
+      "text": "RDF_String",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "EjUxEhZqEBzwvlw0VE9eJ",
+      "originalText": "RDF_String",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "hoIRMNiMJZl4YDo-hovWy",
+      "type": "rectangle",
+      "x": 542.5,
+      "y": 327,
+      "width": 173,
+      "height": 125.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a4",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1189796530,
+      "version": 99,
+      "versionNonce": 1071057006,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "rsapATFAT5YSBCXzLupgZ"
+        },
+        {
+          "id": "gus_rxauKJ6T2L_F59PfN",
+          "type": "arrow"
+        },
+        {
+          "id": "Wk1bJbbtC31FqObEL5xWt",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818593647,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "rsapATFAT5YSBCXzLupgZ",
+      "type": "text",
+      "x": 585.6800384521484,
+      "y": 377.25,
+      "width": 86.63992309570312,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a5",
+      "roundness": null,
+      "seed": 829619694,
+      "version": 12,
+      "versionNonce": 713902318,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818405150,
+      "link": null,
+      "locked": false,
+      "text": "Abstract",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "hoIRMNiMJZl4YDo-hovWy",
+      "originalText": "Abstract",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "jSx8ApfhtRs_nk37VvDMb",
+      "type": "rectangle",
+      "x": 316.5,
+      "y": 511,
+      "width": 436.5,
+      "height": 145.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a6",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 492582894,
+      "version": 132,
+      "versionNonce": 893797614,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "6E23g-rgowNqHsBxX-LuM"
+        },
+        {
+          "id": "hyFKqXwet_F79QM71atgI",
+          "type": "arrow"
+        },
+        {
+          "id": "x_DP1FcQ7jraGz0gBuDi3",
+          "type": "arrow"
+        },
+        {
+          "id": "1IGbCps2EHnzKgJUWM5nq",
+          "type": "arrow"
+        },
+        {
+          "id": "Wk1bJbbtC31FqObEL5xWt",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818593647,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "6E23g-rgowNqHsBxX-LuM",
+      "type": "text",
+      "x": 499.9100341796875,
+      "y": 571.25,
+      "width": 69.679931640625,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a7",
+      "roundness": null,
+      "seed": 267696178,
+      "version": 132,
+      "versionNonce": 1668243186,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818543211,
+      "link": null,
+      "locked": false,
+      "text": "Pandas",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "jSx8ApfhtRs_nk37VvDMb",
+      "originalText": "Pandas",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "ohj18N4AOTDz5lJNcV9gi",
+      "type": "rectangle",
+      "x": 261,
+      "y": 765.5,
+      "width": 157,
+      "height": 87,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a8",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1446207150,
+      "version": 279,
+      "versionNonce": 317375026,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "id": "Ea1_ke2wA0D8ZjVOUtvfY",
+          "type": "text"
+        },
+        {
+          "id": "hyFKqXwet_F79QM71atgI",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "Ea1_ke2wA0D8ZjVOUtvfY",
+      "type": "text",
+      "x": 297.0800323486328,
+      "y": 796.5,
+      "width": 84.83993530273438,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a9",
+      "roundness": null,
+      "seed": 435116270,
+      "version": 199,
+      "versionNonce": 1282911218,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "text": "train.txt",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "ohj18N4AOTDz5lJNcV9gi",
+      "originalText": "train.txt",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "A4Y54Y26fe257U_QU9lxX",
+      "type": "rectangle",
+      "x": 464,
+      "y": 765.5,
+      "width": 157,
+      "height": 87,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aA",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 186148850,
+      "version": 232,
+      "versionNonce": 997119858,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "id": "v4TvUlDEjH7EvPDmtbOn2",
+          "type": "text"
+        },
+        {
+          "id": "1IGbCps2EHnzKgJUWM5nq",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "v4TvUlDEjH7EvPDmtbOn2",
+      "type": "text",
+      "x": 476.3500442504883,
+      "y": 796.5,
+      "width": 132.29991149902344,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aB",
+      "roundness": null,
+      "seed": 1131059634,
+      "version": 171,
+      "versionNonce": 239540530,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "text": "validation.txt",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "A4Y54Y26fe257U_QU9lxX",
+      "originalText": "validation.txt",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "mPaYpJ9Xn7tlJPmKPqJKJ",
+      "type": "rectangle",
+      "x": 674.5,
+      "y": 765.5,
+      "width": 157,
+      "height": 87,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aC",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 1049323314,
+      "version": 235,
+      "versionNonce": 330560690,
+      "isDeleted": false,
+      "boundElements": [
+        {
+          "type": "text",
+          "id": "kg9nm2rpud6cax5aNPSnu"
+        },
+        {
+          "id": "x_DP1FcQ7jraGz0gBuDi3",
+          "type": "arrow"
+        }
+      ],
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "kg9nm2rpud6cax5aNPSnu",
+      "type": "text",
+      "x": 711.4300231933594,
+      "y": 796.5,
+      "width": 83.13995361328125,
+      "height": 25,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aD",
+      "roundness": null,
+      "seed": 522572142,
+      "version": 193,
+      "versionNonce": 1920372338,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "text": "test.txt",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "center",
+      "verticalAlign": "middle",
+      "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
+      "originalText": "test.txt",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "hyFKqXwet_F79QM71atgI",
+      "type": "arrow",
+      "x": 534.65,
+      "y": 661.5,
+      "width": 195.25,
+      "height": 99,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aG",
+      "roundness": null,
+      "seed": 873266098,
+      "version": 71,
+      "versionNonce": 541154738,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          49.5
+        ],
+        [
+          -195.25,
+          49.5
+        ],
+        [
+          -195.25,
+          99
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "fixedPoint": [
+          0.49977090492554405,
+          1.034364261168385
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "endBinding": {
+        "elementId": "ohj18N4AOTDz5lJNcV9gi",
+        "fixedPoint": [
+          0.4993630573248406,
+          -0.05747126436781609
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": true,
+      "fixedSegments": null,
+      "startIsSpecial": null,
+      "endIsSpecial": null
+    },
+    {
+      "id": "x_DP1FcQ7jraGz0gBuDi3",
+      "type": "arrow",
+      "x": 534.65,
+      "y": 661.5,
+      "width": 218.25,
+      "height": 99,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aH",
+      "roundness": null,
+      "seed": 1210817582,
+      "version": 77,
+      "versionNonce": 1483392370,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818580594,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          49.5
+        ],
+        [
+          218.25,
+          49.5
+        ],
+        [
+          218.25,
+          99
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "fixedPoint": [
+          0.49977090492554405,
+          1.034364261168385
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "endBinding": {
+        "elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
+        "fixedPoint": [
+          0.4993630573248406,
+          -0.05747126436781609
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": true,
+      "fixedSegments": null,
+      "startIsSpecial": null,
+      "endIsSpecial": null
+    },
+    {
+      "id": "1IGbCps2EHnzKgJUWM5nq",
+      "type": "arrow",
+      "x": 534.65,
+      "y": 661.5,
+      "width": 0.5719232650604908,
+      "height": 99.07394122590165,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aK",
+      "roundness": null,
+      "seed": 1205316658,
+      "version": 96,
+      "versionNonce": 1748050674,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818570993,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          -0.5719232650604908,
+          99.07394122590165
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "fixedPoint": [
+          0.49977090492554405,
+          1.034364261168385
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "endBinding": {
+        "elementId": "A4Y54Y26fe257U_QU9lxX",
+        "fixedPoint": [
+          0.44635717665566554,
+          -0.056621365219521276
+        ],
+        "focus": 0,
+        "gap": 0
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": true,
+      "fixedSegments": null,
+      "startIsSpecial": null,
+      "endIsSpecial": null
+    },
+    {
+      "id": "gus_rxauKJ6T2L_F59PfN",
+      "type": "arrow",
+      "x": 539,
+      "y": 271.5,
+      "width": 0,
+      "height": 33.5,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aL",
+      "roundness": null,
+      "seed": 763990258,
+      "version": 17,
+      "versionNonce": 1028811378,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818588814,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          33.5
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "3zbCui3XtIGozHXTVAGRp",
+        "focus": -0.019473081328751418,
+        "gap": 3
+      },
+      "endBinding": {
+        "elementId": "hoIRMNiMJZl4YDo-hovWy",
+        "focus": -1.0404624277456647,
+        "gap": 30.7545797799829
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": false
+    },
+    {
+      "id": "Wk1bJbbtC31FqObEL5xWt",
+      "type": "arrow",
+      "x": 536.5,
+      "y": 468.5,
+      "width": 0,
+      "height": 39,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aM",
+      "roundness": null,
+      "seed": 1489771054,
+      "version": 33,
+      "versionNonce": 1828178606,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1758818593647,
+      "link": null,
+      "locked": false,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0,
+          39
+        ]
+      ],
+      "lastCommittedPoint": null,
+      "startBinding": {
+        "elementId": "hoIRMNiMJZl4YDo-hovWy",
+        "focus": 1.0693641618497107,
+        "gap": 27.157190169432425
+      },
+      "endBinding": {
+        "elementId": "jSx8ApfhtRs_nk37VvDMb",
+        "focus": 0.008018327605956525,
+        "gap": 3.5
+      },
+      "startArrowhead": null,
+      "endArrowhead": "triangle",
+      "elbowed": false
+    }
+  ],
+  "appState": {
+    "gridSize": 20,
+    "gridStep": 5,
+    "gridModeEnabled": false,
+    "viewBackgroundColor": "#ffffff"
+  },
+  "files": {}
+}
\ No newline at end of file

From e521b0704e1941ede504f58a615d8a20fa77461b Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Thu, 25 Sep 2025 19:19:11 +0200
Subject: [PATCH 2/9] deleted TODO in path_splitter_tree, as it was already
 resolved

---
 Scripts/DataCleaning/path_splitter_tree.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py
index e7f6f9e..9c0914a 100644
--- a/Scripts/DataCleaning/path_splitter_tree.py
+++ b/Scripts/DataCleaning/path_splitter_tree.py
@@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str):
 
     FILE = open(file, "r", encoding="utf-8")
 
-    # TODO: Change here so it takes single URI from a CSV file
     # It is needed the header-name
     for row in csv.DictReader(FILE):
 

From 650b37c586fe07d9bb83d4471a727c12cd717dfb Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Fri, 26 Sep 2025 11:24:34 +0200
Subject: [PATCH 3/9] Added vscode setting to execute jupyternotebook from root
 dir

---
 .vscode/settings.json | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .vscode/settings.json

diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..17ae78b
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+  "jupyter.notebookFileRoot": "${workspaceFolder}"
+}
\ No newline at end of file

From 6ddb7de9da1af4fad8d8bae265f0622f56ba6bec Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Mon, 29 Sep 2025 15:19:19 +0200
Subject: [PATCH 4/9] Added sqlAlchemy to requirements

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index e87882c..70a3169 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -15,3 +15,4 @@ tzdata==2025.2
 urllib3==2.5.0
 wheel==0.45.1
 Wikipedia-API==0.8.1
+SQLAlchemy

From bd72ad3571bf2710cd154c5cf08b448dc194f13d Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Mon, 29 Sep 2025 15:21:26 +0200
Subject: [PATCH 5/9] Added file to execute the complete cleaning pipeline

---
 .../data_output_models/bpe_corpus.py          |  21 ++
 .../rdf_completation_task.py                  |  26 +++
 .../data_output_models/rdf_mask_task.py       |  58 ++++++
 .../data_output_models/rdf_text_tasks.py      |  26 +++
 Scripts/DataCleaning/filter.py                | 184 ++++++++++++++++++
 Scripts/DataCleaning/pipeline.py              | 107 ++++++++++
 .../Libs/CleaningPipeline/special_token.py    |  21 ++
 Scripts/Libs/CleaningPipeline/sql_endpoint.py | 144 ++++++++++++++
 Scripts/Libs/Utils/dataframe_interaction.py   |   9 +
 9 files changed, 596 insertions(+)
 create mode 100644 Scripts/DataCleaning/data_output_models/bpe_corpus.py
 create mode 100644 Scripts/DataCleaning/data_output_models/rdf_completation_task.py
 create mode 100644 Scripts/DataCleaning/data_output_models/rdf_mask_task.py
 create mode 100644 Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
 create mode 100644 Scripts/DataCleaning/filter.py
 create mode 100644 Scripts/DataCleaning/pipeline.py
 create mode 100644 Scripts/Libs/CleaningPipeline/special_token.py
 create mode 100644 Scripts/Libs/CleaningPipeline/sql_endpoint.py
 create mode 100644 Scripts/Libs/Utils/dataframe_interaction.py

diff --git a/Scripts/DataCleaning/data_output_models/bpe_corpus.py b/Scripts/DataCleaning/data_output_models/bpe_corpus.py
new file mode 100644
index 0000000..a0348b6
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/bpe_corpus.py
@@ -0,0 +1,21 @@
+from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+import pandas as pd
+
+class BPE_corpus():
+
+    def __init__(self, output_path :str):
+        self.output_handler = open(output_path, "w")
+
+    def close(self):
+        # add corpus end before closing
+        self.output_handler.write(SpecialToken.CORPUS_END.value)
+        self.output_handler.close()
+        
+    def write_from_str(self, output: str):
+        if output == '':
+            return
+        self.output_handler.write(output)
+
+    def write_from_df(self, df: pd.DataFrame):
+        self.write_from_str(get_raw_from_dataframe(df))
\ No newline at end of file
diff --git a/Scripts/DataCleaning/data_output_models/rdf_completation_task.py b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
new file mode 100644
index 0000000..111b2b9
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_completation_task_dataset():
+    """
+        Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
+        Each RDF is saved as str
+        CSV Composition: ["MovieID","RDF"]
+    """
+    def __init__(self, output_path:str):
+     
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","RDF"]
+        """        
+
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file
diff --git a/Scripts/DataCleaning/data_output_models/rdf_mask_task.py b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
new file mode 100644
index 0000000..01b943d
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+# do not worry about circular dependencies, this class will never call something else
+from Scripts.DataCleaning.filter import PipelineApplier
+
+class RDF_mask_task_dataset():
+    """
+        Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
+        The CSV contains 3 rows for each RDF triple, each with a different one of its components missing.
+        CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
+    """
+    def __init__(self, output_path:str):
+     
+        # these methods are only used by this class, but they belong at a lower level
+        self._build_triple = PipelineApplier.build_triple
+        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","IncompleteRDF","Missing","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        rdf_complete = self._build_triple(RDF)
+
+        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
+        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
+        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
+        ####
+        df_subject = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_subject,
+            "Missing": RDF["SubjectURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_relationship = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_relationship,
+            "Missing": RDF["RelationshipURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_object = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_object,
+            "Missing": RDF["ObjectURI"],
+            "RDF": rdf_complete,
+        })
+
+
+        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
+        output_df.to_csv(self.output, index=False, header=False)
+
+
diff --git a/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
new file mode 100644
index 0000000..918e600
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_text_task_dataset():
+    """
+        Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
+        In the CSV the RDFs are saved together as a single string.
+        CSV Composition: ["MovieID","RDFs","Abstract"]
+    """
+    def __init__(self, output_path:str):
+     
+
+        self.output =  open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDFs","Abstract"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+        """        
+
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file
diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py
new file mode 100644
index 0000000..50d6ead
--- /dev/null
+++ b/Scripts/DataCleaning/filter.py
@@ -0,0 +1,184 @@
+# This file removes unwanted relationships from the pipeline according to different rules
+import pandas as pd
+import sqlite3
+import numpy as np
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+
+class PipelineApplier():
+
+    def __init__(self):
+
+        self.MOVIE_FILTER = pd.DataFrame()
+        self.REL_FILTER = pd.DataFrame()
+
+
+    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
+        return RDF[RDF["RelationshipURI"]!= uri]
+    
+    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
+        """Store RelationshipURI filters as a set """
+        self.relationship_filter_list: set[str] = set(filter_list)
+    
+    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Remove rows whose RelationshipURI is in the stored filter. Build the filter first by calling generate_list_relationship_filter()."""
+        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
+
+
+    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
+        """
+        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
+        since this method creates the filter (self.MOVIE_FILTER) that it reads.
+        Args:
+            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
+            min_treshold (int): inclusive lower bound, rows with Count >= min_treshold are kept
+            max_treshold (int): exclusive upper bound, rows with Count < max_treshold are kept
+        """        
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold]
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold]
+        self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]
+
+    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold]
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold]
+        self.REL_FILTER = REL_COUNT #["RelationshipURI"]
+
+    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
+        return RDF
+
+    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
+        return RDF
+
+    def rdf_add_special_token(self, RDF: pd.DataFrame):
+        """
+        Adds the RDF special token to each element of the triple, i.e. SUBJECT to SubjectURI, OBJECT to ObjectURI, RELATIONSHIP to RelationshipURI.
+        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
+        It only adds the special tokens of the three elements of the RDF, no other special token.
+        Args:
+            RDF (pd.DataFrame): must contain the columns "SubjectURI", "RelationshipURI", "ObjectURI"
+        Returns:
+            pd.DataFrame: a copy of RDF (same columns) with the three URI columns prefixed by their token
+        """        
+        # if an earlier filter sliced the RDF and returned a view, copying here avoids the problem
+        # for more context: SettingWithCopyWarning
+        RDF = RDF.copy()
+        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token  
+        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
+        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
+        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
+        return RDF
+
+
+    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        # dataset has SubjectURI RelationshipURI ObjectURI
+        #  want to drop the '' in them
+        # Replace empty strings with NaN
+        RDF = RDF.replace('', np.nan)
+        # Drop rows where any of the key columns are NaN
+        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
+        return RDF
+    
+    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Collapse the rows of RDF into one row per (MovieID, Abstract) whose "Triple" holds all the joined triples.
+
+        Args:
+            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """        
+        # this method is meant to be called on data iterated by movie_id,
+        # because by design we want one row for each movie at the end
+        # MovieID and abstract could be given as input for a more generic method
+        # movie_id = RDF["MovieID"].iloc(0)
+        # abstract = RDF["Abstract"].iloc(0)
+        # first combine each row, creating the column "Triple" as the join of the rdf (note: the column is added to the input RDF)
+        RDF["Triple"] =  RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # wrap each triple with the start/end of triple special tokens
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add the special tokens for the start of the triple list and the start of the abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] 
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Join the "Triple" values of each (MovieID, Abstract) group into a single row.
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add the special tokens for the start of the triple list and the start of the abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] 
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+
+    @staticmethod
+    def build_triple(RDF: pd.DataFrame):
+        """
+        Obtains the joined RDF triple in one element, together with the START and END special tokens.
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            pd.Series: RDF["Triple"] (just this column; it is also added to the input RDF)
+        """        
+        # combine each row, creating the column "Triple" as the join of the rdf
+        RDF["Triple"] =  RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # wrap each triple with the start/end of triple special tokens
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_incomplete_triple(RDF: pd.DataFrame):
+        """
+        Method helper used for the third task: "Predicting a masked component within an RDF triple". 
+        Obtains the joined RDF triple in one element, together with the START and END special tokens.
+        The MISSING element will be replaced by the special token <MASK>
+        Args:
+            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            RDF["Triple"]: pd.Series  (just this column, NOT A DATAFRAME)
+        """        
+        # create a new column "Triple" with the joined RDF
+
+        # the following creates a Series of MASK tokens as long as the dataframe (same index);
+        # RDF.get() falls back to it for whichever URI column is absent (author's note: more robust, but slow)
+        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)
+
+        RDF["Triple"] =  ( 
+                    RDF.get("SubjectURI", MISSING) + 
+                    RDF.get("RelationshipURI", MISSING) + 
+                    RDF.get("ObjectURI", MISSING))
+        # wrap each triple with the start/end of triple special tokens
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
+        # currently not used
+        """
+        Method helper used for the third task: "Predicting a masked component within an RDF triple". 
+        Given two DataFrames, the first containing the incomplete RDF and the other only the missing component,
+        this method is meant to apply the special tokens. NOT IMPLEMENTED YET: it currently returns None.
+        Args:
+            RDF_incomplete (pd.DataFrame): the incomplete RDF triples
+            MISSING (pd.DataFrame): only the missing component
+        Returns:
+            None: placeholder, despite the pd.DataFrame annotation
+        """  
+        # take an example dataframe as ["SubjectURI",""]    
+        # as input two dataframe, one with 2 column  
+        return None
+
diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
new file mode 100644
index 0000000..e07294b
--- /dev/null
+++ b/Scripts/DataCleaning/pipeline.py
@@ -0,0 +1,107 @@
+import re
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+from Scripts.DataCleaning.filter import PipelineApplier
+# tasks dataset builder
+from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
+from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
+from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
+from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
+
+import pandas as pd
+
+class Pipeline():
+    def __init__(self, output):
+        self.sql_endpoint = SqlEndpoint()
+        # classes to manage taskes' datasets
+        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv")
+        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt")
+        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
+        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
+
+        # prepare the filter
+        # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset
+        self.filter_applier = PipelineApplier()
+        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
+        REL_COUNT = self.sql_endpoint.get_relationship_count()
+        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
+        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
+        # prepare the filter ot the relationshipURI you want to delete:
+        relationship_uri_banned_list = [
+            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
+            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
+            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
+            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
+        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)
+
+
+    def _end_file_handler(self):
+        self.task_bpe_corpus.close()
+        self.task_rdf_mask.close()
+        self.task_rdf_text.close()
+        self.task_rdf_completation.close()
+
+    def _get_cleaned_movie_rows(self):
+        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
+            RDF = self.filter_applier.drop_na_from_dataset(RDF)
+            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
+            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
+            # other filter
+            #
+            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
+            if RDF.empty:
+                continue
+            RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
+            yield RDF
+
+    def execute_task_bpe_corpus(self):
+        for RDF in self._get_cleaned_movie_rows():
+            RDF = self.filter_applier.rebuild_by_movie(RDF)
+            RDF = RDF[["Triple","Abstract"]]
+            self.task_bpe_corpus.write_from_df(RDF)
+        self._end_file_handler()
+
+
+    def execute_task_rdf_mask(self):
+        for RDF in self._get_cleaned_movie_rows():
+            self.task_rdf_mask.write(RDF)
+        self._end_file_handler()
+
+    def execute_tasks_rdf_text(self):
+        for RDF in self._get_cleaned_movie_rows():
+            RDF = self.filter_applier.rebuild_by_movie(RDF)
+            self.task_rdf_text.write(RDF)
+        self._end_file_handler()
+
+    def execute_task_rdf_completation(self):
+        for RDF in self._get_cleaned_movie_rows():
+            RDF["Triple"] = self.filter_applier.build_triple(RDF)
+            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
+        self._end_file_handler()
+
+
+    def execute_all_task(self):
+        for RDF in self._get_cleaned_movie_rows():
+            self.task_rdf_mask.write(RDF)
+
+            RDF["Triple"] = self.filter_applier.build_triple(RDF)
+            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
+
+            RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])
+
+            self.task_rdf_text.write(RDF)
+            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
+
+        self._end_file_handler()
+        
+
+
+
+
+
+
+pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt")
+# pipeline.execute_task_bpe_corpus()
+# pipeline.execute_task_rdf_mask()
+# pipeline.execute_tasks_rdf_text()
+# pipeline.execute_task_rdf_completation()
+pipeline.execute_all_task()
\ No newline at end of file
diff --git a/Scripts/Libs/CleaningPipeline/special_token.py b/Scripts/Libs/CleaningPipeline/special_token.py
new file mode 100644
index 0000000..644ad71
--- /dev/null
+++ b/Scripts/Libs/CleaningPipeline/special_token.py
@@ -0,0 +1,21 @@
+from enum import Enum
+
+class SpecialToken(str, Enum):
+    # (Enum, str) -> throws an error
+    START_TRIPLE_LIST = "<SOTL>"
+    START_TRIPLE = "<SOT>"
+    END_TRIPLE = "<EOT>"
+    SUBJECT = "<SUBJ>"
+    RELATIONSHIP = "<PRED>"
+    OBJECT = "<OBJ>"
+    ABSTRACT = "<ABS>"
+    CORPUS_END = "<END>"
+
+    ## Tasks' Token
+    RDF_TO_TEXT = "<RDF2TXT>"
+    TEXT_TO_RDF = "<TEXT2RDF>"
+    CONTINUE_RDF = "<CONTINUERDF>"
+    MASK = "<MASK>"
+
+    #BPE Training:
+    
\ No newline at end of file
diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
new file mode 100644
index 0000000..4e43528
--- /dev/null
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -0,0 +1,144 @@
+#######################################################
+#   This file stands as endpoint to interact with DB  #
+#######################################################
+
+# import sqlite3
+import pandas as pd
+from sqlalchemy import create_engine
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+
+
+class SqlEndpoint():
+
+    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
+        # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
+        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
+        # /// 3 slash -> relative path
+        # //// 4 slash -> absolute
+        # self.conn = self.sql_engine.connect().execution_options(stream_results=True)
+        # it seems that sqlite doenst support streamer cursor
+        # PRAGMA exeutes better in writing not reading
+        self.chunk_size_row = chunk_size_row
+        pass
+
+    def get_RDF(self) -> pd.DataFrame :
+        
+        QUERY = """
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
+                FROM RDFs
+                INNER JOIN Subjects USING (SubjectID)
+                INNER JOIN Relationships USING (RelationshipID)
+                INNER JOIN Objects USING (ObjectID);
+                """
+        # self.CONN is never assigned (the sqlite3 connection is commented out in __init__), so query through the engine
+        return pd.read_sql_query(QUERY, self.sql_engine)
+    
+    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
+        """
+        Returns:
+            an iterator of pd.DataFrame chunks (chunksize=self.chunk_size_row), despite the annotation: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+        """        
+        
+        QUERY = """
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID);
+                """
+        
+        # return pd.read_sql_query(QUERY, self.CONN, chunksize=500)
+        # sqlite3
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
+
+    
+    def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
+        # DEPRECATED ! One token per "?" placeholder, in query order (SpecialToken() with no value raises TypeError)
+        start_tokens = (SpecialToken.SUBJECT.value, SpecialToken.RELATIONSHIP.value, SpecialToken.OBJECT.value)
+        QUERY = """
+                SELECT 
+                    MovieID, 
+                    ? || SubjectURI AS SubjectURI,
+                    ? || RelationshipURI AS RelationshipURI, 
+                    ? || ObjectURI AS ObjectURI, 
+                    Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID);
+                """
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row, params=start_tokens)
+    
+    def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
+        """
+        Gets each time a DataFrame per movie ( with all its rows in the dataset).
+        The retrieved RDFs are already abbrevieted by the sql parser
+        Yields:
+            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
+        """        
+        # chunk by movieId, abstract is the same and some intersting logic are appliable
+        movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
+        # CHOOSEN MOVIE:
+        # The Dark Knight   : 117248
+        # Inception         : 147074
+        # The Avengers      : 113621
+        # Cast Away         : 1123
+        # The Departed      : 117586
+        # American Psycho   : 90177
+        # Avatar            : 71587
+        # Django Unchained  : 138952
+        # Spirited Away     : 144137
+        # Knives Out        : 148025
+        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        movie_ids = movie_list
+
+        QUERY = """
+                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+                FROM RDFs
+                INNER JOIN ParsedSubjects USING (SubjectID)
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                INNER JOIN ParsedObjects USING (ObjectID)
+                INNER JOIN WikipediaAbstracts USING (MovieID)
+                WHERE MovieID = (?);
+                """        
+
+        for movie_id in movie_ids:
+            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
+
+    def get_movies_id_count(self) -> pd.DataFrame:
+        """
+        Gets the count of each Movie in the Dataset
+        Returns:
+            Pandas.DataFrame: [MovieID, Count]
+        """        
+        QUERY = """
+                SELECT MovieID, COUNT(*) AS Count
+                FROM RDFs
+                GROUP BY MovieID;
+                """        
+        return pd.read_sql_query(QUERY, self.sql_engine)
+    
+    def get_relationship_count(self) -> pd.DataFrame:
+        """
+        Gets the count of each Relationship in the Dataset
+        Returns:
+            Pandas.DataFrame: [RelationshipURI, Count]
+        """       
+        QUERY = """
+                SELECT RelationshipURI, COUNT(*) AS Count
+                FROM RDFs
+                INNER JOIN ParsedRelationships USING (RelationshipID)
+                GROUP BY RelationshipURI;
+                """        
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+
+
+if __name__ == "__main__" :
+    sql_endpoint = SqlEndpoint()
+    for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
+        print(pandas_row)
+    # sql_endpoint.get_RDF()
+    print("done")
\ No newline at end of file
diff --git a/Scripts/Libs/Utils/dataframe_interaction.py b/Scripts/Libs/Utils/dataframe_interaction.py
new file mode 100644
index 0000000..c4df33a
--- /dev/null
+++ b/Scripts/Libs/Utils/dataframe_interaction.py
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+
+def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
+    """Concatenate every cell of DF, row by row, into one string (no separators)."""
+    return "".join(
+        str(cell) for row in DF.itertuples(index=False, name=None) for cell in row
+    )

From 8167c9d435b15a4f189d57b6644a376fed2f2e2c Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Mon, 29 Sep 2025 16:03:49 +0200
Subject: [PATCH 6/9] Added Toy Dataset entry point into the Pipeline class
 Before it was forced into the sql_endpoint, now all the pipeline can be
 managed in the Pipeline class

---
 Scripts/DataCleaning/pipeline.py              | 78 ++++++++++++-------
 Scripts/Libs/CleaningPipeline/sql_endpoint.py | 12 +--
 2 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
index e07294b..eb5b2f7 100644
--- a/Scripts/DataCleaning/pipeline.py
+++ b/Scripts/DataCleaning/pipeline.py
@@ -10,22 +10,22 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co
 import pandas as pd
 
 class Pipeline():
-    def __init__(self, output):
+    def __init__(self):
         self.sql_endpoint = SqlEndpoint()
         # classes to manage taskes' datasets
-        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv")
-        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt")
+        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
+        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
         self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
         self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
 
         # prepare the filter
-        # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset
+        # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset
         self.filter_applier = PipelineApplier()
         MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
         REL_COUNT = self.sql_endpoint.get_relationship_count()
         self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
         self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
-        # prepare the filter ot the relationshipURI you want to delete:
+        # prepare the filter on the relationshipURI you want to delete:
         relationship_uri_banned_list = [
             "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
             "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
@@ -34,25 +34,6 @@ class Pipeline():
         self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)
 
 
-    def _end_file_handler(self):
-        self.task_bpe_corpus.close()
-        self.task_rdf_mask.close()
-        self.task_rdf_text.close()
-        self.task_rdf_completation.close()
-
-    def _get_cleaned_movie_rows(self):
-        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
-            RDF = self.filter_applier.drop_na_from_dataset(RDF)
-            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
-            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
-            # other filter
-            #
-            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
-            if RDF.empty:
-                continue
-            RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
-            yield RDF
-
     def execute_task_bpe_corpus(self):
         for RDF in self._get_cleaned_movie_rows():
             RDF = self.filter_applier.rebuild_by_movie(RDF)
@@ -66,12 +47,14 @@ class Pipeline():
             self.task_rdf_mask.write(RDF)
         self._end_file_handler()
 
+
     def execute_tasks_rdf_text(self):
         for RDF in self._get_cleaned_movie_rows():
             RDF = self.filter_applier.rebuild_by_movie(RDF)
             self.task_rdf_text.write(RDF)
         self._end_file_handler()
 
+
     def execute_task_rdf_completation(self):
         for RDF in self._get_cleaned_movie_rows():
             RDF["Triple"] = self.filter_applier.build_triple(RDF)
@@ -92,14 +75,55 @@ class Pipeline():
             self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
 
         self._end_file_handler()
-        
+
+
+    def _end_file_handler(self):
+        self.task_bpe_corpus.close()
+        self.task_rdf_mask.close()
+        self.task_rdf_text.close()
+        self.task_rdf_completation.close()
+
+
+    def _get_cleaned_movie_rows(self):
+        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
+            RDF = self.filter_applier.drop_na_from_dataset(RDF)
+            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
+            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
+            # other filter
+            #
+            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
+            if RDF.empty:
+                continue
+            RDF = self.filter_applier.rdf_add_special_token(RDF)  # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
+            yield RDF
+
+
+    def use_toy_dataset(self):
+        # CHOOSEN MOVIE:
+        # The Dark Knight   : 117248
+        # Inception         : 147074
+        # The Avengers      : 113621
+        # Cast Away         : 1123
+        # The Departed      : 117586
+        # American Psycho   : 90177
+        # Avatar            : 71587
+        # Django Unchained  : 138952
+        # Spirited Away     : 144137
+        # Knives Out        : 148025
+        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        self.sql_endpoint.movie_ids = movie_list
 
 
 
+# there are a lot of settings to manage;
+# you only need to change them in these places:
+# in __init__: the file paths, the frequency filter limits, the banned relationshipURIs
+# in use_toy_dataset: the movies of the toy dataset
+# in _get_cleaned_movie_rows: how the pipeline behaves
 
+pipeline = Pipeline()
 
-
-pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt")
+# pipeline.use_toy_dataset()
 # pipeline.execute_task_bpe_corpus()
 # pipeline.execute_task_rdf_mask()
 # pipeline.execute_tasks_rdf_text()
diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
index 4e43528..66ba1ea 100644
--- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -18,8 +18,8 @@ class SqlEndpoint():
         # self.conn = self.sql_engine.connect().execution_options(stream_results=True)
         # it seems that sqlite doenst support streamer cursor
         # PRAGMA exeutes better in writing not reading
-        self.chunk_size_row = chunk_size_row
-        pass
+        self.chunk_size_row = chunk_size_row                    # not used now, since each chunk is a movie
+        self.movie_ids = movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
 
     def get_RDF(self) -> pd.DataFrame :
         
@@ -79,7 +79,7 @@ class SqlEndpoint():
             Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
         """        
         # chunk by movieId, abstract is the same and some intersting logic are appliable
-        movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
+        # movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
         # CHOOSEN MOVIE:
         # The Dark Knight   : 117248
         # Inception         : 147074
@@ -91,8 +91,8 @@ class SqlEndpoint():
         # Django Unchained  : 138952
         # Spirited Away     : 144137
         # Knives Out        : 148025
-        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
-        movie_ids = movie_list
+        # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        # movie_ids = movie_list
 
         QUERY = """
                 SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
@@ -104,7 +104,7 @@ class SqlEndpoint():
                 WHERE MovieID = (?);
                 """        
 
-        for movie_id in movie_ids:
+        for movie_id in self.movie_ids:
             yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
 
     def get_movies_id_count(self) -> pd.DataFrame:

From 255d8a072d8e95920bbb723c4536f454a741ab02 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Mon, 29 Sep 2025 16:59:52 +0200
Subject: [PATCH 7/9] First implementation of the cleaning pipeline UML

---
 .../cleaning-pipeline.excalidraw.json         | 634 ++++++++++++++++++
 1 file changed, 634 insertions(+)
 create mode 100644 Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json

diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
new file mode 100644
index 0000000..1249185
--- /dev/null
+++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
@@ -0,0 +1,634 @@
+{
+  "type": "excalidraw",
+  "version": 2,
+  "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
+  "elements": [
+    {
+      "id": "JNB9z-PeqZ4s8KDfWaoXe",
+      "type": "rectangle",
+      "x": 106,
+      "y": 27,
+      "width": 653,
+      "height": 263,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a2",
+      "roundness": {
+        "type": 3
+      },
+      "seed": 710740889,
+      "version": 326,
+      "versionNonce": 1107631703,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759156408059,
+      "link": null,
+      "locked": false
+    },
+    {
+      "id": "e13wNTgUpn2flMpmMttqx",
+      "type": "text",
+      "x": 200.5943407656526,
+      "y": 44.07937975075269,
+      "width": 307.2781467269385,
+      "height": 23.3097531902191,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a3",
+      "roundness": null,
+      "seed": 1012740663,
+      "version": 444,
+      "versionNonce": 589551257,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759156408059,
+      "link": null,
+      "locked": false,
+      "text": "Libs/CleaningPipeline/sql_endpoint",
+      "fontSize": 18.64780255217528,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Libs/CleaningPipeline/sql_endpoint",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "CgxCElJkKBtIHv-5WQrbo",
+      "type": "text",
+      "x": 195,
+      "y": 80.44259472749451,
+      "width": 403.64997665852184,
+      "height": 186.4780255217528,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "a4",
+      "roundness": null,
+      "seed": 1261951799,
+      "version": 507,
+      "versionNonce": 1922906999,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759156408059,
+      "link": null,
+      "locked": false,
+      "text": "Class SqlEndpoint:\n    - sql_engine\n    + movie_ids: list[int]\n\n    #\n    + get_abbreviated_dataset_by_movie_id\n\n",
+      "fontSize": 18.64780255217528,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Class SqlEndpoint:\n    - sql_engine\n    + movie_ids: list[int]\n\n    #\n    + get_abbreviated_dataset_by_movie_id\n\n",
+      "autoResize": true,
+      "lineHeight": 1.25
+    },
+    {
+      "type": "line",
+      "version": 4978,
+      "versionNonce": 2079525497,
+      "isDeleted": false,
+      "id": "sYReMTdYblr-oJtYYJALU",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -68.05426555317842,
+      "y": 87.19293561900287,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.09201683999922,
+      "height": 99.49948667804088,
+      "seed": 1263944119,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          0.2542098813493443,
+          75.20117273657175
+        ],
+        [
+          0.011896425679918422,
+          83.76249969444815
+        ],
+        [
+          3.970409367559332,
+          87.46174320643391
+        ],
+        [
+          17.75573317066317,
+          90.59250103325854
+        ],
+        [
+          41.05683533152865,
+          91.56737225214069
+        ],
+        [
+          63.319497586673116,
+          90.01084754868091
+        ],
+        [
+          75.14781395923075,
+          86.28844687220405
+        ],
+        [
+          76.81603792670788,
+          83.15042405259751
+        ],
+        [
+          77.05033394391478,
+          76.25776215104557
+        ],
+        [
+          76.86643881413028,
+          6.3089586511537865
+        ],
+        [
+          76.45188016352971,
+          -0.2999144698665015
+        ],
+        [
+          71.50179495549581,
+          -3.9936571317850627
+        ],
+        [
+          61.077971898861186,
+          -6.132877429442784
+        ],
+        [
+          37.32348754161154,
+          -7.932114425900202
+        ],
+        [
+          18.278415656797975,
+          -6.859225353587373
+        ],
+        [
+          3.2995959613238286,
+          -3.2201165291205287
+        ],
+        [
+          -0.04168289608444441,
+          -0.045185660461322996
+        ],
+        [
+          0,
+          0
+        ]
+      ],
+      "index": "a6",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2683,
+      "versionNonce": 33379161,
+      "isDeleted": false,
+      "id": "0S6dEWQVqKUVkP6Z5IX1l",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -67.53033611490343,
+      "y": 144.31921927673278,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 817033943,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a7",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "line",
+      "version": 2769,
+      "versionNonce": 1703641145,
+      "isDeleted": false,
+      "id": "szGLND7J0nVOvRkNXX9AS",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -68.56219343740725,
+      "y": 115.35516394150972,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 77.17198221193564,
+      "height": 8.562348957853036,
+      "seed": 1704755191,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "round",
+      "boundElementIds": [],
+      "startBinding": null,
+      "endBinding": null,
+      "lastCommittedPoint": null,
+      "startArrowhead": null,
+      "endArrowhead": null,
+      "points": [
+        [
+          0,
+          0
+        ],
+        [
+          2.033150371639873,
+          3.413095389435587
+        ],
+        [
+          10.801287372573954,
+          6.276651055277943
+        ],
+        [
+          22.468666942209353,
+          8.010803051612635
+        ],
+        [
+          40.747074201802775,
+          8.168828515515864
+        ],
+        [
+          62.077348233027564,
+          7.0647721921469495
+        ],
+        [
+          74.53446931782398,
+          3.04824021069218
+        ],
+        [
+          77.17198221193564,
+          -0.3935204423371723
+        ]
+      ],
+      "index": "a8",
+      "frameId": null,
+      "roundness": {
+        "type": 2
+      },
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 5766,
+      "versionNonce": 344002841,
+      "isDeleted": false,
+      "id": "O3t2uGktJlDd1_OX_bpV4",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -69.6201424194893,
+      "y": 80.06066699332126,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 76.59753601865496,
+      "height": 15.49127539284798,
+      "seed": 471296279,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [
+        "bxuMGTzXLn7H-uBCptINx"
+      ],
+      "index": "a9",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1176,
+      "versionNonce": 1951499769,
+      "isDeleted": false,
+      "id": "_SzKlOBOvJgBg7FX0JTTM",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -33.12815531426679,
+      "y": 104.53733467322485,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1368927799,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "aA",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1464,
+      "versionNonce": 1879072473,
+      "isDeleted": false,
+      "id": "oJMl2Kxa3SPaiAY0kxo7A",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -32.77701353033319,
+      "y": 130.75394896028996,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1627606871,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "aB",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "ellipse",
+      "version": 1347,
+      "versionNonce": 1176574905,
+      "isDeleted": false,
+      "id": "fB6pJBSMA-pRHrpgYKaLL",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 6.239590202363168,
+      "x": -32.12815531426679,
+      "y": 159.52267553159635,
+      "strokeColor": "#000000",
+      "backgroundColor": "#228be6",
+      "width": 11.226103154161754,
+      "height": 12.183758484455605,
+      "seed": 1420643447,
+      "groupIds": [
+        "9YkNe1yqnfZy9Z1JX2xr4",
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "index": "aC",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false
+    },
+    {
+      "type": "text",
+      "version": 845,
+      "versionNonce": 383204505,
+      "isDeleted": false,
+      "id": "9gZ3Yy1MeP9kEOTLODqLG",
+      "fillStyle": "solid",
+      "strokeWidth": 1,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "angle": 0,
+      "x": -77.72012292771115,
+      "y": 181.11281713043917,
+      "strokeColor": "#000000",
+      "backgroundColor": "#a5d8ff",
+      "width": 95.63072204589844,
+      "height": 23.595161071904883,
+      "seed": 2019206551,
+      "groupIds": [
+        "BDBCTrrhjbJynRAyuf3xJ"
+      ],
+      "strokeSharpness": "sharp",
+      "boundElementIds": [],
+      "fontSize": 17.4778970902999,
+      "fontFamily": 1,
+      "text": "dataset.db",
+      "baseline": 16.595161071904883,
+      "textAlign": "center",
+      "verticalAlign": "top",
+      "index": "aD",
+      "frameId": null,
+      "roundness": null,
+      "boundElements": [],
+      "updated": 1759157176189,
+      "link": null,
+      "locked": false,
+      "containerId": null,
+      "originalText": "dataset.db",
+      "autoResize": true,
+      "lineHeight": 1.350000000000001
+    },
+    {
+      "id": "3eOw20xMhpB5jf_RMG24P",
+      "type": "text",
+      "x": 1131.3333333333335,
+      "y": 31.333333333333428,
+      "width": 508.3333333333333,
+      "height": 550,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aE",
+      "roundness": null,
+      "seed": 1535658041,
+      "version": 821,
+      "versionNonce": 1630266809,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759157181677,
+      "link": null,
+      "locked": false,
+      "text": "Class PipelineApplier\n    - movie_frequence_filter : pd.DataFrame()\n    - rel_Frequence_Filter : pd.DataFrame()\n    - rel_banned_list: list[str]\n\n    + generate_movie_frequency_filter()\n    + generate_rel_frequency_filter()\n    + generate_list_relationship_filter()\n    \n    + filter_by_movie_frequency()\n    + filter_by_relationship_frequency()\n    + delete_relationship_by_list_filter()\n    + delete_relationship_by_str()\n\n    + drop_na()    \n\n    + rdf_add_special_token()\n    + group_triple_by_movie()\n    + build_by_movie()\n    # static\n    + build_triple()\n    + build_incomplete_triple()",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Class PipelineApplier\n    - movie_frequence_filter : pd.DataFrame()\n    - rel_Frequence_Filter : pd.DataFrame()\n    - rel_banned_list: list[str]\n\n    + generate_movie_frequency_filter()\n    + generate_rel_frequency_filter()\n    + generate_list_relationship_filter()\n    \n    + filter_by_movie_frequency()\n    + filter_by_relationship_frequency()\n    + delete_relationship_by_list_filter()\n    + delete_relationship_by_str()\n\n    + drop_na()    \n\n    + rdf_add_special_token()\n    + group_triple_by_movie()\n    + build_by_movie()\n    # static\n    + build_triple()\n    + build_incomplete_triple()",
+      "autoResize": false,
+      "lineHeight": 1.25
+    },
+    {
+      "id": "Fbl1gpb5r7QrdRauGUWm2",
+      "type": "text",
+      "x": 158.23809523809535,
+      "y": 502.52380952380935,
+      "width": 484.2857142857143,
+      "height": 475,
+      "angle": 0,
+      "strokeColor": "#1e1e1e",
+      "backgroundColor": "transparent",
+      "fillStyle": "solid",
+      "strokeWidth": 2,
+      "strokeStyle": "solid",
+      "roughness": 1,
+      "opacity": 100,
+      "groupIds": [],
+      "frameId": null,
+      "index": "aF",
+      "roundness": null,
+      "seed": 2066618807,
+      "version": 541,
+      "versionNonce": 7392153,
+      "isDeleted": false,
+      "boundElements": null,
+      "updated": 1759157954202,
+      "link": null,
+      "locked": false,
+      "text": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
+      "fontSize": 20,
+      "fontFamily": 5,
+      "textAlign": "left",
+      "verticalAlign": "top",
+      "containerId": null,
+      "originalText": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
+      "autoResize": false,
+      "lineHeight": 1.25
+    }
+  ],
+  "appState": {
+    "gridSize": 20,
+    "gridStep": 5,
+    "gridModeEnabled": false,
+    "viewBackgroundColor": "#ffffff"
+  },
+  "files": {}
+}
\ No newline at end of file

From c319398ca01f10f5a2099219146649390cfec4a9 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Mon, 29 Sep 2025 17:03:31 +0200
Subject: [PATCH 8/9] little update to UML pipeline

---
 .../cleaning-pipeline.excalidraw.json                | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
index 1249185..a3b4660 100644
--- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
@@ -592,7 +592,7 @@
       "x": 158.23809523809535,
       "y": 502.52380952380935,
       "width": 484.2857142857143,
-      "height": 475,
+      "height": 500,
       "angle": 0,
       "strokeColor": "#1e1e1e",
       "backgroundColor": "transparent",
@@ -606,20 +606,20 @@
       "index": "aF",
       "roundness": null,
       "seed": 2066618807,
-      "version": 541,
-      "versionNonce": 7392153,
+      "version": 552,
+      "versionNonce": 1269344823,
       "isDeleted": false,
       "boundElements": null,
-      "updated": 1759157954202,
+      "updated": 1759158199532,
       "link": null,
       "locked": false,
-      "text": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
+      "text": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n    #\n    - get_cleaned_movie_rows()\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
       "fontSize": 20,
       "fontFamily": 5,
       "textAlign": "left",
       "verticalAlign": "top",
       "containerId": null,
-      "originalText": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
+      "originalText": "Class Pipeline\n    - sql_endpoint: SqlEndpoint()\n\n    - task_rdf_mask_file_handler:\n    - task_bpe_corpus_file_handler:\n    - task_rdf_text_file_handler:\n    - task_rdf_completation_file_handler:\n\n    - Filter_applier : PipelineApplier()\n\n    #\n    - get_cleaned_movie_rows()\n    \n    + execute_task_bpe_corpus()\n    + execute_task_rdf_mask()\n    + execute_task_rdf_text()\n    + execute_task_rdf_completation()\n    + execute_all_task()\n\n    + use_toy_dataset()",
       "autoResize": false,
       "lineHeight": 1.25
     }

From 007f1e955405ba466ab68ac0c7da656c3edca905 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe <g.gassi@studenti.poliba.it>
Date: Mon, 29 Sep 2025 18:53:33 +0200
Subject: [PATCH 9/9] minor updates

---
 .vscode/settings.json                         | 23 ++++++-
 .../cleaning-pipeline.excalidraw.json         | 64 +++++++++----------
 2 files changed, 54 insertions(+), 33 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index 17ae78b..226939d 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,24 @@
 {
-  "jupyter.notebookFileRoot": "${workspaceFolder}"
+  // Always treat the project root as the working dir for Jupyter
+  "jupyter.notebookFileRoot": "${workspaceFolder}",
+
+  // When you click "Run Python File in Terminal", DON'T cd into the file's folder
+  "python.terminal.executeInFileDir": false,
+
+  // Start new integrated terminals at the project root
+  "terminal.integrated.cwd": "${workspaceFolder}",
+
+  // Ensure Python can import from the project root no matter which file you run
+  // (the root itself goes on sys.path, so `from src...` imports resolve). Linux shown here; add osx/windows if needed.
+  "terminal.integrated.env.linux": {
+    "PYTHONPATH": "${workspaceFolder}"
+  },
+
+  // Make pytest run from the root without needing a pytest.ini
+  "python.testing.pytestEnabled": true,
+  "python.testing.cwd": "${workspaceFolder}",
+  "python.testing.pytestArgs": ["src/test"],
+
+  // Help Pylance resolve imports like `from src...` without red squiggles
+  "python.analysis.extraPaths": ["${workspaceFolder}"]
 }
\ No newline at end of file
diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
index a3b4660..c7019f5 100644
--- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
+++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
@@ -109,8 +109,8 @@
     },
     {
       "type": "line",
-      "version": 4978,
-      "versionNonce": 2079525497,
+      "version": 4979,
+      "versionNonce": 1473849177,
       "isDeleted": false,
       "id": "sYReMTdYblr-oJtYYJALU",
       "fillStyle": "solid",
@@ -119,7 +119,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -68.05426555317842,
+      "x": -67.14432426259049,
       "y": 87.19293561900287,
       "strokeColor": "#000000",
       "backgroundColor": "#a5d8ff",
@@ -221,14 +221,14 @@
         "type": 2
       },
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "line",
-      "version": 2683,
-      "versionNonce": 33379161,
+      "version": 2684,
+      "versionNonce": 952947769,
       "isDeleted": false,
       "id": "0S6dEWQVqKUVkP6Z5IX1l",
       "fillStyle": "solid",
@@ -237,7 +237,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -67.53033611490343,
+      "x": -66.6203948243155,
       "y": 144.31921927673278,
       "strokeColor": "#000000",
       "backgroundColor": "#a5d8ff",
@@ -295,14 +295,14 @@
         "type": 2
       },
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "line",
-      "version": 2769,
-      "versionNonce": 1703641145,
+      "version": 2770,
+      "versionNonce": 477619481,
       "isDeleted": false,
       "id": "szGLND7J0nVOvRkNXX9AS",
       "fillStyle": "solid",
@@ -311,7 +311,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -68.56219343740725,
+      "x": -67.65225214681931,
       "y": 115.35516394150972,
       "strokeColor": "#000000",
       "backgroundColor": "#a5d8ff",
@@ -369,14 +369,14 @@
         "type": 2
       },
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "ellipse",
-      "version": 5766,
-      "versionNonce": 344002841,
+      "version": 5767,
+      "versionNonce": 2119031289,
       "isDeleted": false,
       "id": "O3t2uGktJlDd1_OX_bpV4",
       "fillStyle": "solid",
@@ -385,7 +385,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -69.6201424194893,
+      "x": -68.71020112890136,
       "y": 80.06066699332126,
       "strokeColor": "#000000",
       "backgroundColor": "#a5d8ff",
@@ -404,14 +404,14 @@
       "frameId": null,
       "roundness": null,
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "ellipse",
-      "version": 1176,
-      "versionNonce": 1951499769,
+      "version": 1177,
+      "versionNonce": 525480665,
       "isDeleted": false,
       "id": "_SzKlOBOvJgBg7FX0JTTM",
       "fillStyle": "solid",
@@ -420,7 +420,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -33.12815531426679,
+      "x": -32.218214023678854,
       "y": 104.53733467322485,
       "strokeColor": "#000000",
       "backgroundColor": "#228be6",
@@ -437,14 +437,14 @@
       "frameId": null,
       "roundness": null,
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "ellipse",
-      "version": 1464,
-      "versionNonce": 1879072473,
+      "version": 1465,
+      "versionNonce": 1410887609,
       "isDeleted": false,
       "id": "oJMl2Kxa3SPaiAY0kxo7A",
       "fillStyle": "solid",
@@ -453,7 +453,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -32.77701353033319,
+      "x": -31.867072239745255,
       "y": 130.75394896028996,
       "strokeColor": "#000000",
       "backgroundColor": "#228be6",
@@ -470,14 +470,14 @@
       "frameId": null,
       "roundness": null,
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "ellipse",
-      "version": 1347,
-      "versionNonce": 1176574905,
+      "version": 1348,
+      "versionNonce": 314839193,
       "isDeleted": false,
       "id": "fB6pJBSMA-pRHrpgYKaLL",
       "fillStyle": "solid",
@@ -486,7 +486,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 6.239590202363168,
-      "x": -32.12815531426679,
+      "x": -31.218214023678854,
       "y": 159.52267553159635,
       "strokeColor": "#000000",
       "backgroundColor": "#228be6",
@@ -503,14 +503,14 @@
       "frameId": null,
       "roundness": null,
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false
     },
     {
       "type": "text",
-      "version": 845,
-      "versionNonce": 383204505,
+      "version": 846,
+      "versionNonce": 1091081593,
       "isDeleted": false,
       "id": "9gZ3Yy1MeP9kEOTLODqLG",
       "fillStyle": "solid",
@@ -519,7 +519,7 @@
       "roughness": 1,
       "opacity": 100,
       "angle": 0,
-      "x": -77.72012292771115,
+      "x": -76.81018163712321,
       "y": 181.11281713043917,
       "strokeColor": "#000000",
       "backgroundColor": "#a5d8ff",
@@ -541,7 +541,7 @@
       "frameId": null,
       "roundness": null,
       "boundElements": [],
-      "updated": 1759157176189,
+      "updated": 1759158252997,
       "link": null,
       "locked": false,
       "containerId": null,