From 0a698e9837367de4e42d5b7506ed2a84b4e8f440 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:09:52 +0200 Subject: [PATCH 01/75] Added schema to extract from DB for BPE --- .../bpe-pipeline.excalidraw.json | 897 ++++++++++++++++++ 1 file changed, 897 insertions(+) create mode 100644 Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json diff --git a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json new file mode 100644 index 0000000..0edf3cf --- /dev/null +++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json @@ -0,0 +1,897 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "3zbCui3XtIGozHXTVAGRp", + "type": "rectangle", + "x": 316.5, + "y": 123, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1698427950, + "version": 35, + "versionNonce": 601575602, + "isDeleted": false, + "boundElements": [ + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + } + ], + "updated": 1758818588814, + "link": null, + "locked": false + }, + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text", + "x": 480.98004150390625, + "y": 183.25, + "width": 107.5399169921875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 910769774, + "version": 31, + "versionNonce": 1120989938, + "isDeleted": false, + "boundElements": null, + "updated": 1758818416720, + "link": null, + "locked": false, + "text": "dataset.db", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3zbCui3XtIGozHXTVAGRp", + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "87-MeaiZGT1wln0nggYPZ", + "type": "rectangle", + "x": 339.5, + "y": 309.5, + "width": 392, + "height": 156, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 655550318, + "version": 77, + "versionNonce": 1103939826, + "isDeleted": false, + "boundElements": null, + "updated": 1758818339000, + "link": null, + "locked": false + }, + { + "id": "EjUxEhZqEBzwvlw0VE9eJ", + "type": "rectangle", + "x": 355.5, + "y": 327, + "width": 162, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": { + "type": 3 + }, + "seed": 1739846638, + "version": 64, + "versionNonce": 1594290034, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "ogRkV0neHrhEKTE6zlggl" + } + ], + "updated": 1758818391415, + "link": null, + "locked": false + 
}, + { + "id": "ogRkV0neHrhEKTE6zlggl", + "type": "text", + "x": 378.7100524902344, + "y": 377.25, + "width": 115.57989501953125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 2037675630, + "version": 12, + "versionNonce": 1286472046, + "isDeleted": false, + "boundElements": null, + "updated": 1758818399222, + "link": null, + "locked": false, + "text": "RDF_String", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "EjUxEhZqEBzwvlw0VE9eJ", + "originalText": "RDF_String", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hoIRMNiMJZl4YDo-hovWy", + "type": "rectangle", + "x": 542.5, + "y": 327, + "width": 173, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 1189796530, + "version": 99, + "versionNonce": 1071057006, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "rsapATFAT5YSBCXzLupgZ" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "rsapATFAT5YSBCXzLupgZ", + "type": "text", + "x": 585.6800384521484, + "y": 377.25, + "width": 86.63992309570312, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 829619694, + "version": 12, + "versionNonce": 713902318, + "isDeleted": false, + "boundElements": null, + "updated": 1758818405150, + "link": null, + "locked": false, + "text": "Abstract", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "hoIRMNiMJZl4YDo-hovWy", + "originalText": "Abstract", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "jSx8ApfhtRs_nk37VvDMb", + "type": "rectangle", + "x": 316.5, + "y": 511, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 492582894, + "version": 132, + "versionNonce": 893797614, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "6E23g-rgowNqHsBxX-LuM" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "6E23g-rgowNqHsBxX-LuM", + "type": "text", + "x": 499.9100341796875, + "y": 571.25, + "width": 69.679931640625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 267696178, + "version": 132, + "versionNonce": 1668243186, + "isDeleted": false, + "boundElements": null, + "updated": 1758818543211, + "link": null, + "locked": false, + "text": "Pandas", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "jSx8ApfhtRs_nk37VvDMb", + "originalText": "Pandas", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "ohj18N4AOTDz5lJNcV9gi", + "type": "rectangle", + "x": 261, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1446207150, + "version": 279, + "versionNonce": 317375026, + "isDeleted": false, + "boundElements": [ + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text", + "x": 297.0800323486328, + "y": 796.5, + "width": 84.83993530273438, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a9", + "roundness": null, + "seed": 435116270, + "version": 199, + "versionNonce": 1282911218, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "train.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ohj18N4AOTDz5lJNcV9gi", + "originalText": "train.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "A4Y54Y26fe257U_QU9lxX", + "type": "rectangle", + "x": 464, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": { + "type": 3 + }, + "seed": 186148850, + "version": 232, + "versionNonce": 997119858, + "isDeleted": false, + "boundElements": [ + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text", + "x": 476.3500442504883, + "y": 796.5, + "width": 132.29991149902344, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": null, + "seed": 1131059634, + "version": 171, + "versionNonce": 239540530, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "validation.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "A4Y54Y26fe257U_QU9lxX", + "originalText": "validation.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": 
"mPaYpJ9Xn7tlJPmKPqJKJ", + "type": "rectangle", + "x": 674.5, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": { + "type": 3 + }, + "seed": 1049323314, + "version": 235, + "versionNonce": 330560690, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "kg9nm2rpud6cax5aNPSnu" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "kg9nm2rpud6cax5aNPSnu", + "type": "text", + "x": 711.4300231933594, + "y": 796.5, + "width": 83.13995361328125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": null, + "seed": 522572142, + "version": 193, + "versionNonce": 1920372338, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "test.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "originalText": "test.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 195.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": null, + "seed": 873266098, + "version": 71, + "versionNonce": 541154738, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + -195.25, + 49.5 + ], + [ + -195.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "ohj18N4AOTDz5lJNcV9gi", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 218.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 1210817582, + "version": 77, + "versionNonce": 1483392370, + "isDeleted": false, + "boundElements": null, + "updated": 1758818580594, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + 218.25, + 49.5 + ], + [ + 218.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": 
"mPaYpJ9Xn7tlJPmKPqJKJ", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 0.5719232650604908, + "height": 99.07394122590165, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aK", + "roundness": null, + "seed": 1205316658, + "version": 96, + "versionNonce": 1748050674, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -0.5719232650604908, + 99.07394122590165 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "A4Y54Y26fe257U_QU9lxX", + "fixedPoint": [ + 0.44635717665566554, + -0.056621365219521276 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow", + "x": 539, + "y": 271.5, + "width": 0, + "height": 33.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": null, + "seed": 763990258, + "version": 17, + "versionNonce": 1028811378, + "isDeleted": false, + "boundElements": null, + "updated": 1758818588814, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 33.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "3zbCui3XtIGozHXTVAGRp", + "focus": -0.019473081328751418, + "gap": 3 + }, + "endBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": -1.0404624277456647, + "gap": 30.7545797799829 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow", + "x": 536.5, + "y": 468.5, + "width": 0, + "height": 39, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1489771054, + "version": 33, + "versionNonce": 1828178606, + "isDeleted": false, + "boundElements": null, + "updated": 1758818593647, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 39 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": 1.0693641618497107, + "gap": 27.157190169432425 + }, + "endBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "focus": 0.008018327605956525, + "gap": 3.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From 
ee0aa583d53d0e23daf60754586058be1ecf6c1d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:10:45 +0200 Subject: [PATCH 02/75] Added Docs for BPE research --- docs/BPE.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 docs/BPE.md diff --git a/docs/BPE.md b/docs/BPE.md new file mode 100644 index 0000000..02dca0a --- /dev/null +++ b/docs/BPE.md @@ -0,0 +1,21 @@ +# BPE + +## Research Material + +- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding) +- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5) +- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/) +- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0) +- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples) +- [Implementing a byte pair encoding (BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html) +- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671) +- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf) +- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720) +- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf) +- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837) +- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571) +- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633) +- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796) +- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343) + + From e521b0704e1941ede504f58a615d8a20fa77461b Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Thu, 25 Sep 2025 19:19:11 +0200 Subject: [PATCH 03/75] deleted TODO in path_splitter_tree, as it was already resolved --- Scripts/DataCleaning/path_splitter_tree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py index e7f6f9e..9c0914a 100644 --- a/Scripts/DataCleaning/path_splitter_tree.py +++ b/Scripts/DataCleaning/path_splitter_tree.py @@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str): FILE = open(file, "r", encoding="utf-8") - # TODO: Change here so it takes single URI from a CSV file # The CSV header name is required for row in csv.DictReader(FILE): From 1bbb4a0999ef289d7f17cb1231f12e95576eaae6 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:17:48 +0200 Subject: [PATCH 04/75] Added new paper --- docs/BPE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/BPE.md b/docs/BPE.md index 02dca0a..eee3bac 100644 --- a/docs/BPE.md +++ b/docs/BPE.md @@ -17,5 +17,6 @@
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633) +- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796) +- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343) +- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2) + + From 90012285b5473c078c5fa28457054a7954ac167a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:18:21 +0200 Subject: [PATCH 05/75] UML Diagram to explain bpe workflows --- Projec-Model/UML/bpe.excalidraw.json | 362 +++++++ .../bpe-pipeline.excalidraw.json | 897 ++++++++++++++++++ 2 files changed, 1259 insertions(+) create mode 100644 Projec-Model/UML/bpe.excalidraw.json create mode 100644 Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json diff --git a/Projec-Model/UML/bpe.excalidraw.json b/Projec-Model/UML/bpe.excalidraw.json new file mode 100644 index 0000000..1400c25 --- /dev/null +++ b/Projec-Model/UML/bpe.excalidraw.json @@ -0,0 +1,362 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "EcT-dGsjmfW571ov8Gg4F", + "type": "text", + "x": 425.5, + "y": 130, + "width": 506, + "height": 550, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 523521109, + "version": 758, + "versionNonce": 383976373, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758823931674, + "link": null, + "locked": false, + "text": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "74i4oK-JpcM4CgAqhz_x_", + "type": "rectangle", + "x": 382.5, + "y": 104, + "width": 592.5, + "height": 555, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 50827893, + "version": 212, + "versionNonce": 692313525, + "isDeleted": false, + "boundElements": null, +
"updated": 1758822941942, + "link": null, + "locked": false + }, + { + "id": "s8I1JoKulE3Vnti9a374p", + "type": "text", + "x": 1113, + "y": 128, + "width": 440, + "height": 250, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 2091174261, + "version": 442, + "versionNonce": 1108352309, + "isDeleted": false, + "boundElements": null, + "updated": 1758822765308, + "link": null, + "locked": false, + "text": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "BY_Why7XDNftdMzPcwjVZ", + "type": "rectangle", + "x": 1086.5, + "y": 104, + "width": 504.5, + "height": 260.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 153939611, + "version": 153, + "versionNonce": 1903356469, + "isDeleted": false, + "boundElements": [ + { + "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow" + } + ], + "updated": 1758822805382, + "link": null, + "locked": false + }, + { + "id": "JCPDhuTKRx4MN950Q3jL-", + "type": "text", + "x": 1116.411067193676, + "y": 535.1519268774704, + "width": 427.72826086956525, + "height": 99.70355731225297, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 1326854235, + "version": 345, + "versionNonce": 592556603, + "isDeleted": false, + "boundElements": null, + "updated": 1758822845014, + "link": null, + "locked": false, + "text": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", + "fontSize": 19.940711462450594, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "l-O0rMS3SruV22_MPX9Jz", + "type": "rectangle", + "x": 1086.5, + "y": 509.22900197628456, + "width": 504.49999999999994, + "height": 154.04199604743084, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1490898171, + "version": 186, + "versionNonce": 1953870555, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758822845014, + "link": null, + "locked": false + }, + { 
+ "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow", + "x": 773.5, + "y": 167, + "width": 298.5, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": { + "type": 2 + }, + "seed": 1681364149, + "version": 205, + "versionNonce": 1154753851, + "isDeleted": false, + "boundElements": [], + "updated": 1758823291274, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 144.5, + -1.5 + ], + [ + 177.5, + -30 + ], + [ + 298.5, + -29.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": { + "elementId": "BY_Why7XDNftdMzPcwjVZ", + "focus": 0.7285094931977862, + "gap": 14.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow", + "x": 941, + "y": 440.7646462573778, + "width": 132.9833600541258, + "height": 105.33206183359624, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": { + "type": 2 + }, + "seed": 1871768059, + "version": 402, + "versionNonce": 462603541, + "isDeleted": false, + "boundElements": [], + "updated": 1758823931675, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 53, + 8.23535374262218 + ], + [ + 63, + 97.73535374262218 + ], + [ + 132.9833600541258, + 105.33206183359624 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "EcT-dGsjmfW571ov8Gg4F", + "focus": -0.01598303536344995, + "gap": 9.500000000000114 + }, + "endBinding": { + "elementId": "l-O0rMS3SruV22_MPX9Jz", + "focus": 0.10931526948750278, + "gap": 13.22003639101672 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json new file mode 100644 index 0000000..0edf3cf --- /dev/null +++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json @@ -0,0 +1,897 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "3zbCui3XtIGozHXTVAGRp", + "type": "rectangle", + "x": 316.5, + "y": 123, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1698427950, + "version": 35, + "versionNonce": 601575602, + "isDeleted": false, + "boundElements": [ + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + } + ], + "updated": 1758818588814, + "link": null, + "locked": false + }, + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text", + "x": 480.98004150390625, + "y": 183.25, + "width": 107.5399169921875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": 
"solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 910769774, + "version": 31, + "versionNonce": 1120989938, + "isDeleted": false, + "boundElements": null, + "updated": 1758818416720, + "link": null, + "locked": false, + "text": "dataset.db", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3zbCui3XtIGozHXTVAGRp", + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "87-MeaiZGT1wln0nggYPZ", + "type": "rectangle", + "x": 339.5, + "y": 309.5, + "width": 392, + "height": 156, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 655550318, + "version": 77, + "versionNonce": 1103939826, + "isDeleted": false, + "boundElements": null, + "updated": 1758818339000, + "link": null, + "locked": false + }, + { + "id": "EjUxEhZqEBzwvlw0VE9eJ", + "type": "rectangle", + "x": 355.5, + "y": 327, + "width": 162, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": { + "type": 3 + }, + "seed": 1739846638, + "version": 64, + "versionNonce": 1594290034, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "ogRkV0neHrhEKTE6zlggl" + } + ], + "updated": 1758818391415, + "link": null, + "locked": false + }, + { + "id": "ogRkV0neHrhEKTE6zlggl", + "type": "text", + "x": 378.7100524902344, + "y": 377.25, + "width": 115.57989501953125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 2037675630, + "version": 12, + "versionNonce": 1286472046, + "isDeleted": false, + "boundElements": null, + "updated": 1758818399222, + "link": null, + "locked": false, + "text": "RDF_String", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "EjUxEhZqEBzwvlw0VE9eJ", + "originalText": "RDF_String", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hoIRMNiMJZl4YDo-hovWy", + "type": "rectangle", + "x": 542.5, + "y": 327, + "width": 173, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 1189796530, + "version": 99, + "versionNonce": 1071057006, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "rsapATFAT5YSBCXzLupgZ" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "rsapATFAT5YSBCXzLupgZ", + "type": "text", + "x": 585.6800384521484, + "y": 377.25, + "width": 86.63992309570312, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + 
"backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 829619694, + "version": 12, + "versionNonce": 713902318, + "isDeleted": false, + "boundElements": null, + "updated": 1758818405150, + "link": null, + "locked": false, + "text": "Abstract", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "hoIRMNiMJZl4YDo-hovWy", + "originalText": "Abstract", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "jSx8ApfhtRs_nk37VvDMb", + "type": "rectangle", + "x": 316.5, + "y": 511, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 492582894, + "version": 132, + "versionNonce": 893797614, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "6E23g-rgowNqHsBxX-LuM" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "6E23g-rgowNqHsBxX-LuM", + "type": "text", + "x": 499.9100341796875, + "y": 571.25, + "width": 69.679931640625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 267696178, + "version": 132, + "versionNonce": 1668243186, + "isDeleted": false, + "boundElements": null, + "updated": 1758818543211, + "link": null, + "locked": false, + "text": "Pandas", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "jSx8ApfhtRs_nk37VvDMb", + "originalText": "Pandas", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "ohj18N4AOTDz5lJNcV9gi", + "type": "rectangle", + "x": 261, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1446207150, + "version": 279, + "versionNonce": 317375026, + "isDeleted": false, + "boundElements": [ + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text", + "x": 297.0800323486328, + "y": 796.5, + "width": 84.83993530273438, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a9", + "roundness": null, + "seed": 435116270, + "version": 199, + "versionNonce": 1282911218, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": 
null, + "locked": false, + "text": "train.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ohj18N4AOTDz5lJNcV9gi", + "originalText": "train.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "A4Y54Y26fe257U_QU9lxX", + "type": "rectangle", + "x": 464, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": { + "type": 3 + }, + "seed": 186148850, + "version": 232, + "versionNonce": 997119858, + "isDeleted": false, + "boundElements": [ + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text", + "x": 476.3500442504883, + "y": 796.5, + "width": 132.29991149902344, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": null, + "seed": 1131059634, + "version": 171, + "versionNonce": 239540530, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "validation.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "A4Y54Y26fe257U_QU9lxX", + "originalText": "validation.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "mPaYpJ9Xn7tlJPmKPqJKJ", + "type": "rectangle", + "x": 674.5, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": { + "type": 3 + }, + "seed": 1049323314, + "version": 235, + "versionNonce": 330560690, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "kg9nm2rpud6cax5aNPSnu" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "kg9nm2rpud6cax5aNPSnu", + "type": "text", + "x": 711.4300231933594, + "y": 796.5, + "width": 83.13995361328125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": null, + "seed": 522572142, + "version": 193, + "versionNonce": 1920372338, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "test.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "originalText": "test.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 195.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": null, + "seed": 873266098, + "version": 71, + "versionNonce": 541154738, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + -195.25, + 49.5 + ], + [ + -195.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "ohj18N4AOTDz5lJNcV9gi", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 218.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 1210817582, + "version": 77, + "versionNonce": 1483392370, + "isDeleted": false, + "boundElements": null, + "updated": 1758818580594, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + 218.25, + 49.5 + ], + [ + 218.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 0.5719232650604908, + "height": 99.07394122590165, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aK", + "roundness": null, + "seed": 1205316658, + "version": 96, + "versionNonce": 1748050674, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -0.5719232650604908, + 99.07394122590165 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "A4Y54Y26fe257U_QU9lxX", + "fixedPoint": [ + 0.44635717665566554, + -0.056621365219521276 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow", + "x": 539, + "y": 271.5, + "width": 0, + "height": 33.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": 
null, + "index": "aL", + "roundness": null, + "seed": 763990258, + "version": 17, + "versionNonce": 1028811378, + "isDeleted": false, + "boundElements": null, + "updated": 1758818588814, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 33.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "3zbCui3XtIGozHXTVAGRp", + "focus": -0.019473081328751418, + "gap": 3 + }, + "endBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": -1.0404624277456647, + "gap": 30.7545797799829 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow", + "x": 536.5, + "y": 468.5, + "width": 0, + "height": 39, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1489771054, + "version": 33, + "versionNonce": 1828178606, + "isDeleted": false, + "boundElements": null, + "updated": 1758818593647, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 39 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": 1.0693641618497107, + "gap": 27.157190169432425 + }, + "endBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "focus": 0.008018327605956525, + "gap": 3.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From 650b37c586fe07d9bb83d4471a727c12cd717dfb Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 26 Sep 2025 11:24:34 +0200 Subject: [PATCH 06/75] Added vscode setting to execute jupyter notebook from root dir --- .vscode/settings.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..17ae78b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "jupyter.notebookFileRoot": "${workspaceFolder}" +} \ No newline at end of file From 9972ab8a511e785bbd14148023808dc6a329e09f Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:48:23 +0200 Subject: [PATCH 07/75] Added imports --- Project_Model/Libs/BPE/Classes/__init__.py | 5 +++++ Project_Model/Libs/BPE/Enums/__init__.py | 0 Project_Model/Libs/BPE/Errors/__init__.py | 5 +++++ Project_Model/Libs/BPE/__init__.py | 3 +++ Project_Model/Libs/__init__.py | 1 + 5 files changed, 14 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/__init__.py create mode 100644 Project_Model/Libs/BPE/Enums/__init__.py create mode 100644 Project_Model/Libs/BPE/Errors/__init__.py create mode 100644 Project_Model/Libs/BPE/__init__.py create mode 100644 Project_Model/Libs/__init__.py diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py new file mode 100644 index 0000000..e8e65e5 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -0,0 +1,5 @@ +from .NanoSocratesChunker import NanoSocratesChunker + +__all__ = [ + "NanoSocratesChunker" +] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Enums/__init__.py b/Project_Model/Libs/BPE/Enums/__init__.py new file mode 100644 index
0000000..e69de29 diff --git a/Project_Model/Libs/BPE/Errors/__init__.py b/Project_Model/Libs/BPE/Errors/__init__.py new file mode 100644 index 0000000..0aab0ad --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/__init__.py @@ -0,0 +1,5 @@ +from .DelimiterNotFoundException import DelimiterNotFoundException + +__all__ = [ + "DelimiterNotFoundException" +] diff --git a/Project_Model/Libs/BPE/__init__.py b/Project_Model/Libs/BPE/__init__.py new file mode 100644 index 0000000..2292a87 --- /dev/null +++ b/Project_Model/Libs/BPE/__init__.py @@ -0,0 +1,3 @@ +from .Classes import * +from .Enums import * +from .Errors import * diff --git a/Project_Model/Libs/__init__.py b/Project_Model/Libs/__init__.py new file mode 100644 index 0000000..39fcdff --- /dev/null +++ b/Project_Model/Libs/__init__.py @@ -0,0 +1 @@ +from . import BPE \ No newline at end of file From 3f48b5c4286be537d2d6a5fa80aabee6849faba6 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:48:44 +0200 Subject: [PATCH 08/75] Added text files to test a chunker --- Project_Model/Tests/chunker_files/edge-1.txt | 4 ++++ Project_Model/Tests/chunker_files/simple.txt | 2 ++ Project_Model/Tests/chunker_files/stress.txt | 3 +++ 3 files changed, 9 insertions(+) create mode 100644 Project_Model/Tests/chunker_files/edge-1.txt create mode 100644 Project_Model/Tests/chunker_files/simple.txt create mode 100644 Project_Model/Tests/chunker_files/stress.txt diff --git a/Project_Model/Tests/chunker_files/edge-1.txt b/Project_Model/Tests/chunker_files/edge-1.txt new file mode 100644 index 0000000..d93fc54 --- /dev/null +++ b/Project_Model/Tests/chunker_files/edge-1.txt @@ -0,0 +1,4 @@ +Lorem ipsum dolor sit amet, +consectetur adipiscing elit. +Aenean at dui hendrerit ante sollicitud +in scelerisque \ No newline at end of file diff --git a/Project_Model/Tests/chunker_files/simple.txt b/Project_Model/Tests/chunker_files/simple.txt new file mode 100644 index 0000000..fbc222a --- /dev/null +++ b/Project_Model/Tests/chunker_files/simple.txt @@ -0,0 +1,2 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. +Aenean at dui hendrerit ante sollicitudin scelerisque \ No newline at end of file diff --git a/Project_Model/Tests/chunker_files/stress.txt b/Project_Model/Tests/chunker_files/stress.txt new file mode 100644 index 0000000..b3cf4c7 --- /dev/null +++ b/Project_Model/Tests/chunker_files/stress.txt @@ -0,0 +1,3 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
+Aenean at dui hendrerit an te sollicitudin scelerisque +dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf \ No newline at end of file From 5801a819e9e058e360e0ee983c688e87e0fa778e Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:49:06 +0200 Subject: [PATCH 09/75] Added vars to make it easier to work here --- .vscode/settings.json | 39 +++++++++++++++++++++++++++++++++++++++ README.md | 21 ++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1d34b01 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + // For linux + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For OSX + "terminal.integrated.env.osx": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For Windows + "terminal.integrated.env.windows": { + "PYTHONPATH": "${workspaceFolder}" + } +} + +// { +// // Always treat the project root as the working dir for Jupyter +// "jupyter.notebookFileRoot": "${workspaceFolder}", +// +// // When you click "Run Python File in Terminal", DON'T cd into the file's folder +// "python.terminal.executeInFileDir": false, +// +// // Start new integrated terminals at the project root +// "terminal.integrated.cwd": "${workspaceFolder}", +// +// // Ensure Python can import from the project root no matter which file you run +// // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed. +// "terminal.integrated.env.windows": { +// "PYTHONPATH": "${workspaceFolder}" +// }, +// +// // Make pytest run from the root without needing a pytest.ini +// "python.testing.pytestEnabled": true, +// "python.testing.cwd": "${workspaceFolder}", +// "python.testing.pytestArgs": ["src/test"], +// +// // Help Pylance resolve imports like `from src...` without red squiggles +// "python.analysis.extraPaths": ["${workspaceFolder}"] +// } \ No newline at end of file diff --git a/README.md b/README.md index 1789589..1aec207 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,30 @@ Create and activate your Conda environment with: conda env create -f environment.yaml conda activate deep_learning - + Now install dependencies with pip: pip install -r requirements.txt +Add the following to .vscode/settings.json: + + ```json + { + // For linux + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For OSX + "terminal.integrated.env.osx": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For Windows + "terminal.integrated.env.windows": { + "PYTHONPATH": "${workspaceFolder}" + } + } + ``` + ## Troubleshooting Sometimes when uploading a really large batch of data, git can stop the uploads due to the timeout.
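Patch 10 below reworks the BPE class diagram into a `NanoSocratesBPE(Encoder)` / `Vocabulary` / `NanoSocratesBPE_BatchMemory` split. As a reading aid, here is a minimal, hypothetical sketch of the interface those diagrams describe. Only the class and method names come from the UML; the merge loop is the textbook greedy byte-pair algorithm, the 512-token budget is an arbitrary placeholder, and the streaming arguments drawn in the diagram (`memory`, `last_sentence_chunk`, `last_batch`) are omitted, so this is an illustration rather than the project's implementation.

```python
# Illustrative sketch only: names follow the UML diagrams, the logic is
# plain byte-level BPE. Requires Python 3.9+ for the builtin generic hints.
from collections import Counter


class Vocabulary:
    """Bidirectional map between a merged byte pair and its token id."""

    def __init__(self) -> None:
        self.vocabulary: dict[tuple[int, int], int] = {}
        self.reverse_vocabulary: dict[int, tuple[int, int]] = {}
        self._next_id = 256  # ids 0-255 stay reserved for raw bytes

    def add_word(self, pair: tuple[int, int]) -> int:
        token = self.vocabulary.setdefault(pair, self._next_id)
        if token == self._next_id:  # the pair was new
            self.reverse_vocabulary[token] = pair
            self._next_id += 1
        return token


class NanoSocratesBPE:
    def __init__(self, max_vocabulary_size: int = 512) -> None:
        self.vocabulary = Vocabulary()
        self.max_vocabulary_size = max_vocabulary_size

    def fit(self, data: list[list[int]]) -> "NanoSocratesBPE":
        while self.get_vocabulary_size() < self.max_vocabulary_size:
            # Count every adjacent token pair across the corpus.
            pairs = Counter(
                (seq[i], seq[i + 1]) for seq in data for i in range(len(seq) - 1)
            )
            if not pairs:
                break
            best, count = pairs.most_common(1)[0]
            if count < 2:  # nothing repeats; plays the role of a merge threshold
                break
            token = self.vocabulary.add_word(best)
            data = [self._merge(seq, best, token) for seq in data]
        return self

    @staticmethod
    def _merge(seq: list[int], pair: tuple[int, int], token: int) -> list[int]:
        out: list[int] = []
        i = 0
        while i < len(seq):
            if i + 1 < len(seq) and (seq[i], seq[i + 1]) == pair:
                out.append(token)  # replace the pair with its merged token
                i += 2
            else:
                out.append(seq[i])
                i += 1
        return out

    def encode(self, word: bytes) -> list[int]:
        seq = list(word)
        # Replay the learned merges in creation order (dicts keep insertion order).
        for pair, token in self.vocabulary.vocabulary.items():
            seq = self._merge(seq, pair, token)
        return seq

    def decode(self, tokens: list[int]) -> bytes:
        out = bytearray()
        for t in tokens:
            if t < 256:
                out.append(t)
            else:  # recursively expand a merged token into its two halves
                left, right = self.vocabulary.reverse_vocabulary[t]
                out += self.decode([left, right])
        return bytes(out)

    def get_vocabulary_size(self) -> int:
        return 256 + len(self.vocabulary.vocabulary)
```

Under these assumptions the round trip holds by construction: `bpe = NanoSocratesBPE(); bpe.fit([list(b"low lower lowest")])` learns merges for the repeated `low` prefix, and `bpe.decode(bpe.encode(b"lower"))` returns `b"lower"`, because every merged token expands back into exactly the pair it was created from.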
From be8a87ce0165dba6ab79793967b7767eaa21629d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:49:29 +0200 Subject: [PATCH 10/75] Modified the architecture for BPE --- Projec-Model/UML/bpe.excalidraw.json | 362 -------------- Project_Model/UML/bpe.excalidraw.json | 658 ++++++++++++++++++++++++++ 2 files changed, 658 insertions(+), 362 deletions(-) delete mode 100644 Projec-Model/UML/bpe.excalidraw.json create mode 100644 Project_Model/UML/bpe.excalidraw.json diff --git a/Projec-Model/UML/bpe.excalidraw.json b/Projec-Model/UML/bpe.excalidraw.json deleted file mode 100644 index 1400c25..0000000 --- a/Projec-Model/UML/bpe.excalidraw.json +++ /dev/null @@ -1,362 +0,0 @@ -{ - "type": "excalidraw", - "version": 2, - "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", - "elements": [ - { - "id": "EcT-dGsjmfW571ov8Gg4F", - "type": "text", - "x": 425.5, - "y": 130, - "width": 506, - "height": 550, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "4rCC2-N1thmII8_dwNhe1" - ], - "frameId": null, - "index": "a3V", - "roundness": null, - "seed": 523521109, - "version": 758, - "versionNonce": 383976373, - "isDeleted": false, - "boundElements": [ - { - "id": "OA_NKjb3n3NLtUo_tKmPS", - "type": "arrow" - } - ], - "updated": 1758823931674, - "link": null, - "locked": false, - "text": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", - "fontSize": 20, - "fontFamily": 8, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "74i4oK-JpcM4CgAqhz_x_", - "type": "rectangle", - "x": 382.5, - "y": 104, - "width": 592.5, - "height": 555, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "4rCC2-N1thmII8_dwNhe1" - ], - "frameId": null, - "index": "a4", - "roundness": { - "type": 3 - }, - "seed": 50827893, - "version": 212, - "versionNonce": 692313525, - "isDeleted": false, - "boundElements": null, - "updated": 1758822941942, - "link": null, - "locked": false - }, - { - "id": "s8I1JoKulE3Vnti9a374p", - "type": "text", - "x": 1113, - "y": 128, - "width": 440, - "height": 250, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "M6w9efVFwOZHkJGgwkyEw" - ], - "frameId": null, - "index": "a5", - "roundness": null, - "seed": 2091174261, - "version": 442, 
- "versionNonce": 1108352309, - "isDeleted": false, - "boundElements": null, - "updated": 1758822765308, - "link": null, - "locked": false, - "text": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", - "fontSize": 20, - "fontFamily": 8, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "BY_Why7XDNftdMzPcwjVZ", - "type": "rectangle", - "x": 1086.5, - "y": 104, - "width": 504.5, - "height": 260.5, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "M6w9efVFwOZHkJGgwkyEw" - ], - "frameId": null, - "index": "a6", - "roundness": { - "type": 3 - }, - "seed": 153939611, - "version": 153, - "versionNonce": 1903356469, - "isDeleted": false, - "boundElements": [ - { - "id": "WcDks9DR8UqeZEaxAcRf9", - "type": "arrow" - } - ], - "updated": 1758822805382, - "link": null, - "locked": false - }, - { - "id": "JCPDhuTKRx4MN950Q3jL-", - "type": "text", - "x": 1116.411067193676, - "y": 535.1519268774704, - "width": 427.72826086956525, - "height": 99.70355731225297, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "DbtlKVF_9SjH2-9iMq9zy" - ], - "frameId": null, - "index": "a7", - "roundness": null, - "seed": 1326854235, - "version": 345, - "versionNonce": 592556603, - "isDeleted": false, - "boundElements": null, - "updated": 1758822845014, - "link": null, - "locked": false, - "text": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", - "fontSize": 19.940711462450594, - "fontFamily": 8, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "l-O0rMS3SruV22_MPX9Jz", - "type": "rectangle", - "x": 1086.5, - "y": 509.22900197628456, - "width": 504.49999999999994, - "height": 154.04199604743084, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "DbtlKVF_9SjH2-9iMq9zy" - ], - "frameId": null, - "index": "a8", - "roundness": { - "type": 3 - }, - "seed": 1490898171, - "version": 186, - "versionNonce": 1953870555, - "isDeleted": false, - "boundElements": [ - { - "id": "OA_NKjb3n3NLtUo_tKmPS", - "type": "arrow" - } - ], - "updated": 1758822845014, - "link": null, - "locked": false - }, - { - "id": "WcDks9DR8UqeZEaxAcRf9", - "type": "arrow", - "x": 773.5, - "y": 167, - "width": 298.5, - "height": 30, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aB", - "roundness": { - "type": 2 - }, - "seed": 1681364149, - "version": 205, - "versionNonce": 1154753851, - "isDeleted": false, - "boundElements": [], - 
"updated": 1758823291274, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 144.5, - -1.5 - ], - [ - 177.5, - -30 - ], - [ - 298.5, - -29.5 - ] - ], - "lastCommittedPoint": null, - "startBinding": null, - "endBinding": { - "elementId": "BY_Why7XDNftdMzPcwjVZ", - "focus": 0.7285094931977862, - "gap": 14.5 - }, - "startArrowhead": null, - "endArrowhead": "triangle", - "elbowed": false - }, - { - "id": "OA_NKjb3n3NLtUo_tKmPS", - "type": "arrow", - "x": 941, - "y": 440.7646462573778, - "width": 132.9833600541258, - "height": 105.33206183359624, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aD", - "roundness": { - "type": 2 - }, - "seed": 1871768059, - "version": 402, - "versionNonce": 462603541, - "isDeleted": false, - "boundElements": [], - "updated": 1758823931675, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 53, - 8.23535374262218 - ], - [ - 63, - 97.73535374262218 - ], - [ - 132.9833600541258, - 105.33206183359624 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "EcT-dGsjmfW571ov8Gg4F", - "focus": -0.01598303536344995, - "gap": 9.500000000000114 - }, - "endBinding": { - "elementId": "l-O0rMS3SruV22_MPX9Jz", - "focus": 0.10931526948750278, - "gap": 13.22003639101672 - }, - "startArrowhead": null, - "endArrowhead": "triangle", - "elbowed": false - } - ], - "appState": { - "gridSize": 20, - "gridStep": 5, - "gridModeEnabled": false, - "viewBackgroundColor": "#ffffff" - }, - "files": {} -} \ No newline at end of file diff --git a/Project_Model/UML/bpe.excalidraw.json b/Project_Model/UML/bpe.excalidraw.json new file mode 100644 index 0000000..d706222 --- /dev/null +++ b/Project_Model/UML/bpe.excalidraw.json @@ -0,0 +1,658 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "EcT-dGsjmfW571ov8Gg4F", + "type": "text", + "x": 425.5, + "y": 132, + "width": 506, + "height": 425, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 523521109, + "version": 883, + "versionNonce": 1590682729, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758881654155, + "link": null, + "locked": false, + "text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n", + "autoResize": true, + "lineHeight": 1.25 
+ }, + { + "id": "74i4oK-JpcM4CgAqhz_x_", + "type": "rectangle", + "x": 382.5, + "y": 104.5, + "width": 592.5, + "height": 421, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 50827893, + "version": 319, + "versionNonce": 704459557, + "isDeleted": false, + "boundElements": [], + "updated": 1758878226277, + "link": null, + "locked": false + }, + { + "id": "s8I1JoKulE3Vnti9a374p", + "type": "text", + "x": 1113.5, + "y": 127, + "width": 517, + "height": 325, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 2091174261, + "version": 480, + "versionNonce": 1964948039, + "isDeleted": false, + "boundElements": [], + "updated": 1758881941367, + "link": null, + "locked": false, + "text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "BY_Why7XDNftdMzPcwjVZ", + "type": "rectangle", + "x": 1086.5, + "y": 105.5, + "width": 593.0000000000001, + "height": 325.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 153939611, + "version": 234, + "versionNonce": 2068149129, + "isDeleted": false, + "boundElements": [ + { + "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow" + } + ], + "updated": 1758881945661, + "link": null, + "locked": false + }, + { + "id": "JCPDhuTKRx4MN950Q3jL-", + "type": "text", + "x": 1116.411067193676, + "y": 477.3809288774704, + "width": 416.74578857421875, + "height": 99.70355731225297, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 1326854235, + "version": 479, + "versionNonce": 595084597, + "isDeleted": false, + "boundElements": [], + "updated": 1758902358518, + "link": null, + "locked": false, + "text": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int", + "fontSize": 19.940711462450594, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "l-O0rMS3SruV22_MPX9Jz", + "type": "rectangle", + "x": 1086.5, + "y": 
451.4580039762846, + "width": 593, + "height": 208.0419960474308, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1490898171, + "version": 305, + "versionNonce": 587306139, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758902358518, + "link": null, + "locked": false + }, + { + "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow", + "x": 773.5, + "y": 167, + "width": 297.17936724485867, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": { + "type": 2 + }, + "seed": 1681364149, + "version": 303, + "versionNonce": 1262492265, + "isDeleted": false, + "boundElements": [], + "updated": 1758881945661, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 144.5, + -1.5 + ], + [ + 177.5, + -30 + ], + [ + 297.17936724485867, + -29.020420978562214 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": { + "elementId": "BY_Why7XDNftdMzPcwjVZ", + "focus": 0.77319587628866, + "gap": 18.25 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow", + "x": 946.0000000000002, + "y": 274.95951048200493, + "width": 130.016707976343, + "height": 209.36808480159067, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": { + "type": 2 + }, + "seed": 1871768059, + "version": 1039, + "versionNonce": 213535035, + "isDeleted": false, + "boundElements": [], + "updated": 1758902358519, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 54.99999999999977, + 12.54048951799507 + ], + [ + 69.49999999999977, + 188.54048951799507 + ], + [ + 130.016707976343, + 209.36808480159067 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "EcT-dGsjmfW571ov8Gg4F", + "focus": -0.48312180762055096, + "gap": 14.500000000000114 + }, + "endBinding": { + "elementId": "l-O0rMS3SruV22_MPX9Jz", + "focus": -0.16742658425737647, + "gap": 11.194126334166185 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "snZ__VDsIlri6NTp8M2Gf", + "type": "text", + "x": -245.25, + "y": 103, + "width": 330, + "height": 125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 1758461093, + "version": 265, + "versionNonce": 1069481861, + "isDeleted": false, + "boundElements": [], + "updated": 1758879566916, + "link": null, + "locked": false, + "text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class 
NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "PnbmqwEWYkP8oXElKFyTp", + "type": "text", + "x": -237.75, + "y": 544, + "width": 561, + "height": 125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 501304683, + "version": 241, + "versionNonce": 1306401003, + "isDeleted": false, + "boundElements": [], + "updated": 1758878748210, + "link": null, + "locked": false, + "text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "xR_11IzgXX5O-m6WoRfCL", + "type": "text", + "x": -233.25, + "y": 366.5, + "width": 165, + "height": 75, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aI", + "roundness": null, + "seed": 2025585125, + "version": 395, + "versionNonce": 1799178985, + "isDeleted": false, + "boundElements": [], + "updated": 1758883940168, + "link": null, + "locked": false, + "text": "enum TokenType:\n + SPECIAL\n + BPE", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "enum TokenType:\n + SPECIAL\n + BPE", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "lgKSd9qCb94-5e8rd9I3r", + "type": "text", + "x": -219.75, + "y": 764.5, + "width": 462, + "height": 275, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aJ", + "roundness": null, + "seed": 1963214021, + "version": 422, + "versionNonce": 903841927, + "isDeleted": false, + "boundElements": [], + "updated": 1758879973600, + "link": null, + "locked": false, + "text": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "DwFJoUpVT2YAEe9qPYAXa", + "type": "text", + "x": 496.75, + "y": 666, + "width": 440, + "height": 100, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": null, + "seed": 1317596203, + 
"version": 152, + "versionNonce": 1840679687, + "isDeleted": false, + "boundElements": [], + "updated": 1758880107704, + "link": null, + "locked": false, + "text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict\n + reverse_vocabulary: dict", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict\n + reverse_vocabulary: dict", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "78gC46xatoO1_cRtaN8EC", + "type": "text", + "x": 396.375, + "y": -107.75, + "width": 346.3997802734375, + "height": 100, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1187595241, + "version": 128, + "versionNonce": 1487192455, + "isDeleted": false, + "boundElements": [], + "updated": 1758879825591, + "link": null, + "locked": false, + "text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "3j50Ds74uU7oXoJ9kMOYJ", + "type": "text", + "x": 457.375, + "y": 903.75, + "width": 949.7594604492188, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aN", + "roundness": null, + "seed": 1994335529, + "version": 198, + "versionNonce": 1492696519, + "isDeleted": false, + "boundElements": [], + "updated": 1758882694747, + "link": null, + "locked": false, + "text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "yg-TvQvz4MwJZ0y8K7Ix0", + "type": "text", + "x": 435.375, + "y": 1026.25, + "width": 352, + "height": 250, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aP", + "roundness": null, + "seed": 1877486407, + "version": 344, + "versionNonce": 25830153, + "isDeleted": false, + "boundElements": [], + "updated": 1758883468886, + "link": null, + "locked": false, + "text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str", + "autoResize": true, + "lineHeight": 1.25 + } + ], + "appState": { + 
"gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From 9552d61f8d3a54bdf97c640d33075e595a43011b Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:49:56 +0200 Subject: [PATCH 11/75] Added Excetption for when we don't find a delimiter --- Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py diff --git a/Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py b/Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py new file mode 100644 index 0000000..2823d5d --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py @@ -0,0 +1,4 @@ +class DelimiterNotFoundException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file From 8db35732f9e9c3a10fb0301cc8991bdca5e18399 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:50:23 +0200 Subject: [PATCH 12/75] Added Chunker to restrict our domains --- .../Libs/BPE/Classes/NanoSocratesChunker.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py new file mode 100644 index 0000000..6821151 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py @@ -0,0 +1,66 @@ +from pathlib import Path +import re +from ..Errors import DelimiterNotFoundException + + +class NanoSocratesChunker: + + def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None: + self.__max_size: int = max_size + self.__special_token_regex: re.Pattern = special_token_regex + self.__residual: str = "" + + def chunk(self, file_path: Path): + # read_file + FILE = open(file_path, "r", encoding="utf-8") + exit = False + + while not exit: + REMAINING_SIZE = self.__max_size - len(self.__residual) + READ_SIZE = min(self.__max_size, REMAINING_SIZE) + FILE_CHUNK = FILE.read(READ_SIZE) + + if len(FILE_CHUNK) == 0: + exit = True + continue + + CHUNK = self.__append_residuals(FILE_CHUNK) + + boundaries = self.__identify_boudaries(CHUNK) + + if boundaries is None: + + # boundaries not found in 2 chunks, + if len(CHUNK) > self.__max_size - 1: + raise DelimiterNotFoundException() + + if exit: + yield CHUNK + + self.__set_residual(0, CHUNK) + continue + + start, end = boundaries + self.__set_residual(end, CHUNK) + yield CHUNK[start:end] + + def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None: + + end = 0 + + for match in self.__special_token_regex.finditer(corpus): + # print(match) + end = match.end() + + if end == 0: + return None + + return (0, end) + + def __append_residuals(self, corpus: str) -> str: + RESIDUAL = self.__residual + self.__residual = "" + return RESIDUAL + corpus + + def __set_residual(self, index: int, corpus: str): + self.__residual = corpus[index:] From 3e8b5c55796963ba9e8db3dde9dbbf241c1819b2 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:50:32 +0200 Subject: [PATCH 13/75] Added test for chunker --- Project_Model/Tests/chunker_test.py | 89 +++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 
Project_Model/Tests/chunker_test.py diff --git a/Project_Model/Tests/chunker_test.py b/Project_Model/Tests/chunker_test.py new file mode 100644 index 0000000..7bac3bc --- /dev/null +++ b/Project_Model/Tests/chunker_test.py @@ -0,0 +1,89 @@ +from pathlib import Path +import re +import pytest +import Project_Model.Libs.BPE as BPE + +PATTERN = "<(TOKEN|SOT|SEP|EOT)>" +SYMBOL_REGEX = re.compile(PATTERN) + +class TestChunker: + + def test_correct_simple(self): + + FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt") + LEAST_EXPECTED_CHUNKS = 3 + ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8") + + CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX) + + CHUNKS = [] + + for chunk in CHUNKER.chunk(FILE_PATH): + print(chunk) + CHUNKS.append( + chunk + ) + + NANO_TEXT = "".join(CHUNKS) + + assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1) + assert NANO_TEXT == ORIG_TEXT + + + + def test_correct_edge_1(self): + + FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt") + LEAST_EXPECTED_CHUNKS = 3 + ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8") + + CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX) + + CHUNKS = [] + + for chunk in CHUNKER.chunk(FILE_PATH): + print(chunk) + CHUNKS.append( + chunk + ) + + NANO_TEXT = "".join(CHUNKS) + + assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1) + assert NANO_TEXT == ORIG_TEXT + + + + def test_throwing(self): + + FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt") + + CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX) + + with pytest.raises(BPE.DelimiterNotFoundException): + for chunk in CHUNKER.chunk(FILE_PATH): + print(chunk) + +if __name__ == "__main__": + + FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt") + LEAST_EXPECTED_CHUNKS = 3 + ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8") + + CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX) + + CHUNKS = [] + + try: + for chunk in CHUNKER.chunk(FILE_PATH): + print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n") + CHUNKS.append( + chunk + ) + except: + exit(0) + + NANO_TEXT = "".join(CHUNKS) + + assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1) + assert NANO_TEXT == ORIG_TEXT From ed0255e99babd60ee3277190f88dc99258f5e34f Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:01:35 +0200 Subject: [PATCH 14/75] Updated imports --- Project_Model/Libs/BPE/Classes/__init__.py | 6 +++++- Project_Model/Libs/BPE/Enums/__init__.py | 1 + Project_Model/Libs/BPE/Errors/__init__.py | 4 +++- Project_Model/Libs/BPE/__init__.py | 4 ++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index e8e65e5..d8a7364 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -1,5 +1,9 @@ from .NanoSocratesChunker import NanoSocratesChunker +from .NanoSocratesSplitter import NanoSocratesSplitter +from .NanoSocratesBPE import NanoSocratesBPE __all__ = [ - "NanoSocratesChunker" + "NanoSocratesChunker", + "NanoSocratesSplitter", + "NanoSocratesBPE" ] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Enums/__init__.py b/Project_Model/Libs/BPE/Enums/__init__.py index e69de29..8ef388a 100644 --- a/Project_Model/Libs/BPE/Enums/__init__.py +++ b/Project_Model/Libs/BPE/Enums/__init__.py @@ -0,0 +1 @@ +from .TokenType import TokenType diff --git a/Project_Model/Libs/BPE/Errors/__init__.py b/Project_Model/Libs/BPE/Errors/__init__.py index 0aab0ad..587873f 100644 --- 
a/Project_Model/Libs/BPE/Errors/__init__.py
+++ b/Project_Model/Libs/BPE/Errors/__init__.py
@@ -1,5 +1,7 @@
 from .DelimiterNotFoundException import DelimiterNotFoundException
+from .OutOfDictionaryException import OutOfDictionaryException
 
 __all__ = [
-    "DelimiterNotFoundException"
+    "DelimiterNotFoundException",
+    "OutOfDictionaryException"
 ]

diff --git a/Project_Model/Libs/BPE/__init__.py b/Project_Model/Libs/BPE/__init__.py
index 2292a87..6f7d1f2 100644
--- a/Project_Model/Libs/BPE/__init__.py
+++ b/Project_Model/Libs/BPE/__init__.py
@@ -1,3 +1,7 @@
 from .Classes import *
 from .Enums import *
 from .Errors import *
+
+from . import Classes
+from . import Enums
+from . import Errors

From b071145f6eff631bdc651182d7b93cf10f88d784 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sun, 28 Sep 2025 18:02:06 +0200
Subject: [PATCH 15/75] Added size-bound comment to Chunker

---
 Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
index 6821151..a81587c 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
@@ -10,6 +10,10 @@ class NanoSocratesChunker:
         self.__special_token_regex: re.Pattern = special_token_regex
         self.__residual: str = ""
 
+    # max theoretical size of chars
+    # between special tokens:
+    # - min: size - len(longest_token)
+    # - MAX: size - len(shortest_token)
     def chunk(self, file_path: Path):
         # read_file
         FILE = open(file_path, "r", encoding="utf-8")

From d179e0197109479bb06fea16788e1210aac9b27a Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sun, 28 Sep 2025 18:03:16 +0200
Subject: [PATCH 16/75] Added Splitter to divide tokens from text

---
 .../Libs/BPE/Classes/NanoSocratesSplitter.py  |  40 ++++++
 Project_Model/Tests/splitter_test.py          | 131 ++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
 create mode 100644 Project_Model/Tests/splitter_test.py

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
new file mode 100644
index 0000000..ccca300
--- /dev/null
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -0,0 +1,40 @@
+import re
+from typing import Generator
+from ..Enums import TokenType
+
+
+class NanoSocratesSplitter:
+
+    def __init__(
+        self,
+        special_token_regex: re.Pattern
+    ) -> None:
+        self.__special_token_regex = special_token_regex
+
+    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
+
+        bpe_start = 0
+        bpe_end = len(corpus)
+
+        for bound_start, bound_end in self.__find_boundaries(corpus):
+
+            bpe_end = bound_start
+            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
+
+            if BPE_TOKEN_TEXT != "":
+                yield (BPE_TOKEN_TEXT, TokenType.BPE)
+
+            bpe_start = bound_end
+            SPECIAL_TOKEN_TEXT = corpus[bound_start:bound_end]
+
+            if SPECIAL_TOKEN_TEXT != "":
+                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
+
+    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+
+        for match in self.__special_token_regex.finditer(corpus):
+            start = match.start()
+            end = match.end()
+
+            yield (start, end)
+
diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py
new file mode 100644
index 0000000..eda95b6
--- /dev/null
+++ b/Project_Model/Tests/splitter_test.py
@@ -0,0 +1,131 @@
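
A usage sketch of the splitter just added, illustrative only and not part of the patch series; it assumes the TokenType enum that a later patch in this series introduces.

import re
from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter

SYMBOL_REGEX = re.compile("<(TOKEN|SOT|SEP|EOT)>")
splitter = NanoSocratesSplitter(SYMBOL_REGEX)

for piece, kind in splitter.split_text("<SOT>Lorem ipsum<SEP>dolor<EOT>"):
    print(kind.name, repr(piece))

# expected interleaving:
#   SPECIAL '<SOT>', BPE 'Lorem ipsum', SPECIAL '<SEP>', BPE 'dolor', SPECIAL '<EOT>'
# note that trailing text after the last special token is not emitted
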
+from Project_Model.Libs.BPE.Enums import TokenType +import Project_Model.Libs.BPE as BPE + +import re + + +PATTERN = "<(TOKEN|SOT|SEP|EOT)>" +SYMBOL_REGEX = re.compile(PATTERN) + + +class TestSplitter: + + def test_split(self): + + TEXT = "Lorem " + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("", TokenType.SPECIAL), + ("Lorem ", TokenType.BPE), + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_trailing_text(self): + + TEXT = "ipsum dolor" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("ipsu", TokenType.BPE), + ("", TokenType.SPECIAL), + ("m d", TokenType.BPE), + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_multi_token(self): + + TEXT = "ipsum ddsgolor" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("ipsu", TokenType.BPE), + ("", TokenType.SPECIAL), + ("m d", TokenType.BPE), + ("", TokenType.SPECIAL), + ("", TokenType.SPECIAL), + ("", TokenType.SPECIAL), + ("dsg", TokenType.BPE), + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_malformed_1(self): + + TEXT = "lerisque" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_malformed_2(self): + + TEXT = "lerisque" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert 
RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE From b46df4f91aca930f83a93934c6bf2e6e8a684d2a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:03:47 +0200 Subject: [PATCH 17/75] Added Special Encoder --- Project_Model/Libs/BPE/Classes/Encoder.py | 4 ++ .../Libs/BPE/Classes/NanoSocratesSpecial.py | 54 +++++++++++++++++++ .../BPE/Errors/OutOfDictionaryException.py | 4 ++ 3 files changed, 62 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/Encoder.py create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py create mode 100644 Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py diff --git a/Project_Model/Libs/BPE/Classes/Encoder.py b/Project_Model/Libs/BPE/Classes/Encoder.py new file mode 100644 index 0000000..800772b --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/Encoder.py @@ -0,0 +1,4 @@ +from abc import ABC + +class Encoder(ABC): + pass \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py new file mode 100644 index 0000000..e551d6c --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py @@ -0,0 +1,54 @@ +from .Encoder import Encoder +from ..Errors import OutOfDictionaryException + +class NanoSocratesSpecial(Encoder): + + def __init__( + self, + initial_vocabulary: list[str] | None = None + ) -> None: + super().__init__() + + self.__vocabulary: dict[str, int] = {} + self.__reverse_vocabulary: dict[int, str] = {} + self.__current_index = 0 + + if initial_vocabulary is None: + return + + for word in initial_vocabulary: + + CURRENT_INDEX = self.__current_index + self.__vocabulary[word] = CURRENT_INDEX + self.__reverse_vocabulary[CURRENT_INDEX] = word + + self.__current_index += 1 + + @property + def vocabulary_size(self): + return self.__current_index + + def add_special_word(self, word:str): + CURRENT_INDEX = self.__current_index + self.__vocabulary[word] = CURRENT_INDEX + self.__reverse_vocabulary[CURRENT_INDEX] = word + self.__current_index += 1 + + def encode(self, word: str) -> list[int]: + ID = self.__vocabulary.get(word) + + if ID is None: + raise OutOfDictionaryException() + + return [ID] + + def decode(self, token_id: int) -> str: + + ID = token_id + WORD = self.__reverse_vocabulary.get(ID) + + if WORD is None: + raise OutOfDictionaryException() + + return WORD + diff --git a/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py b/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py new file mode 100644 index 0000000..2c4c440 --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py @@ -0,0 +1,4 @@ +class OutOfDictionaryException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file From e43394140577187ebd5818c975ba3ebfaff52b8d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:04:44 +0200 Subject: [PATCH 18/75] Added BPE TODO: - complete the fit method --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 106 ++++++++++++++++++ Project_Model/Libs/BPE/Enums/TokenType.py | 6 + Project_Model/Tests/bpe_test.py | 52 +++++++++ 3 files changed, 164 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py create mode 100644 Project_Model/Libs/BPE/Enums/TokenType.py create mode 100644 Project_Model/Tests/bpe_test.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py 
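
A usage sketch of the special-token encoder from the patch above, illustrative only and not part of the patch series; the token strings are hypothetical placeholders.

from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial

special = NanoSocratesSpecial(["<SOT>", "<EOT>"])  # ids 0 and 1, assigned in list order
special.add_special_word("<SEP>")                  # id 2

print(special.encode("<SEP>"))     # [2]
print(special.decode(0))           # '<SOT>'
print(special.vocabulary_size)     # 3

# encode() and decode() raise OutOfDictionaryException for unregistered words and ids
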
b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
new file mode 100644
index 0000000..844e860
--- /dev/null
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -0,0 +1,106 @@
+from .Encoder import Encoder
+from ..Errors import OutOfDictionaryException
+
+
+class NanoSocratesBatchMemoryBPE:
+
+    def __init__(self) -> None:
+        pass
+
+
+class NanoSocratesBPE(Encoder):
+
+    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
+        super().__init__()
+
+        self.__vocabulary: dict[tuple[int, int], int] = {}
+        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}
+
+        if vocabulary is None:
+            return
+
+        for key, value in vocabulary.items():
+            if value < 256:
+                raise OutOfDictionaryException()
+            self.__vocabulary[key] = value
+            self.__reverse_vocabulary[value] = key
+
+    # TODO: implement fit
+    def fit(self):
+        pass
+
+    def encode(self, piece: str) -> list[int]:
+
+        current_piece = list(map(ord, piece))
+        new_piece = self.__round_encode(current_piece)
+
+        while len(current_piece) != len(new_piece):
+            current_piece = new_piece
+            new_piece = self.__round_encode(current_piece)
+
+        return current_piece
+
+    def __round_encode(self, piece: list[int]):
+
+        if len(piece) == 1:
+            return piece
+
+        LAST_INDEX = len(piece) - 1
+        NEW_PIECE = []
+
+        index = 0
+        while index <= LAST_INDEX:
+
+            # flush a trailing element that has no right-hand partner
+            if index == LAST_INDEX:
+                NEW_PIECE.append(piece[index])
+                break
+
+            CANDIDATE_WORD = (piece[index], piece[index + 1])
+            CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)
+
+            if CANDIDATE_TOKEN is None:
+                NEW_PIECE.append(piece[index])
+                index += 1
+                continue
+
+            NEW_PIECE.append(CANDIDATE_TOKEN)
+            index += 2
+
+        return NEW_PIECE
+
+    # iterative decode: a stack avoids recursion on deeply nested merges
+    def decode(self, token_id: int) -> str:
+
+        token_stack: list[int] = [token_id]
+        DECODED_STRING_ARR: list[str] = []
+
+        while len(token_stack) > 0:
+            TOKEN_ID = token_stack.pop()
+
+            if TOKEN_ID < 256:
+                DECODED_CHAR = chr(TOKEN_ID)
+                DECODED_STRING_ARR.append(
+                    DECODED_CHAR
+                )
+                continue
+
+            left_token, right_token = self.__token_decode(TOKEN_ID)
+
+            token_stack.append(
+                right_token
+            )
+            token_stack.append(
+                left_token
+            )
+
+        return "".join(DECODED_STRING_ARR)
+
+    def __token_decode(self, token_id: int) -> tuple[int, int]:
+
+        CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)
+
+        if CANDIDATE_DECODED is None:
+            raise OutOfDictionaryException()
+
+        return CANDIDATE_DECODED

diff --git a/Project_Model/Libs/BPE/Enums/TokenType.py b/Project_Model/Libs/BPE/Enums/TokenType.py
new file mode 100644
index 0000000..7a27c34
--- /dev/null
+++ b/Project_Model/Libs/BPE/Enums/TokenType.py
@@ -0,0 +1,6 @@
+from enum import Enum, auto
+
+class TokenType(Enum):
+
+    SPECIAL = auto()
+    BPE = auto()
\ No newline at end of file

diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py
new file mode 100644
index 0000000..7332f65
--- /dev/null
+++ b/Project_Model/Tests/bpe_test.py
@@ -0,0 +1,52 @@
+from Project_Model.Libs.BPE.Enums import TokenType
+import Project_Model.Libs.BPE as BPE
+
+import re
+
+
+class TestBPE:
+
+    def test_bpe_encoding_simple(self):
+
+        TEXT = "abababab"
+
+        # ab = 256
+        # 256, 256 = 257
+
# 257, 257 = 258 + + VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} + EXPECTED = "abababab" + + BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY) + + DECODED = BPE_ENCODER.decode(INPUT) + + assert len(DECODED) == len(EXPECTED) + + for encoded, expected in zip(DECODED, EXPECTED): + assert encoded == expected + +# Useful to debug weird cases +if __name__ == "__main__": + TestBPE().test_bpe_decoding_simple() From 564b0d712ec571d2846ab7bba5b2c88c972236e9 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:05:03 +0200 Subject: [PATCH 19/75] Modified UML diagram --- Project_Model/UML/bpe.excalidraw.json | 57 ++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/Project_Model/UML/bpe.excalidraw.json b/Project_Model/UML/bpe.excalidraw.json index d706222..1a53327 100644 --- a/Project_Model/UML/bpe.excalidraw.json +++ b/Project_Model/UML/bpe.excalidraw.json @@ -482,20 +482,20 @@ "index": "aJ", "roundness": null, "seed": 1963214021, - "version": 422, - "versionNonce": 903841927, + "version": 464, + "versionNonce": 1104453739, "isDeleted": false, "boundElements": [], - "updated": 1758879973600, + "updated": 1759053302739, "link": null, "locked": false, - "text": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "text": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", "fontSize": 20, "fontFamily": 8, "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "originalText": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", "autoResize": true, "lineHeight": 1.25 }, @@ -541,7 +541,7 @@ "type": "text", "x": 396.375, "y": -107.75, - "width": 346.3997802734375, + "width": 396, "height": 100, "angle": 0, "strokeColor": "#1e1e1e", @@ -556,16 +556,16 @@ "index": "aM", "roundness": null, "seed": 1187595241, - "version": 128, - "versionNonce": 1487192455, + "version": 130, + "versionNonce": 1273030504, "isDeleted": false, "boundElements": [], - "updated": 1758879825591, + "updated": 1759070012771, "link": null, "locked": false, "text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ", "fontSize": 20, - "fontFamily": 5, + "fontFamily": 8, "textAlign": "left", "verticalAlign": "top", "containerId": null, @@ -646,6 +646,43 @@ "originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str", "autoResize": true, "lineHeight": 1.25 + }, + { + "id": "2UXjWdE_jMcsCE2oQgTXn", + "type": "text", + "x": -334.75, + "y": 1112.5, + "width": 165, + "height": 25, + "angle": 0, + 
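
A worked sketch of the byte-pair merge rounds implemented by the patch above, illustrative only and not part of the patch series; the toy merge table is assumed.

import Project_Model.Libs.BPE as BPE

# toy vocabulary: (a, b) -> 256, then (256, 256) -> 257
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257}
bpe = BPE.NanoSocratesBPE(VOCABULARY)

# "ababc": round 1 -> [256, 256, 99], round 2 -> [257, 99], then stable
print(bpe.encode("ababc"))                          # [257, 99]
print("".join(bpe.decode(t) for t in [257, 99]))    # 'ababc'
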
"strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aQ", + "roundness": null, + "seed": 700532363, + "version": 76, + "versionNonce": 1671597672, + "isDeleted": false, + "boundElements": [], + "updated": 1759070020002, + "link": null, + "locked": false, + "text": "class TokeNano:", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class TokeNano:", + "autoResize": true, + "lineHeight": 1.25 } ], "appState": { From 6ddb7de9da1af4fad8d8bae265f0622f56ba6bec Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 15:19:19 +0200 Subject: [PATCH 20/75] Added sqlAlchemy to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e87882c..70a3169 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ tzdata==2025.2 urllib3==2.5.0 wheel==0.45.1 Wikipedia-API==0.8.1 +SQLAlchemy From bd72ad3571bf2710cd154c5cf08b448dc194f13d Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 15:21:26 +0200 Subject: [PATCH 21/75] Added file to execute the complete cleaning pipeline --- .../data_output_models/bpe_corpus.py | 21 ++ .../rdf_completation_task.py | 26 +++ .../data_output_models/rdf_mask_task.py | 58 ++++++ .../data_output_models/rdf_text_tasks.py | 26 +++ Scripts/DataCleaning/filter.py | 184 ++++++++++++++++++ Scripts/DataCleaning/pipeline.py | 107 ++++++++++ .../Libs/CleaningPipeline/special_token.py | 21 ++ Scripts/Libs/CleaningPipeline/sql_endpoint.py | 144 ++++++++++++++ Scripts/Libs/Utils/dataframe_interaction.py | 9 + 9 files changed, 596 insertions(+) create mode 100644 Scripts/DataCleaning/data_output_models/bpe_corpus.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_completation_task.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_mask_task.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_text_tasks.py create mode 100644 Scripts/DataCleaning/filter.py create mode 100644 Scripts/DataCleaning/pipeline.py create mode 100644 Scripts/Libs/CleaningPipeline/special_token.py create mode 100644 Scripts/Libs/CleaningPipeline/sql_endpoint.py create mode 100644 Scripts/Libs/Utils/dataframe_interaction.py diff --git a/Scripts/DataCleaning/data_output_models/bpe_corpus.py b/Scripts/DataCleaning/data_output_models/bpe_corpus.py new file mode 100644 index 0000000..a0348b6 --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/bpe_corpus.py @@ -0,0 +1,21 @@ +from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken +import pandas as pd + +class BPE_corpus(): + + def __init__(self, output_path :str): + self.output_handler = open(output_path, "w") + + def close(self): + # add corpus end before closing + self.output_handler.write(SpecialToken.CORPUS_END.value) + self.output_handler.close() + + def write_from_str(self, output: str): + if output == '': + return + self.output_handler.write(output) + + def write_from_df(self, df: pd.DataFrame): + self.write_from_str(get_raw_from_dataframe(df)) \ No newline at end of file diff --git a/Scripts/DataCleaning/data_output_models/rdf_completation_task.py b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py new file mode 100644 index 0000000..111b2b9 --- /dev/null +++ 
b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_completation_task_dataset():
+    """
+    Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
+    Each RDF is saved as a string.
+    CSV Composition: ["MovieID","RDF"]
+    """
+    def __init__(self, output_path:str):
+
+
+        self.output = open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","RDF"]
+        """
+
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file

diff --git a/Scripts/DataCleaning/data_output_models/rdf_mask_task.py b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
new file mode 100644
index 0000000..01b943d
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+# do not worry about circular dependencies, this class will never call something else
+from Scripts.DataCleaning.filter import PipelineApplier
+
+class RDF_mask_task_dataset():
+    """
+    Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
+    For each RDF triple the CSV contains 3 rows, each one with a different component missing.
+    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
+    """
+    def __init__(self, output_path:str):
+
+        # these methods will only be used by this class, but they belong in a lower level
+        self._build_triple = PipelineApplier.build_triple
+        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
+
+        self.output = open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","IncompleteRDF","Missing","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        rdf_complete = self._build_triple(RDF)
+
+        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
+        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
+        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
+        ####
+        df_subject = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_subject,
+            "Missing": RDF["SubjectURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_relationship = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_relationship,
+            "Missing": RDF["RelationshipURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_object = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_object,
+            "Missing": RDF["ObjectURI"],
+            "RDF": rdf_complete,
+        })
+
+
+        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
+        output_df.to_csv(self.output, index=False, header=False)
+
+
diff --git a/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
new file mode 100644
index 0000000..918e600
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_text_task_dataset():
+    """
+    Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
+    In the CSV the RDFs are saved together as a single string.
+    CSV Composition: ["MovieID","RDFs","Abstract"]
+    """
+    def __init__(self, output_path:str):
+
+
+        self.output = open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDFs","Abstract"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+        """
+
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file

diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py
new file mode 100644
index 0000000..50d6ead
--- /dev/null
+++ b/Scripts/DataCleaning/filter.py
@@ -0,0 +1,184 @@
+# This file deletes the unwanted relationships in the pipeline, following different rules
+import pandas as pd
+import sqlite3
+import numpy as np
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+
+class PipelineApplier():
+
+    def __init__(self):
+
+        self.MOVIE_FILTER = pd.DataFrame()
+        self.REL_FILTER = pd.DataFrame()
+
+
+    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
+        return RDF[RDF["RelationshipURI"] != uri]
+
+    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
+        """Store RelationshipURI filters as a set """
+        self.relationship_filter_list: set[str] = set(filter_list)
+
+    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Remove rows whose RelationshipURI is in the stored filter. Generate it first by calling generate_list_relationship_filter"""
+        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
+
+
+    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
+        """
+        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
+        since this method creates that filter
+        Args:
+            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
+            min_threshold (int):
+            max_threshold (int):
+        """
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_threshold]
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_threshold]
+        self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]
+
+    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_threshold]
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_threshold]
+        self.REL_FILTER = REL_COUNT #["RelationshipURI"]
+
+    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
+        return RDF
+
+    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
+        return RDF
+
+    def rdf_add_special_token(self, RDF: pd.DataFrame):
+        """
+        Adds the RDF special tokens to each element of the triple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
+        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
+        It only adds the special tokens of the three elements of the RDF, no other special token.
+        Args:
+            RDF (pd.DataFrame):
+        Returns:
+            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        """
+        # if a filter run earlier sliced the RDF and created a View, copying here resolves the problem
+        # for more context: SettingWithCopyWarning
+        RDF = RDF.copy()
+        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
+        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
+        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
+        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
+        return RDF
+
+
+    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        # the dataset has SubjectURI RelationshipURI ObjectURI
+        # we want to drop the '' in them
+        # Replace empty strings with NaN
+        RDF = RDF.replace('', np.nan)
+        # Drop rows where any of the key columns are NaN
+        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
+        return RDF
+
+    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Collapse the triples of one movie into a single row.
+
+        Args:
+            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # to execute this method you must have iterated by movie_id,
+        # because by design we want one row for each movie at the end
+        # MovieID and abstract can be given as input for a more generic method
+        # movie_id = RDF["MovieID"].iloc(0)
+        # abstract = RDF["Abstract"].iloc(0)
+        # first let's combine each row, creating column Triple as the join of the rdf
+        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # special token
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add special token for: start of triple, end of triple and start of abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add special token for: start of triple, end of triple and start of abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+
+    @staticmethod
+    def build_triple(RDF: pd.DataFrame):
+        """
+        Obtains the joined RDF triple as one element, together with the START and END special tokens
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            pd.Series: RDF["Triple"] (just this column)
+        """
+        # let's combine each row, creating column Triple as the join of the rdf
+        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # special token
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_incomplete_triple(RDF: pd.DataFrame):
+        """
+        Method helper used for the third task: "Predicting
a masked component within an RDF triple". + Obtains joined RDF triple in one element, togheter with START and END special token. + The MISSING element will be replaced by the special token + Args: + RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"] + Returns: + RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME) + """ + # let's create a new column "Triple" with the joined RDF + + # the following creates a column of MASK token of the lenght of the dataframe, + # it is not needed since we expect to have a dataframe of just one column, but its more robust (AND SLOW) + MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index) + + RDF["Triple"] = ( + RDF.get("SubjectURI", MISSING) + + RDF.get("RelationshipURI", MISSING) + + RDF.get("ObjectURI", MISSING)) + # special token + RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value + return RDF["Triple"] + + @staticmethod + def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame: + # currently not used + """ + Method helper used for the third task: "Predicting a masked component within an RDF triple". + Given two Dataframe, the first containing the incompleted RDF and the other only the missing componment, + this methods applies the special token + Args: + RDF (pd.DataFrame): _description_ + + Returns: + pd.DataFrame: _description_ + """ + # take an example dataframe as ["SubjectURI",""] + # as input two dataframe, one with 2 column + return None + diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py new file mode 100644 index 0000000..e07294b --- /dev/null +++ b/Scripts/DataCleaning/pipeline.py @@ -0,0 +1,107 @@ +import re +from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint +from Scripts.DataCleaning.filter import PipelineApplier +# tasks dataset builder +from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset +from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus +from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset +from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset + +import pandas as pd + +class Pipeline(): + def __init__(self, output): + self.sql_endpoint = SqlEndpoint() + # classes to manage taskes' datasets + self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv") + self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt") + self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") + self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") + + # prepare the filter + # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset + self.filter_applier = PipelineApplier() + MOVIE_COUNT = self.sql_endpoint.get_movies_id_count() + REL_COUNT = self.sql_endpoint.get_relationship_count() + self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000) + self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) + # prepare the filter ot the relationshipURI you want to delete: + relationship_uri_banned_list = [ + "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract", + "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates", + "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", 
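A quick, self-contained illustration of the per-movie rebuild step above, with toy rows and placeholder marker strings (the real code uses the SpecialToken enum values, which are not reproduced here):

    import pandas as pd

    # Hypothetical stand-ins for the special tokens used by rebuild_by_movie.
    SOT, EOT, SOTL, ABS = "<sot>", "<eot>", "<sotl>", "<abs>"

    rdf = pd.DataFrame({
        "MovieID": [1, 1, 2],
        "SubjectURI": ["s1", "s1", "s2"],
        "RelationshipURI": ["r1", "r2", "r1"],
        "ObjectURI": ["o1", "o2", "o3"],
        "Abstract": ["abstract one", "abstract one", "abstract two"],
    })

    # Join each row into a single wrapped triple string.
    rdf["Triple"] = SOT + rdf["SubjectURI"] + rdf["RelationshipURI"] + rdf["ObjectURI"] + EOT

    # Collapse to one row per movie; MovieID <-> Abstract is 1:1, so both can key the groupby.
    out = rdf.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
    out["Triple"] = SOTL + out["Triple"]
    out["Abstract"] = ABS + out["Abstract"]
    print(out[["MovieID", "Triple", "Abstract"]])
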
"w3:2000/01/rdf-schema#comment", + "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"] + self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list) + + + def _end_file_handler(self): + self.task_bpe_corpus.close() + self.task_rdf_mask.close() + self.task_rdf_text.close() + self.task_rdf_completation.close() + + def _get_cleaned_movie_rows(self): + for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): + RDF = self.filter_applier.drop_na_from_dataset(RDF) + RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) + RDF = self.filter_applier.filter_by_frequency_relationship(RDF) + # other filter + # + RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + if RDF.empty: + continue + RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE + yield RDF + + def execute_task_bpe_corpus(self): + for RDF in self._get_cleaned_movie_rows(): + RDF = self.filter_applier.rebuild_by_movie(RDF) + RDF = RDF[["Triple","Abstract"]] + self.task_bpe_corpus.write_from_df(RDF) + self._end_file_handler() + + + def execute_task_rdf_mask(self): + for RDF in self._get_cleaned_movie_rows(): + self.task_rdf_mask.write(RDF) + self._end_file_handler() + + def execute_tasks_rdf_text(self): + for RDF in self._get_cleaned_movie_rows(): + RDF = self.filter_applier.rebuild_by_movie(RDF) + self.task_rdf_text.write(RDF) + self._end_file_handler() + + def execute_task_rdf_completation(self): + for RDF in self._get_cleaned_movie_rows(): + RDF["Triple"] = self.filter_applier.build_triple(RDF) + self.task_rdf_completation.write(RDF[["MovieID","Triple"]]) + self._end_file_handler() + + + def execute_all_task(self): + for RDF in self._get_cleaned_movie_rows(): + self.task_rdf_mask.write(RDF) + + RDF["Triple"] = self.filter_applier.build_triple(RDF) + self.task_rdf_completation.write(RDF[["MovieID","Triple"]]) + + RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]]) + + self.task_rdf_text.write(RDF) + self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]]) + + self._end_file_handler() + + + + + + + +pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt") +# pipeline.execute_task_bpe_corpus() +# pipeline.execute_task_rdf_mask() +# pipeline.execute_tasks_rdf_text() +# pipeline.execute_task_rdf_completation() +pipeline.execute_all_task() \ No newline at end of file diff --git a/Scripts/Libs/CleaningPipeline/special_token.py b/Scripts/Libs/CleaningPipeline/special_token.py new file mode 100644 index 0000000..644ad71 --- /dev/null +++ b/Scripts/Libs/CleaningPipeline/special_token.py @@ -0,0 +1,21 @@ +from enum import Enum + +class SpecialToken(str, Enum): + # (Enum, str) -> throws an error + START_TRIPLE_LIST = "" + START_TRIPLE = "" + END_TRIPLE = "" + SUBJECT = "" + RELATIONSHIP = "" + OBJECT = "" + ABSTRACT = "" + CORPUS_END = "" + + ## Tasks' Token + RDF_TO_TEXT = "" + TEXT_TO_RDF = "" + CONTINUE_RDF = "" + MASK = "" + + #BPE Training: + \ No newline at end of file diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py new file mode 100644 index 0000000..4e43528 --- /dev/null +++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py @@ -0,0 +1,144 @@ +####################################################### +# This file stand as endpoint to interact with DB # +####################################################### + +# import sqlite3 +import pandas as pd +from sqlalchemy import create_engine +from 
diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
new file mode 100644
index 0000000..4e43528
--- /dev/null
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -0,0 +1,144 @@
+#######################################################
+# This file stands as the endpoint to interact with DB#
+#######################################################
+
+# import sqlite3
+import pandas as pd
+from sqlalchemy import create_engine
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+
+
+class SqlEndpoint():
+
+    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
+        # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
+        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
+        # /// 3 slashes -> relative path
+        # //// 4 slashes -> absolute
+        # self.conn = self.sql_engine.connect().execution_options(stream_results=True)
+        # it seems that sqlite doenst support streamer cursor
+        # PRAGMA exeutes better in writing not reading
+        self.chunk_size_row = chunk_size_row
+        pass
+
+    def get_RDF(self) -> pd.DataFrame :
+
+        QUERY = """
+            SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
+            FROM RDFs
+            INNER JOIN Subjects USING (SubjectID)
+            INNER JOIN Relationships USING (RelationshipID)
+            INNER JOIN Objects USING (ObjectID);
+        """
+
+        return pd.read_sql_query(QUERY, self.sql_engine)  # was self.CONN, which is no longer defined
+
+    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
+        """
+        Returns:
+            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+        """
+
+        QUERY = """
+            SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+            FROM RDFs
+            INNER JOIN ParsedSubjects USING (SubjectID)
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            INNER JOIN WikipediaAbstracts USING (MovieID);
+        """
+
+        # return pd.read_sql_query(QUERY, self.CONN, chunksize=500)
+        # sqlite3
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
+
+
+    def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
+        # DEPRECATED !
+        start_token = SpecialToken()
+        QUERY = """
+            SELECT
+                MovieID,
+                ? || SubjectURI AS SubjectURI,
+                ? || RelationshipURI AS RelationshipURI,
+                ? || ObjectURI AS ObjectURI,
+                Abstract
+            FROM RDFs
+            INNER JOIN ParsedSubjects USING (SubjectID)
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            INNER JOIN WikipediaAbstracts USING (MovieID);
+        """
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
+
+    def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
+        """
+        Yields one DataFrame per movie (with all of its rows in the dataset).
+        The retrieved RDFs are already abbreviated by the sql parser
+        Yields:
+            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
+        """
+        # chunk by movieId, abstract is the same and some intersting logic are appliable
+        movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
+        # CHOOSEN MOVIE:
+        # The Dark Knight : 117248
+        # Inception : 147074
+        # The Avengers : 113621
+        # Cast Away : 1123
+        # The Departed : 117586
+        # American Psycho : 90177
+        # Avatar : 71587
+        # Django Unchained : 138952
+        # Spirited Away : 144137
+        # Knives Out : 148025
+        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        movie_ids = movie_list
+
+        QUERY = """
+            SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+            FROM RDFs
+            INNER JOIN ParsedSubjects USING (SubjectID)
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            INNER JOIN WikipediaAbstracts USING (MovieID)
+            WHERE MovieID = (?);
+        """
+
+        for movie_id in movie_ids:
+            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
+
+    def get_movies_id_count(self) -> pd.DataFrame:
+        """
+        Gets the count of each Movie in the Dataset
+        Returns:
+            Pandas.DataFrame: [MovieID, Count]
+        """
+        QUERY = """
+            SELECT MovieID, COUNT(*) AS Count
+            FROM RDFs
+            GROUP BY MovieID;
+        """
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+    def get_relationship_count(self) -> pd.DataFrame:
+        """
+        Gets the count of each Relationship in the Dataset
+        Returns:
+            Pandas.DataFrame: [RelationshipURI, Count]
+        """
+        QUERY = """
+            SELECT RelationshipURI, COUNT(*) AS Count
+            FROM RDFs
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            GROUP BY RelationshipURI;
+        """
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+
+
+if __name__ == "__main__" :
+    sql_endpoint = SqlEndpoint()
+    for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
+        print(pandas_row)
+    # sql_endpoint.get_RDF()
+    print("done")
\ No newline at end of file
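The per-movie streaming pattern of get_abbreviated_dataset_by_movie_id can be reproduced in miniature with the standard library alone; a rough sketch (in-memory sqlite3 instead of the SQLAlchemy engine on dataset.db, and a reduced schema):

    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({
        "MovieID": [1, 1, 2],
        "SubjectURI": ["s1", "s1", "s2"],
        "RelationshipURI": ["r1", "r2", "r1"],
        "ObjectURI": ["o1", "o2", "o3"],
    }).to_sql("RDFs", conn, index=False)

    QUERY = "SELECT * FROM RDFs WHERE MovieID = ?"

    def by_movie(movie_ids):
        # One parameterized query per movie, one DataFrame per yield.
        for movie_id in movie_ids:
            yield pd.read_sql_query(QUERY, conn, params=(movie_id,))

    for frame in by_movie([1, 2]):
        print(frame)

Chunking by MovieID keeps each DataFrame small and guarantees the Abstract is constant within a chunk, which is what the grouping logic downstream relies on.
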
diff --git a/Scripts/Libs/Utils/dataframe_interaction.py b/Scripts/Libs/Utils/dataframe_interaction.py
new file mode 100644
index 0000000..c4df33a
--- /dev/null
+++ b/Scripts/Libs/Utils/dataframe_interaction.py
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+
+def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
+    output = ''
+    for row in DF.itertuples(index=False, name=None):
+        output += "".join(map(str, row))
+    return output

From 8167c9d435b15a4f189d57b6644a376fed2f2e2c Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Mon, 29 Sep 2025 16:03:49 +0200
Subject: [PATCH 22/75] Added Toy Dataset entry point into the Pipeline class

Before, it was forced into the sql_endpoint; now the whole pipeline can be
managed in the Pipeline class
---
 Scripts/DataCleaning/pipeline.py              | 78 ++++++++++++-------
 Scripts/Libs/CleaningPipeline/sql_endpoint.py | 12 +--
 2 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
index e07294b..eb5b2f7 100644
--- a/Scripts/DataCleaning/pipeline.py
+++ b/Scripts/DataCleaning/pipeline.py
@@ -10,22 +10,22 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co
 import pandas as pd
 
 class Pipeline():
-    def __init__(self, output):
+    def __init__(self):
         self.sql_endpoint = SqlEndpoint()
         # classes to manage taskes' datasets
-        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv")
-        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt")
+ self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv") + self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt") self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") # prepare the filter - # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset + # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset self.filter_applier = PipelineApplier() MOVIE_COUNT = self.sql_endpoint.get_movies_id_count() REL_COUNT = self.sql_endpoint.get_relationship_count() self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000) self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) - # prepare the filter ot the relationshipURI you want to delete: + # prepare the filter on the relationshipURI you want to delete: relationship_uri_banned_list = [ "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract", "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates", @@ -34,25 +34,6 @@ class Pipeline(): self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list) - def _end_file_handler(self): - self.task_bpe_corpus.close() - self.task_rdf_mask.close() - self.task_rdf_text.close() - self.task_rdf_completation.close() - - def _get_cleaned_movie_rows(self): - for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): - RDF = self.filter_applier.drop_na_from_dataset(RDF) - RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) - RDF = self.filter_applier.filter_by_frequency_relationship(RDF) - # other filter - # - RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) - if RDF.empty: - continue - RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE - yield RDF - def execute_task_bpe_corpus(self): for RDF in self._get_cleaned_movie_rows(): RDF = self.filter_applier.rebuild_by_movie(RDF) @@ -66,12 +47,14 @@ class Pipeline(): self.task_rdf_mask.write(RDF) self._end_file_handler() + def execute_tasks_rdf_text(self): for RDF in self._get_cleaned_movie_rows(): RDF = self.filter_applier.rebuild_by_movie(RDF) self.task_rdf_text.write(RDF) self._end_file_handler() + def execute_task_rdf_completation(self): for RDF in self._get_cleaned_movie_rows(): RDF["Triple"] = self.filter_applier.build_triple(RDF) @@ -92,14 +75,55 @@ class Pipeline(): self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]]) self._end_file_handler() - + + + def _end_file_handler(self): + self.task_bpe_corpus.close() + self.task_rdf_mask.close() + self.task_rdf_text.close() + self.task_rdf_completation.close() + + + def _get_cleaned_movie_rows(self): + for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): + RDF = self.filter_applier.drop_na_from_dataset(RDF) + RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) + RDF = self.filter_applier.filter_by_frequency_relationship(RDF) + # other filter + # + RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + if RDF.empty: + continue + RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE + yield RDF + + + def use_toy_dataset(self): + # CHOOSEN MOVIE: + # The Dark Knight : 117248 + # Inception : 147074 + # The Avengers : 113621 + # Cast Away : 1123 + # The Departed : 117586 + # American 
Psycho : 90177 + # Avatar : 71587 + # Django Unchained : 138952 + # Spirited Away : 144137 + # Knives Out : 148025 + movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + self.sql_endpoint.movie_ids = movie_list +# there are a lot of settings to manage +# you only need to change settings: +# in the init for file paths, frequency filter limit, banned reletionshipURI +# in the use_toy_dataset , to change the toy dataset +# in _get_cleaned_movie_rows: to change how the pipeline behave +pipeline = Pipeline() - -pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt") +# pipeline.use_toy_dataset() # pipeline.execute_task_bpe_corpus() # pipeline.execute_task_rdf_mask() # pipeline.execute_tasks_rdf_text() diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py index 4e43528..66ba1ea 100644 --- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py +++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py @@ -18,8 +18,8 @@ class SqlEndpoint(): # self.conn = self.sql_engine.connect().execution_options(stream_results=True) # it seems that sqlite doenst support streamer cursor # PRAGMA exeutes better in writing not reading - self.chunk_size_row = chunk_size_row - pass + self.chunk_size_row = chunk_size_row # not used now, since each chunk is a movie + self.movie_ids = movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] def get_RDF(self) -> pd.DataFrame : @@ -79,7 +79,7 @@ class SqlEndpoint(): Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract] """ # chunk by movieId, abstract is the same and some intersting logic are appliable - movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] + # movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] # CHOOSEN MOVIE: # The Dark Knight : 117248 # Inception : 147074 @@ -91,8 +91,8 @@ class SqlEndpoint(): # Django Unchained : 138952 # Spirited Away : 144137 # Knives Out : 148025 - movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] - movie_ids = movie_list + # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + # movie_ids = movie_list QUERY = """ SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract @@ -104,7 +104,7 @@ class SqlEndpoint(): WHERE MovieID = (?); """ - for movie_id in movie_ids: + for movie_id in self.movie_ids: yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,)) def get_movies_id_count(self) -> pd.DataFrame: From 255d8a072d8e95920bbb723c4536f454a741ab02 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 16:59:52 +0200 Subject: [PATCH 23/75] First implementation of the cleaning pipeline UML --- .../cleaning-pipeline.excalidraw.json | 634 ++++++++++++++++++ 1 file changed, 634 insertions(+) create mode 100644 Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json new file mode 100644 index 0000000..1249185 --- /dev/null +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -0,0 +1,634 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "JNB9z-PeqZ4s8KDfWaoXe", + "type": "rectangle", + "x": 106, + "y": 27, + "width": 653, + "height": 263, + "angle": 0, + 
"strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 710740889, + "version": 326, + "versionNonce": 1107631703, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false + }, + { + "id": "e13wNTgUpn2flMpmMttqx", + "type": "text", + "x": 200.5943407656526, + "y": 44.07937975075269, + "width": 307.2781467269385, + "height": 23.3097531902191, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": null, + "seed": 1012740663, + "version": 444, + "versionNonce": 589551257, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false, + "text": "Libs/CleaningPipeline/sql_endpoint", + "fontSize": 18.64780255217528, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Libs/CleaningPipeline/sql_endpoint", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "CgxCElJkKBtIHv-5WQrbo", + "type": "text", + "x": 195, + "y": 80.44259472749451, + "width": 403.64997665852184, + "height": 186.4780255217528, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": null, + "seed": 1261951799, + "version": 507, + "versionNonce": 1922906999, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false, + "text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n", + "fontSize": 18.64780255217528, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "line", + "version": 4978, + "versionNonce": 2079525497, + "isDeleted": false, + "id": "sYReMTdYblr-oJtYYJALU", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -68.05426555317842, + "y": 87.19293561900287, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.09201683999922, + "height": 99.49948667804088, + "seed": 1263944119, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0.2542098813493443, + 75.20117273657175 + ], + [ + 0.011896425679918422, + 83.76249969444815 + ], + [ + 3.970409367559332, + 87.46174320643391 + ], + [ + 17.75573317066317, + 90.59250103325854 + ], + [ + 41.05683533152865, + 91.56737225214069 + ], + [ + 63.319497586673116, + 90.01084754868091 + ], + [ + 75.14781395923075, + 86.28844687220405 + ], + [ + 76.81603792670788, + 83.15042405259751 + ], + [ + 77.05033394391478, + 76.25776215104557 + ], + [ + 76.86643881413028, + 
6.3089586511537865 + ], + [ + 76.45188016352971, + -0.2999144698665015 + ], + [ + 71.50179495549581, + -3.9936571317850627 + ], + [ + 61.077971898861186, + -6.132877429442784 + ], + [ + 37.32348754161154, + -7.932114425900202 + ], + [ + 18.278415656797975, + -6.859225353587373 + ], + [ + 3.2995959613238286, + -3.2201165291205287 + ], + [ + -0.04168289608444441, + -0.045185660461322996 + ], + [ + 0, + 0 + ] + ], + "index": "a6", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 2683, + "versionNonce": 33379161, + "isDeleted": false, + "id": "0S6dEWQVqKUVkP6Z5IX1l", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -67.53033611490343, + "y": 144.31921927673278, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.17198221193564, + "height": 8.562348957853036, + "seed": 817033943, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 2.033150371639873, + 3.413095389435587 + ], + [ + 10.801287372573954, + 6.276651055277943 + ], + [ + 22.468666942209353, + 8.010803051612635 + ], + [ + 40.747074201802775, + 8.168828515515864 + ], + [ + 62.077348233027564, + 7.0647721921469495 + ], + [ + 74.53446931782398, + 3.04824021069218 + ], + [ + 77.17198221193564, + -0.3935204423371723 + ] + ], + "index": "a7", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 2769, + "versionNonce": 1703641145, + "isDeleted": false, + "id": "szGLND7J0nVOvRkNXX9AS", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -68.56219343740725, + "y": 115.35516394150972, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.17198221193564, + "height": 8.562348957853036, + "seed": 1704755191, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 2.033150371639873, + 3.413095389435587 + ], + [ + 10.801287372573954, + 6.276651055277943 + ], + [ + 22.468666942209353, + 8.010803051612635 + ], + [ + 40.747074201802775, + 8.168828515515864 + ], + [ + 62.077348233027564, + 7.0647721921469495 + ], + [ + 74.53446931782398, + 3.04824021069218 + ], + [ + 77.17198221193564, + -0.3935204423371723 + ] + ], + "index": "a8", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 5766, + "versionNonce": 344002841, + "isDeleted": false, + "id": "O3t2uGktJlDd1_OX_bpV4", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -69.6201424194893, + "y": 80.06066699332126, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 76.59753601865496, + "height": 15.49127539284798, + "seed": 471296279, + "groupIds": [ + 
"9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [ + "bxuMGTzXLn7H-uBCptINx" + ], + "index": "a9", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1176, + "versionNonce": 1951499769, + "isDeleted": false, + "id": "_SzKlOBOvJgBg7FX0JTTM", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -33.12815531426679, + "y": 104.53733467322485, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1368927799, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aA", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1464, + "versionNonce": 1879072473, + "isDeleted": false, + "id": "oJMl2Kxa3SPaiAY0kxo7A", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -32.77701353033319, + "y": 130.75394896028996, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1627606871, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aB", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1347, + "versionNonce": 1176574905, + "isDeleted": false, + "id": "fB6pJBSMA-pRHrpgYKaLL", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 6.239590202363168, + "x": -32.12815531426679, + "y": 159.52267553159635, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1420643447, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aC", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 845, + "versionNonce": 383204505, + "isDeleted": false, + "id": "9gZ3Yy1MeP9kEOTLODqLG", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -77.72012292771115, + "y": 181.11281713043917, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 95.63072204589844, + "height": 23.595161071904883, + "seed": 2019206551, + "groupIds": [ + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "fontSize": 17.4778970902999, + "fontFamily": 1, + "text": "dataset.db", + "baseline": 16.595161071904883, + "textAlign": "center", + "verticalAlign": "top", + "index": "aD", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false, + "containerId": null, + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.350000000000001 + }, + { + "id": "3eOw20xMhpB5jf_RMG24P", + "type": "text", + "x": 1131.3333333333335, + "y": 31.333333333333428, + 
"width": 508.3333333333333, + "height": 550, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 1535658041, + "version": 821, + "versionNonce": 1630266809, + "isDeleted": false, + "boundElements": null, + "updated": 1759157181677, + "link": null, + "locked": false, + "text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "Fbl1gpb5r7QrdRauGUWm2", + "type": "text", + "x": 158.23809523809535, + "y": 502.52380952380935, + "width": 484.2857142857143, + "height": 475, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aF", + "roundness": null, + "seed": 2066618807, + "version": 541, + "versionNonce": 7392153, + "isDeleted": false, + "boundElements": null, + "updated": 1759157954202, + "link": null, + "locked": false, + "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "autoResize": false, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file 
From c319398ca01f10f5a2099219146649390cfec4a9 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 17:03:31 +0200 Subject: [PATCH 24/75] little update to UML pipeline --- .../cleaning-pipeline.excalidraw.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json index 1249185..a3b4660 100644 --- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -592,7 +592,7 @@ "x": 158.23809523809535, "y": 502.52380952380935, "width": 484.2857142857143, - "height": 475, + "height": 500, "angle": 0, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", @@ -606,20 +606,20 @@ "index": "aF", "roundness": null, "seed": 2066618807, - "version": 541, - "versionNonce": 7392153, + "version": 552, + "versionNonce": 1269344823, "isDeleted": false, "boundElements": null, - "updated": 1759157954202, + "updated": 1759158199532, "link": null, "locked": false, - "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", "fontSize": 20, "fontFamily": 5, "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", "autoResize": false, "lineHeight": 1.25 } From 007f1e955405ba466ab68ac0c7da656c3edca905 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 18:53:33 +0200 Subject: [PATCH 25/75] minor updates --- .vscode/settings.json | 23 ++++++- .../cleaning-pipeline.excalidraw.json | 64 +++++++++---------- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 17ae78b..226939d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,24 @@ { - "jupyter.notebookFileRoot": "${workspaceFolder}" + // Always treat the project 
root as the working dir for Jupyter + "jupyter.notebookFileRoot": "${workspaceFolder}", + + // When you click "Run Python File in Terminal", DON'T cd into the file's folder + "python.terminal.executeInFileDir": false, + + // Start new integrated terminals at the project root + "terminal.integrated.cwd": "${workspaceFolder}", + + // Ensure Python can import from the project root no matter which file you run + // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed. + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + + // Make pytest run from the root without needing a pytest.ini + "python.testing.pytestEnabled": true, + "python.testing.cwd": "${workspaceFolder}", + "python.testing.pytestArgs": ["src/test"], + + // Help Pylance resolve imports like `from src...` without red squiggles + "python.analysis.extraPaths": ["${workspaceFolder}"] } \ No newline at end of file diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json index a3b4660..c7019f5 100644 --- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -109,8 +109,8 @@ }, { "type": "line", - "version": 4978, - "versionNonce": 2079525497, + "version": 4979, + "versionNonce": 1473849177, "isDeleted": false, "id": "sYReMTdYblr-oJtYYJALU", "fillStyle": "solid", @@ -119,7 +119,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -68.05426555317842, + "x": -67.14432426259049, "y": 87.19293561900287, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -221,14 +221,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "line", - "version": 2683, - "versionNonce": 33379161, + "version": 2684, + "versionNonce": 952947769, "isDeleted": false, "id": "0S6dEWQVqKUVkP6Z5IX1l", "fillStyle": "solid", @@ -237,7 +237,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -67.53033611490343, + "x": -66.6203948243155, "y": 144.31921927673278, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -295,14 +295,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "line", - "version": 2769, - "versionNonce": 1703641145, + "version": 2770, + "versionNonce": 477619481, "isDeleted": false, "id": "szGLND7J0nVOvRkNXX9AS", "fillStyle": "solid", @@ -311,7 +311,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -68.56219343740725, + "x": -67.65225214681931, "y": 115.35516394150972, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -369,14 +369,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 5766, - "versionNonce": 344002841, + "version": 5767, + "versionNonce": 2119031289, "isDeleted": false, "id": "O3t2uGktJlDd1_OX_bpV4", "fillStyle": "solid", @@ -385,7 +385,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -69.6201424194893, + "x": -68.71020112890136, "y": 80.06066699332126, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -404,14 +404,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1176, - "versionNonce": 1951499769, + "version": 1177, + "versionNonce": 525480665, "isDeleted": false, "id": 
"_SzKlOBOvJgBg7FX0JTTM", "fillStyle": "solid", @@ -420,7 +420,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -33.12815531426679, + "x": -32.218214023678854, "y": 104.53733467322485, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -437,14 +437,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1464, - "versionNonce": 1879072473, + "version": 1465, + "versionNonce": 1410887609, "isDeleted": false, "id": "oJMl2Kxa3SPaiAY0kxo7A", "fillStyle": "solid", @@ -453,7 +453,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -32.77701353033319, + "x": -31.867072239745255, "y": 130.75394896028996, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -470,14 +470,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1347, - "versionNonce": 1176574905, + "version": 1348, + "versionNonce": 314839193, "isDeleted": false, "id": "fB6pJBSMA-pRHrpgYKaLL", "fillStyle": "solid", @@ -486,7 +486,7 @@ "roughness": 1, "opacity": 100, "angle": 6.239590202363168, - "x": -32.12815531426679, + "x": -31.218214023678854, "y": 159.52267553159635, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -503,14 +503,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "text", - "version": 845, - "versionNonce": 383204505, + "version": 846, + "versionNonce": 1091081593, "isDeleted": false, "id": "9gZ3Yy1MeP9kEOTLODqLG", "fillStyle": "solid", @@ -519,7 +519,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -77.72012292771115, + "x": -76.81018163712321, "y": 181.11281713043917, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -541,7 +541,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false, "containerId": null, From 18fc2ba9d810602b77e7e3cc76eaeaa5cbc492f5 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:32:24 +0200 Subject: [PATCH 26/75] Added Exceptions --- Project_Model/Libs/BPE/Errors/DuplicateWordException.py | 4 ++++ Project_Model/Libs/BPE/Errors/SentenceTooLongException.py | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 Project_Model/Libs/BPE/Errors/DuplicateWordException.py create mode 100644 Project_Model/Libs/BPE/Errors/SentenceTooLongException.py diff --git a/Project_Model/Libs/BPE/Errors/DuplicateWordException.py b/Project_Model/Libs/BPE/Errors/DuplicateWordException.py new file mode 100644 index 0000000..885ff5f --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/DuplicateWordException.py @@ -0,0 +1,4 @@ +class DuplicateWordException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Errors/SentenceTooLongException.py b/Project_Model/Libs/BPE/Errors/SentenceTooLongException.py new file mode 100644 index 0000000..f2d7c9e --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/SentenceTooLongException.py @@ -0,0 +1,4 @@ +class SentenceTooLongException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file From 2fe1ce9e9aac43cf16ba390cec1eff6b66e97e52 Mon Sep 17 00:00:00 2001 From: 
Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:32:37 +0200 Subject: [PATCH 27/75] Updated Inits --- Project_Model/Libs/BPE/Classes/__init__.py | 6 ++++-- Project_Model/Libs/BPE/Errors/__init__.py | 6 +++++- Project_Model/Libs/BPE/Utils/__init__.py | 7 +++++++ Project_Model/Libs/BPE/__init__.py | 2 ++ 4 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 Project_Model/Libs/BPE/Utils/__init__.py diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index d8a7364..a52b024 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -1,9 +1,11 @@ from .NanoSocratesChunker import NanoSocratesChunker from .NanoSocratesSplitter import NanoSocratesSplitter -from .NanoSocratesBPE import NanoSocratesBPE +from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE +from .NanoSocraTrainer import NanoSocraTrainer __all__ = [ "NanoSocratesChunker", "NanoSocratesSplitter", - "NanoSocratesBPE" + "NanoSocratesBPE", + "NanoSocraTrainer" ] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Errors/__init__.py b/Project_Model/Libs/BPE/Errors/__init__.py index 587873f..262c27d 100644 --- a/Project_Model/Libs/BPE/Errors/__init__.py +++ b/Project_Model/Libs/BPE/Errors/__init__.py @@ -1,7 +1,11 @@ from .DelimiterNotFoundException import DelimiterNotFoundException from .OutOfDictionaryException import OutOfDictionaryException +from .DuplicateWordException import DuplicateWordException +from .SentenceTooLongException import SentenceTooLongException __all__ = [ "DelimiterNotFoundException", - "OutOfDictionaryException" + "OutOfDictionaryException", + "DuplicateWordException", + "SentenceTooLongException" ] diff --git a/Project_Model/Libs/BPE/Utils/__init__.py b/Project_Model/Libs/BPE/Utils/__init__.py new file mode 100644 index 0000000..f2320fa --- /dev/null +++ b/Project_Model/Libs/BPE/Utils/__init__.py @@ -0,0 +1,7 @@ +from .special_regex_maker import special_regex_maker +from .lag_checker_iterator import iterator_with_checks + +__all__ = [ + "special_regex_maker", + "iterator_with_checks" +] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/__init__.py b/Project_Model/Libs/BPE/__init__.py index 6f7d1f2..074133b 100644 --- a/Project_Model/Libs/BPE/__init__.py +++ b/Project_Model/Libs/BPE/__init__.py @@ -1,7 +1,9 @@ from .Classes import * from .Enums import * from .Errors import * +from .Utils import * from . import Classes from . import Enums from . import Errors +from . 
import Utils

From 7020c9e68366e0adeabc8f09babfea784d0d7019 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:33:12 +0200
Subject: [PATCH 28/75] Added utils to make regexps and iterators that check for last element

---
 .../Libs/BPE/Utils/lag_checker_iterator.py    | 27 +++++++++++++++++++
 .../Libs/BPE/Utils/special_regex_maker.py     |  9 +++++++
 2 files changed, 36 insertions(+)
 create mode 100644 Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
 create mode 100644 Project_Model/Libs/BPE/Utils/special_regex_maker.py

diff --git a/Project_Model/Libs/BPE/Utils/lag_checker_iterator.py b/Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
new file mode 100644
index 0000000..28bbade
--- /dev/null
+++ b/Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
@@ -0,0 +1,27 @@
+from collections import deque
+from typing import Generator, TypeVar
+
+T1 = TypeVar("T1")
+T2 = TypeVar("T2")
+T3 = TypeVar("T3")
+
+
+def iterator_with_checks(
+    generator: Generator[T1, T2, T3],
+) -> Generator[tuple[T1, bool], T2, T3]:
+
+    # We do not catch the first StopIteration here:
+    # if the generator is empty, we propagate it
+    last_element = next(generator)
+
+    while True:
+
+        RETURN_ELEMENT = last_element
+        try:
+            element = next(generator)
+            last_element = element
+            yield (RETURN_ELEMENT, False)
+
+        except StopIteration:
+            yield (RETURN_ELEMENT, True)
+            break

diff --git a/Project_Model/Libs/BPE/Utils/special_regex_maker.py b/Project_Model/Libs/BPE/Utils/special_regex_maker.py
new file mode 100644
index 0000000..414eabf
--- /dev/null
+++ b/Project_Model/Libs/BPE/Utils/special_regex_maker.py
@@ -0,0 +1,9 @@
+import re
+
+
+def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
+
+    REGEX_STR = "|".join(special_tokens)
+
+    return re.compile(REGEX_STR)
+
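A short usage sketch for the iterator utility just added (assuming the Project_Model package is importable from the project root; the toy generator is illustrative):

    from Project_Model.Libs.BPE.Utils import iterator_with_checks

    def numbers():
        yield from (10, 20, 30)

    for value, is_last in iterator_with_checks(numbers()):
        print(value, is_last)
    # 10 False
    # 20 False
    # 30 True

One caveat on special_regex_maker: it joins the tokens with "|" without re.escape, so special tokens containing regex metacharacters would need escaping before being passed in.
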
From c9032cab093820600621a4215ff3ee312ea20752 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:33:28 +0200
Subject: [PATCH 29/75] Added fit method

---
 .../Libs/BPE/Classes/NanoSocratesBPE.py       | 81 +++++++++++++++++--
 1 file changed, 76 insertions(+), 5 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
index 844e860..c7f89ce 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -1,11 +1,16 @@
 from .Encoder import Encoder
-from ..Errors import OutOfDictionaryException
+from ..Errors import OutOfDictionaryException, DuplicateWordException
 
 
 class NanoSocratesBatchMemoryBPE:
 
-    def __init__(self) -> None:
-        pass
+    def __init__(
+        self,
+        frequencies: dict[tuple[int, int], int],
+        merge_treshold: int
+    ) -> None:
+        self.frequencies = frequencies
+        self.merge_treshold = merge_treshold
 
 
 class NanoSocratesBPE(Encoder):
@@ -22,12 +27,66 @@ class NanoSocratesBPE(Encoder):
         for key, value in vocabulary.items():
             if value < 256:
                 raise OutOfDictionaryException()
+            # TODO: check if they are in order
             self.__vocabulary[key] = value
             self.__reverse_vocabulary[value] = key
+
+    @property
+    def vocabulary_size(self):
+        return len(self.__vocabulary) + 255
+
+    @property
+    def vocabulary(self):
+        return self.__vocabulary
+
+    @property
+    def __next_id(self):
+        return self.vocabulary_size + 1
+
     # TODO: implement fit
-    def fit():
-        pass
+    def fit(
+        self,
+        chunk_data: list[int],
+        memory: NanoSocratesBatchMemoryBPE,
+        last_batch: bool
+    ):
+
+        ENCODED_CHUNK = self.__round_encode(chunk_data)
+        DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
+
+        for i in range(0, DATA_LEN_BEFORE_LAST):
+            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1])
+
+            frequency = memory.frequencies.get(CANDIDATE_COUPLE)
+
+            # Initialize frequency
+            if frequency is None:
+                frequency = 0
+                memory.frequencies[CANDIDATE_COUPLE] = 0
+
+            frequency += 1
+            memory.frequencies[CANDIDATE_COUPLE] = frequency
+
+        if not last_batch:
+            return (self, memory, ENCODED_CHUNK)
+
+        if len(memory.frequencies) < 1:
+            return (self, memory, ENCODED_CHUNK)
+
+        FREQUENCIES = memory.frequencies
+        MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
+        FREQUENCY = FREQUENCIES[MAX_COUPLE]
+
+        if FREQUENCY < memory.merge_treshold:
+            return (self, memory, ENCODED_CHUNK)
+
+        self.__learn_word(MAX_COUPLE)
+
+        return (self, memory, ENCODED_CHUNK)
+
+
+
     def encode(self, piece: str) -> list[int]:
@@ -104,3 +163,15 @@ class NanoSocratesBPE(Encoder):
             raise OutOfDictionaryException()
 
         return CANDIDATE_DECODED
+
+    def __learn_word(self, words: tuple[int, int]):
+
+        ID = self.__next_id
+
+        DUPLICATE = self.__vocabulary.get(words)
+
+        if DUPLICATE is not None:
+            raise DuplicateWordException()
+
+        self.__vocabulary[words] = ID
+        self.__reverse_vocabulary[ID] = words

From b09bd4acbaf079aef24a55bb61c399997b8256f2 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:33:40 +0200
Subject: [PATCH 30/75] Created trainer to train BPE

---
 .../Libs/BPE/Classes/NanoSocraTrainer.py      | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py
new file mode 100644
index 0000000..1d6d429
--- /dev/null
+++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py
@@ -0,0 +1,164 @@
+from collections import deque
+from pathlib import Path
+import re
+from ..Classes import NanoSocratesBPE, NanoSocratesChunker, NanoSocratesSplitter, NanoSocratesBatchMemoryBPE
+from ..Enums import TokenType
+from ..Utils import special_regex_maker, iterator_with_checks
+
+
+class NanoSocraTrainer:
+
+    def __init__(
+        self,
+        max_vocabulary: int,
+        special_vocabulary: list[str],
+        chunk_size: int,
+        merge_treshold: int = 0,
+        max_iterations: int = 0,
+    ) -> None:
+        # Bytes
+        BYTE_RESERVED_TOKENS = 256
+        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
+        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
+
+        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
+        self.__max_iterations = max_iterations
+        self.__chunk_size = chunk_size
+        self.__merge_treshold = merge_treshold
+        self.__special_token_regex = special_regex_maker(special_vocabulary)
+
+    def trainBPE(
+        self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None
+    ) -> NanoSocratesBPE:
+
+        if not path.is_file():
+            raise FileNotFoundError()
+
+        if not cache_dir.is_dir():
+            raise NotADirectoryError()
+
+        if bpe is None:
+            bpe = NanoSocratesBPE()
+        BPE = bpe
+
+        if BPE.vocabulary_size > self.__max_vocabulary:
+            return BPE
+
+        exit = False
+        cached = False
+        current_iteration = 0
+
+        PATH_GEN = self.__switch_paths(path, cache_dir)
+
+        input_path = next(PATH_GEN)
+
+        while not exit:
+
+
+            out_path = next(PATH_GEN)
+            current_iteration = self.__increment_counter(current_iteration)
+            LAST_VOC_SIZE = BPE.vocabulary_size
+
+            FILE = open(out_path, "w")
+
+            for _, _, output in self.__round_train(input_path, BPE, cached):
+                FILE.write(output)
+
+            FILE.close()
+
+            cached = True
+            input_path = out_path
+
+            
NEW_VOC_SIZE = BPE.vocabulary_size + + if LAST_VOC_SIZE == NEW_VOC_SIZE: + exit = True + continue + + if current_iteration == self.__max_iterations: + exit = True + continue + + if BPE.vocabulary_size == self.__max_vocabulary: + exit = True + continue + + return BPE + + def __round_train( + self, + path: Path, + bpe: NanoSocratesBPE, + cached: bool + ): + + CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex) + SPLITTER = NanoSocratesSplitter(self.__special_token_regex) + + BPE = bpe + memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold) + + CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path)) + + for chunk, last_chunk in CHUNKER_GENERATOR: + + PIECE_GENERATOR = iterator_with_checks( + SPLITTER.split_text(chunk) + ) + + for piece, last_piece in PIECE_GENERATOR: + + LAST_BATCH = last_chunk and last_piece + PIECE, TOKEN_TYPE = piece + + if TOKEN_TYPE != TokenType.BPE: + _, _, out = BPE.fit([], memory, LAST_BATCH) + yield (BPE, memory, PIECE) + continue + + PIECE_DATA = self.__make_list_ids(PIECE, cached) + + _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH) + + OUT_STRING = f"{out}" + yield (BPE, memory, OUT_STRING) + + def __increment_counter(self, counter: int): + + # What if overflows??? + try: + counter += 1 + except: + print("Integer overflow") + counter = 1 + + return counter + + def __make_list_ids(self, corpus: str, cached: bool): + + if not cached: + return list(map(ord, corpus)) + + REDUCED_CORPUS_LEN = len(corpus) -1 + + # Skip these cars "[" "]" + INTS = corpus[1:REDUCED_CORPUS_LEN] + INT_LIST = list(map(int,INTS.split(","))) + return INT_LIST + + def __switch_paths(self, path: Path, cache_path: Path): + + yield path + + TMP_1 = cache_path / "tmp1.txt" + TMP_2 = cache_path / "tmp2.txt" + + switch = True + + while True: + if switch: + yield TMP_1 + else: + yield TMP_2 + switch = not switch + From ccacea18d8db1709d3a4a48d89dcebbf73b02b87 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:33:54 +0200 Subject: [PATCH 31/75] Created files to test BPE training --- Project_Model/Tests/bpe_trainer.py | 42 +++++++++++++++++++ .../Tests/trainer_files/train_simple.txt | 1 + 2 files changed, 43 insertions(+) create mode 100644 Project_Model/Tests/bpe_trainer.py create mode 100644 Project_Model/Tests/trainer_files/train_simple.txt diff --git a/Project_Model/Tests/bpe_trainer.py b/Project_Model/Tests/bpe_trainer.py new file mode 100644 index 0000000..2e1fa08 --- /dev/null +++ b/Project_Model/Tests/bpe_trainer.py @@ -0,0 +1,42 @@ +from pathlib import Path +from Project_Model.Libs.BPE.Enums import TokenType +import Project_Model.Libs.BPE as BPE + +import re + +CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache") + +class TestTrainBPE: + + def test_bpe_train_encoding_simple(self): + + TRAINER = BPE.NanoSocraTrainer( + int(32E3), + ["", ""], + 40 + ) + + TEXT = "abababab" + TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt") + + EXPECTED = [258] + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + BPE_ENCODER = TRAINER.trainBPE( + TEXT_PATH, + CACHE_DIR_PATH + ) + + ENCODED = BPE_ENCODER.encode(TEXT) + + assert len(ENCODED) == len(EXPECTED) + + for encoded, expected in zip(ENCODED, EXPECTED): + assert encoded == expected + +# Useful to debug weird cases +if __name__ == "__main__": + TestTrainBPE().test_bpe_train_encoding_simple() diff --git a/Project_Model/Tests/trainer_files/train_simple.txt b/Project_Model/Tests/trainer_files/train_simple.txt new file 
mode 100644 index 0000000..19f4c70 --- /dev/null +++ b/Project_Model/Tests/trainer_files/train_simple.txt @@ -0,0 +1 @@ +abababab \ No newline at end of file From 89a0a1f4bb4f4cb6cc4ad08fb992ec2b7bd187d2 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:58:31 +0200 Subject: [PATCH 32/75] Fixed bug for utf-8 conversion --- .../Libs/BPE/Classes/NanoSocraTrainer.py | 23 +++++++++++++++++-- .../Libs/BPE/Classes/NanoSocratesBPE.py | 4 ++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py index 1d6d429..0e043c7 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py @@ -15,6 +15,7 @@ class NanoSocraTrainer: chunk_size: int, merge_treshold: int = 0, max_iterations: int = 0, + print_after_iterations: int = 1 ) -> None: # Bytes BYTE_RESERVED_TOKENS = 256 @@ -26,6 +27,7 @@ class NanoSocraTrainer: self.__chunk_size = chunk_size self.__merge_treshold = merge_treshold self.__special_token_regex = special_regex_maker(special_vocabulary) + self.__print_after_iterations = print_after_iterations def trainBPE( self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None @@ -61,7 +63,9 @@ class NanoSocraTrainer: FILE = open(out_path, "w") - for _, _, output in self.__round_train(input_path, BPE, cached): + last_memory = None + for _, memory, output in self.__round_train(input_path, BPE, cached): + last_memory = memory FILE.write(output) FILE.close() @@ -71,6 +75,21 @@ class NanoSocraTrainer: NEW_VOC_SIZE = BPE.vocabulary_size + if current_iteration % self.__print_after_iterations == 0: + DELIMITER = "===============" + + DEBUG = "\n".join([ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tFrequencies:\n{last_memory.frequencies}\n", + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "" + ]) + print(DEBUG) + if LAST_VOC_SIZE == NEW_VOC_SIZE: exit = True continue @@ -137,7 +156,7 @@ class NanoSocraTrainer: def __make_list_ids(self, corpus: str, cached: bool): if not cached: - return list(map(ord, corpus)) + return list(corpus.encode("utf-8")) REDUCED_CORPUS_LEN = len(corpus) -1 diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index c7f89ce..3238522 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -90,7 +90,7 @@ class NanoSocratesBPE(Encoder): def encode(self, piece: str) -> list[int]: - current_piece = list(map(ord, piece)) + current_piece = list(piece.encode("utf-8")) new_piece = self.__round_encode(current_piece) while len(current_piece) != len(new_piece): @@ -128,7 +128,7 @@ class NanoSocratesBPE(Encoder): return NEW_PIECE - # TODO: decode + # TODO: Remake decode to take a list of token IDs def decode(self, token_id: int) -> str: token_stack: list[int] = [token_id] From 76f24d4eb0da854470001a7f7768e1b5d1b9d662 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:58:43 +0200 Subject: [PATCH 33/75] Renamed file --- Project_Model/Tests/{bpe_trainer.py => bpe_trainer_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Project_Model/Tests/{bpe_trainer.py => bpe_trainer_test.py} (100%) diff --git a/Project_Model/Tests/bpe_trainer.py b/Project_Model/Tests/bpe_trainer_test.py 
similarity index 100% rename from Project_Model/Tests/bpe_trainer.py rename to Project_Model/Tests/bpe_trainer_test.py From 30c2938d29def625bb5c1afd064644698424fd27 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:58:54 +0200 Subject: [PATCH 34/75] Fixed typing --- Scripts/Libs/CleaningPipeline/special_token.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Scripts/Libs/CleaningPipeline/special_token.py b/Scripts/Libs/CleaningPipeline/special_token.py index 644ad71..3f25a2d 100644 --- a/Scripts/Libs/CleaningPipeline/special_token.py +++ b/Scripts/Libs/CleaningPipeline/special_token.py @@ -1,6 +1,7 @@ from enum import Enum -class SpecialToken(str, Enum): + +class SpecialToken(Enum): # (Enum, str) -> throws an error START_TRIPLE_LIST = "" START_TRIPLE = "" @@ -17,5 +18,4 @@ class SpecialToken(str, Enum): CONTINUE_RDF = "" MASK = "" - #BPE Training: - \ No newline at end of file + # BPE Training: From 7ab9b0358e6ba9686d5c46d79e6e8767b2e26098 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:59:09 +0200 Subject: [PATCH 35/75] Added script to run BPE --- Scripts/Training/bpe_trainer.py | 100 ++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 Scripts/Training/bpe_trainer.py diff --git a/Scripts/Training/bpe_trainer.py b/Scripts/Training/bpe_trainer.py new file mode 100644 index 0000000..759f397 --- /dev/null +++ b/Scripts/Training/bpe_trainer.py @@ -0,0 +1,100 @@ +import argparse +import json +from pathlib import Path +import sys +# TODO: make relative imports +import Project_Model.Libs.BPE as BPE +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +DEFAULT_CHUNK_SIZE = int(18e4) +DEFAULT_DEBUG_AFTER_ITER = 1 +DEFAULT_MAX_VOCABULARY = int(32E3) +DEFAULT_MERGE_TRESHOLD = 1 +DEFAULT_MAX_ITERATIONS = 0 +TOKEN_LIST = [token.value for token in SpecialToken] + + +class ProgramArgs: + + def __init__( + self, + input_file: str, + cache_dir: str, + output_file: str, + max_vocabulary: int, + max_iterations: int, + merge_treshold: int, + chunk_size: int, + debug_after: int, + ) -> None: + self.input_file = input_file + self.cache_dir = cache_dir + self.output_file = output_file + self.max_vocabulary = max_vocabulary + self.max_iterations = max_iterations + self.merge_treshold = merge_treshold + self.chunk_size = chunk_size + self.debug_after = debug_after + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) + PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str) + PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) + PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) + PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) + PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int) + PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int) + + parsed_args, _ = PARSER.parse_known_args(args) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.cache_dir, + parsed_args.output_file, + parsed_args.max_vocabulary, + parsed_args.max_iterations, + parsed_args.merge_treshold, + parsed_args.chunk_size, + 
parsed_args.debug_after, + ) # type: ignore + + def train(args: ProgramArgs): + + TRAINER = BPE.NanoSocraTrainer( + args.max_vocabulary, + TOKEN_LIST, + args.chunk_size, + args.merge_treshold, + args.max_iterations, + args.debug_after + ) + + DATASET_PATH = Path(args.input_file) + CACHE_DIR = Path(args.cache_dir) + VOCABULARY_PATH = Path(args.output_file) + + print(f"Training BPE") + + BPE_ENCODER = TRAINER.trainBPE( + DATASET_PATH, + CACHE_DIR + ) + + VOCABULARY = BPE_ENCODER.vocabulary + VOCABULARY_JSON = json.dumps(VOCABULARY) + + print(f"Saving Vocabulary in {VOCABULARY_PATH}") + + FILE = open(VOCABULARY_PATH, "w") + FILE.write(VOCABULARY_JSON) + FILE.close() + + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + train(ARGS) From 9a8e726d745ca5e7af26ad43fef5ade81f31dc91 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:22:22 +0200 Subject: [PATCH 36/75] Added debug configuration --- .vscode/launch.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e0a93b9 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": "${command:pickArgs}" + } + ] +} \ No newline at end of file From 97bac464f3819ba8bfac7a91d6bec2370a4cb1d7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:32:43 +0200 Subject: [PATCH 37/75] Fixed JSON incompatibility --- Scripts/Training/bpe_trainer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Scripts/Training/bpe_trainer.py b/Scripts/Training/bpe_trainer.py index 759f397..904bfbf 100644 --- a/Scripts/Training/bpe_trainer.py +++ b/Scripts/Training/bpe_trainer.py @@ -86,7 +86,14 @@ def train(args: ProgramArgs): ) VOCABULARY = BPE_ENCODER.vocabulary - VOCABULARY_JSON = json.dumps(VOCABULARY) + + JSON_VOCABULARY: dict[str, int]= {} + + for key, item in VOCABULARY.items(): + TUPLE_STR = f"{key}" + JSON_VOCABULARY[TUPLE_STR] = item + + VOCABULARY_JSON = json.dumps(JSON_VOCABULARY) print(f"Saving Vocabulary in {VOCABULARY_PATH}") From dbf1d99408186b8b1705a6a124f9ecbf6334ebe7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:20:59 +0200 Subject: [PATCH 38/75] Added json utils to save and load json files --- Project_Model/Libs/BPE/Utils/json_utils.py | 18 ++++++++ Project_Model/Libs/BPE/Utils/vocabulary.py | 49 ++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 Project_Model/Libs/BPE/Utils/json_utils.py create mode 100644 Project_Model/Libs/BPE/Utils/vocabulary.py diff --git a/Project_Model/Libs/BPE/Utils/json_utils.py b/Project_Model/Libs/BPE/Utils/json_utils.py new file mode 100644 index 0000000..716e93a --- /dev/null +++ b/Project_Model/Libs/BPE/Utils/json_utils.py @@ -0,0 +1,18 @@ +import json +from pathlib import Path + + +def save_json(vocabulary: dict, path: Path): + + json_string = json.dumps(vocabulary) + FILE = open(path, "w") + FILE.write(json_string) + FILE.close() + +
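# ---------------------------------------------------------------------
# (Illustration only, not part of this patch.) json.dumps cannot
# serialize tuple keys, so a BPE vocabulary keyed by (int, int) pairs
# cannot be dumped as-is; the vocabulary helpers that follow stringify
# each tuple first. A minimal sketch of that round-trip, assuming a
# vocabulary like {(97, 98): 256}:
#
#     import json
#     voc = {(97, 98): 256}
#     dumped = json.dumps({f"{key}": item for key, item in voc.items()})
#     # dumped == '{"(97, 98)": 256}'
#     restored = {
#         tuple(map(int, key[1:-1].split(","))): item
#         for key, item in json.loads(dumped).items()
#     }
#     assert restored == voc
# ---------------------------------------------------------------------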
+def load_json(path: Path) -> dict: + FILE = open(path, "r") + json_string = FILE.read() + FILE.close() + + return json.loads(json_string) diff --git a/Project_Model/Libs/BPE/Utils/vocabulary.py b/Project_Model/Libs/BPE/Utils/vocabulary.py new file mode 100644 index 0000000..fa245d5 --- /dev/null +++ b/Project_Model/Libs/BPE/Utils/vocabulary.py @@ -0,0 +1,49 @@ +import json +from pathlib import Path +from ..Errors import OutOfDictionaryException + + +def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str: + + JSON: dict[str, int] = {} + + for key, item in vocabulary.items(): + TUPLE_STR = f"{key}" + JSON[TUPLE_STR] = item + + return json.dumps(JSON) + + +def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]: + + JSON: dict[str, int] = json.loads(json_string) + VOCABULARY: dict[tuple[int, int], int] = {} + + for key, item in JSON.items(): + REDUCED_KEY = len(key) - 1 + KEY_STR = key[1:REDUCED_KEY] + VOC_KEY = tuple(map(int, KEY_STR.split(","))) + + if len(VOC_KEY) != 2: + raise OutOfDictionaryException() + + # Checked for weird things above + VOCABULARY[VOC_KEY] = item # type: ignore + + return VOCABULARY + + +def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path): + + json_string = nanos_vocabulary2json_str(vocabulary) + FILE = open(path, "w") + FILE.write(json_string) + FILE.close() + + +def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]: + FILE = open(path, "r") + json_string = FILE.read() + FILE.close() + + return nanos_json_str2vocabulary(json_string) From 66bcf6e55fe0e94f53fb63e187fa35f225beb299 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:21:42 +0200 Subject: [PATCH 39/75] Added a way to recover iteration work --- .../Libs/BPE/Classes/NanoSocraTrainer.py | 135 +++++++++++++----- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py index 0e043c7..9dfe776 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py @@ -1,9 +1,22 @@ from collections import deque +import datetime from pathlib import Path import re -from ..Classes import NanoSocratesBPE, NanoSocratesChunker, NanoSocratesSplitter, NanoSocratesBatchMemoryBPE +from ..Classes import ( + NanoSocratesBPE, + NanoSocratesChunker, + NanoSocratesSplitter, + NanoSocratesBatchMemoryBPE, +) from ..Enums import TokenType -from ..Utils import special_regex_maker, iterator_with_checks +from ..Utils import ( + special_regex_maker, + iterator_with_checks, + save_nanos_vocabulary, + load_nanos_vocabulary, + save_json, + load_json, +) class NanoSocraTrainer: @@ -15,7 +28,7 @@ class NanoSocraTrainer: chunk_size: int, merge_treshold: int = 0, max_iterations: int = 0, - print_after_iterations: int = 1 + print_after_iterations: int = 1, ) -> None: # Bytes BYTE_RESERVED_TOKENS = 256 @@ -30,7 +43,11 @@ class NanoSocraTrainer: self.__print_after_iterations = print_after_iterations def trainBPE( - self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None + self, + path: Path, + cache_dir: Path, + bpe: NanoSocratesBPE | None = None, + resume_from_iter: int = 0, ) -> NanoSocratesBPE: if not path.is_file(): @@ -49,45 +66,76 @@ class NanoSocraTrainer: exit = False cached = False current_iteration = 0 + input_path = path - PATH_GEN = self.__switch_paths(path, cache_dir) + NEXT_ITERATION =
resume_from_iter + 1 if resume_from_iter != 0 else 0 - input_path = next(PATH_GEN) + PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION) + MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter) + + if resume_from_iter != 0: + cached = True + current_iteration = resume_from_iter + input_path = next(PATH_GEN) + # UGLY: fixes a bug immediately, unfortunately + _, _ = next(MEMORY_PATH_GEN) + _, voc_cache_path = next(MEMORY_PATH_GEN) + vocabulary = load_nanos_vocabulary(voc_cache_path) + BPE = NanoSocratesBPE(vocabulary) while not exit: - out_path = next(PATH_GEN) + internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN) + current_iteration = self.__increment_counter(current_iteration) LAST_VOC_SIZE = BPE.vocabulary_size FILE = open(out_path, "w") last_memory = None + for _, memory, output in self.__round_train(input_path, BPE, cached): last_memory = memory FILE.write(output) FILE.close() + internal_cache = { + "finished_iter": current_iteration, + "read_from": f"{input_path}", + "wrote_to": f"{out_path}", + "at": datetime.datetime.now(datetime.timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S.%f" + )[:-3], + } + + VOCABULARY = BPE.vocabulary + + save_json(internal_cache, internal_cache_path) + save_nanos_vocabulary(VOCABULARY, vocabulary_cache) + cached = True input_path = out_path NEW_VOC_SIZE = BPE.vocabulary_size if current_iteration % self.__print_after_iterations == 0: + DELIMITER = "===============" - DEBUG = "\n".join([ - DELIMITER, - f"ITERATION: {current_iteration}", - DELIMITER, - f"\tVocabulary size: {BPE.vocabulary_size}\n", - f"\tFrequencies:\n{last_memory.frequencies}\n", - f"\tvocabulary:\n{BPE.vocabulary}", - DELIMITER, - "" - ]) + DEBUG = "\n".join( + [ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None) + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "", + ] + ) print(DEBUG) if LAST_VOC_SIZE == NEW_VOC_SIZE: @@ -104,12 +152,7 @@ class NanoSocraTrainer: return BPE - def __round_train( - self, - path: Path, - bpe: NanoSocratesBPE, - cached: bool - ): + def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool): CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex) SPLITTER = NanoSocratesSplitter(self.__special_token_regex) @@ -121,9 +164,7 @@ class NanoSocraTrainer: for chunk, last_chunk in CHUNKER_GENERATOR: - PIECE_GENERATOR = iterator_with_checks( - SPLITTER.split_text(chunk) - ) + PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk)) for piece, last_piece in PIECE_GENERATOR: @@ -158,26 +199,50 @@ class NanoSocraTrainer: if not cached: return list(corpus.encode("utf-8")) - REDUCED_CORPUS_LEN = len(corpus) -1 + REDUCED_CORPUS_LEN = len(corpus) - 1 # Skip these chars: "[" "]" INTS = corpus[1:REDUCED_CORPUS_LEN] - INT_LIST = list(map(int,INTS.split(","))) + INT_LIST = list(map(int, INTS.split(","))) return INT_LIST - def __switch_paths(self, path: Path, cache_path: Path): + def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int): - yield path - - TMP_1 = cache_path / "tmp1.txt" - TMP_2 = cache_path / "tmp2.txt" + CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt" + CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt" switch = True + if initial_iteration % 2 == 1: + switch = False + + del initial_iteration + while True: if switch: - yield TMP_1 + yield CORPUS_TMP_1 else: - yield TMP_2 + yield CORPUS_TMP_2 switch = not switch + def 
__switch_memory(self, cache_path: Path, initial_iteration: int): + + INTERNAL_TMP_1 = cache_path / "internal-tmp1.json" + INTERNAL_TMP_2 = cache_path / "internal-tmp2.json" + + VOCAB_TMP_1 = cache_path / "voc-tmp1.json" + VOCAB_TMP_2 = cache_path / "voc-tmp2.json" + + switch = False + + if initial_iteration % 2 == 1: + switch = True + + del initial_iteration + + while True: + if switch: + yield (INTERNAL_TMP_1, VOCAB_TMP_1) + else: + yield (INTERNAL_TMP_2, VOCAB_TMP_2) + switch = not switch From b3d444979fdf8d514ca9ca8fcf9892117240dd8a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:22:09 +0200 Subject: [PATCH 40/75] Added flag to resume work correctly --- Scripts/Training/bpe_trainer.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/Scripts/Training/bpe_trainer.py b/Scripts/Training/bpe_trainer.py index 904bfbf..bc8916e 100644 --- a/Scripts/Training/bpe_trainer.py +++ b/Scripts/Training/bpe_trainer.py @@ -21,6 +21,7 @@ class ProgramArgs: input_file: str, cache_dir: str, output_file: str, + resume_at: int, max_vocabulary: int, max_iterations: int, merge_treshold: int, @@ -30,6 +31,7 @@ class ProgramArgs: self.input_file = input_file self.cache_dir = cache_dir self.output_file = output_file + self.resume_at = resume_at self.max_vocabulary = max_vocabulary self.max_iterations = max_iterations self.merge_treshold = merge_treshold @@ -43,6 +45,7 @@ def get_args(args: list[str]) -> ProgramArgs: PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str) PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int) PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) @@ -55,6 +58,7 @@ def get_args(args: list[str]) -> ProgramArgs: parsed_args.input_file, parsed_args.cache_dir, parsed_args.output_file, + parsed_args.resume_at, parsed_args.max_vocabulary, parsed_args.max_iterations, parsed_args.merge_treshold, @@ -82,25 +86,15 @@ def train(args: ProgramArgs): BPE_ENCODER = TRAINER.trainBPE( DATASET_PATH, - CACHE_DIR + CACHE_DIR, + resume_from_iter=args.resume_at ) VOCABULARY = BPE_ENCODER.vocabulary - JSON_VOCABULARY: dict[str, int]= {} - - for key, item in VOCABULARY.items(): - TUPLE_STR = f"{key}" - JSON_VOCABULARY[TUPLE_STR] = item - - VOCABULARY_JSON = json.dumps(JSON_VOCABULARY) - print(f"Saving Vocabulary in {VOCABULARY_PATH}") - FILE = open(VOCABULARY_PATH, "w") - FILE.write(VOCABULARY_JSON) - FILE.close() - + BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH) if __name__ == "__main__": ARGS = get_args(sys.argv) From fbbe6226bb09fb1b6b408f16763fb59c8b620b1d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 18:56:53 +0200 Subject: [PATCH 41/75] Finished uploading stubs for TokeNano --- .../Libs/BPE/Classes/NanoSocraTraineRam.py | 153 ++++++++++++++++++ Project_Model/Libs/BPE/Classes/TokeNano.py | 0 .../Libs/BPE/Classes/TokeNanoCore.py | 0 Project_Model/Libs/BPE/Classes/__init__.py | 4 +- Project_Model/Libs/BPE/Utils/__init__.py | 7 +- Scripts/Training/bpe_trainer_ram.py | 84 ++++++++++ 6 files changed, 246 
insertions(+), 2 deletions(-) create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py create mode 100644 Project_Model/Libs/BPE/Classes/TokeNano.py create mode 100644 Project_Model/Libs/BPE/Classes/TokeNanoCore.py create mode 100644 Scripts/Training/bpe_trainer_ram.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py new file mode 100644 index 0000000..9c4f444 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py @@ -0,0 +1,153 @@ +from collections import deque +import datetime +from pathlib import Path +import re +from ..Classes import ( + NanoSocratesBPE, + NanoSocratesChunker, + NanoSocratesSplitter, + NanoSocratesBatchMemoryBPE, +) +from ..Enums import TokenType +from ..Utils import ( + special_regex_maker, + iterator_with_checks, + save_nanos_vocabulary, + load_nanos_vocabulary, + save_json, + load_json, +) + + +class NanoSocraTraineRam: + + def __init__( + self, + max_vocabulary: int, + special_vocabulary: list[str], + merge_treshold: int = 0, + max_iterations: int = 0, + print_after_iterations: int = 1, + ) -> None: + # Bytes + BYTE_RESERVED_TOKENS = 256 + SPECIAL_RESERVED_TOKENS = len(special_vocabulary) + RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS + + self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS + self.__max_iterations = max_iterations + self.__merge_treshold = merge_treshold + self.__special_token_regex = special_regex_maker(special_vocabulary) + self.__print_after_iterations = print_after_iterations + + def trainBPE( + self, + path: Path, + bpe: NanoSocratesBPE | None = None, + ) -> NanoSocratesBPE: + + if not path.is_file(): + raise FileNotFoundError() + + if bpe is None: + bpe = NanoSocratesBPE() + BPE = bpe + + if BPE.vocabulary_size > self.__max_vocabulary: + return BPE + + exit = False + current_iteration = 0 + data = self.__gather_data_from_file(path) + + while not exit: + + current_iteration = self.__increment_counter(current_iteration) + + LAST_VOC_SIZE = BPE.vocabulary_size + + last_memory = None + + _, data, last_memory = self.__round_train(BPE, data) + + NEW_VOC_SIZE = BPE.vocabulary_size + + if current_iteration % self.__print_after_iterations == 0: + + DELIMITER = "===============" + + DEBUG = "\n".join( + [ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None) + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "", + ] + ) + print(DEBUG) + + if LAST_VOC_SIZE == NEW_VOC_SIZE: + exit = True + continue + + if current_iteration == self.__max_iterations: + exit = True + continue + + if BPE.vocabulary_size == self.__max_vocabulary: + exit = True + continue + + return BPE + + def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + DATA_LEN = len(data) + + memory = NanoSocratesBatchMemoryBPE({}, 0) + for piece, index in zip(data, range(0, DATA_LEN)): + + last_batch = index == DATA_LEN - 1 + + bpe, memory, output = bpe.fit(piece, memory, last_batch) + + data[index] = output + + return (bpe, data, memory) + + def __gather_data_from_file(self, path: Path) -> list[list[int]]: + + SPLITTER = NanoSocratesSplitter(self.__special_token_regex) + + DATA: list[list[int]] = [] + + FILE = open(path, "r", encoding="utf-8") + file_string = FILE.read() + FILE.close() + + for piece, type in SPLITTER.split_text(file_string): + + if type != TokenType.BPE: + continue + + int_list = 
self.__make_list_ids(piece) + DATA.append(int_list) + + return DATA + + def __increment_counter(self, counter: int): + + # What if overflows??? + try: + counter += 1 + except: + print("Integer overflow") + counter = 1 + + return counter + + def __make_list_ids(self, corpus: str): + return list(corpus.encode("utf-8")) diff --git a/Project_Model/Libs/BPE/Classes/TokeNano.py b/Project_Model/Libs/BPE/Classes/TokeNano.py new file mode 100644 index 0000000..e69de29 diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py new file mode 100644 index 0000000..e69de29 diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index a52b024..32e958a 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -2,10 +2,12 @@ from .NanoSocratesChunker import NanoSocratesChunker from .NanoSocratesSplitter import NanoSocratesSplitter from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE from .NanoSocraTrainer import NanoSocraTrainer +from .NanoSocraTraineRam import NanoSocraTraineRam __all__ = [ "NanoSocratesChunker", "NanoSocratesSplitter", "NanoSocratesBPE", - "NanoSocraTrainer" + "NanoSocraTrainer", + "NanoSocraTraineRam" ] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Utils/__init__.py b/Project_Model/Libs/BPE/Utils/__init__.py index f2320fa..3eb9eb3 100644 --- a/Project_Model/Libs/BPE/Utils/__init__.py +++ b/Project_Model/Libs/BPE/Utils/__init__.py @@ -1,7 +1,12 @@ from .special_regex_maker import special_regex_maker from .lag_checker_iterator import iterator_with_checks +from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary +from .json_utils import save_json, load_json __all__ = [ "special_regex_maker", - "iterator_with_checks" + "iterator_with_checks", + "save_nanos_vocabulary", + "load_nanos_vocabulary", + "save_json", "load_json" ] \ No newline at end of file diff --git a/Scripts/Training/bpe_trainer_ram.py b/Scripts/Training/bpe_trainer_ram.py new file mode 100644 index 0000000..14ce0bb --- /dev/null +++ b/Scripts/Training/bpe_trainer_ram.py @@ -0,0 +1,84 @@ +import argparse +import json +from pathlib import Path +import sys +# TODO: make relative imports +import Project_Model.Libs.BPE as BPE +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +DEFAULT_DEBUG_AFTER_ITER = 1 +DEFAULT_MAX_VOCABULARY = int(32E3) +DEFAULT_MERGE_TRESHOLD = 1 +DEFAULT_MAX_ITERATIONS = 0 +TOKEN_LIST = [token.value for token in SpecialToken] + + +class ProgramArgs: + + def __init__( + self, + input_file: str, + output_file: str, + max_vocabulary: int, + max_iterations: int, + merge_treshold: int, + debug_after: int, + ) -> None: + self.input_file = input_file + self.output_file = output_file + self.max_vocabulary = max_vocabulary + self.max_iterations = max_iterations + self.merge_treshold = merge_treshold + self.debug_after = debug_after + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) + PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) + PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) + PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) + PARSER.add_argument("--debug-after", 
default=DEFAULT_DEBUG_AFTER_ITER, type=int) + + parsed_args, _ = PARSER.parse_known_args(args) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.output_file, + parsed_args.max_vocabulary, + parsed_args.max_iterations, + parsed_args.merge_treshold, + parsed_args.debug_after, + ) # type: ignore + + +def train(args: ProgramArgs): + + TRAINER = BPE.NanoSocraTraineRam( + args.max_vocabulary, + TOKEN_LIST, + args.merge_treshold, + args.max_iterations, + args.debug_after + ) + + DATASET_PATH = Path(args.input_file) + VOCABULARY_PATH = Path(args.output_file) + + print(f"Training BPE") + + BPE_ENCODER = TRAINER.trainBPE( + DATASET_PATH + ) + + VOCABULARY = BPE_ENCODER.vocabulary + + print(f"Saving Vocabulary in {VOCABULARY_PATH}") + + BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH) + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + train(ARGS) From 7cfaf601b411ea4e6ad5c929793f5aad7b8b127a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 19:42:22 +0200 Subject: [PATCH 42/75] Refactored to remove tokens that can't be compressed anymore --- .../Libs/BPE/Classes/NanoSocraTraineRam.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py index 9c4f444..aca820e 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py @@ -105,18 +105,29 @@ class NanoSocraTraineRam: return BPE def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + DATA_LEN = len(data) + NEW_DATA = [] + counter = 0 memory = NanoSocratesBatchMemoryBPE({}, 0) - for piece, index in zip(data, range(0, DATA_LEN)): + while len(data) > 0: + counter += 1 + last_batch = len(data) == 1 - last_batch = index == DATA_LEN - 1 + piece = data.pop() bpe, memory, output = bpe.fit(piece, memory, last_batch) - data[index] = output + if counter % int(1E6) == 0: + print(f"Fitted: {counter}/{DATA_LEN}") - return (bpe, data, memory) + if len(output) < 2: + continue + + NEW_DATA.append(output) + + return (bpe, NEW_DATA, memory) def __gather_data_from_file(self, path: Path) -> list[list[int]]: From b80b4e4112226e32a5c8eeb2d5d3b91f5ded35a5 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:29:57 +0200 Subject: [PATCH 43/75] Fixed return type hints --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 3238522..4245936 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -105,7 +105,7 @@ class NanoSocratesBPE(Encoder): return piece PIECE_LENGTH = len(piece) - 1 - NEW_PIECE = [] + NEW_PIECE : list[int]= [] index = 0 while index < PIECE_LENGTH: From 63baf29805cd62... wait
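The patch below fans a single fitting round out over a multiprocessing.Pool and folds the per-worker results back together. A minimal sketch of that fan-out/fan-in pattern, with a toy pair-counting job standing in for NanoSocratesBPE.fit (all names here are illustrative, not the patch's API):

from multiprocessing import Pool

def count_pairs(chunk: list[list[int]]) -> dict[tuple[int, int], int]:
    # Toy stand-in for one worker's fit pass: count adjacent ID pairs.
    freqs: dict[tuple[int, int], int] = {}
    for piece in chunk:
        for pair in zip(piece, piece[1:]):
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

def parallel_round(data: list[list[int]], workers: int) -> dict[tuple[int, int], int]:
    # Fan out: one contiguous slice of pieces per worker, like the patch's split().
    k, m = divmod(len(data), workers)
    chunks = [data[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(workers)]
    merged: dict[tuple[int, int], int] = {}
    with Pool(workers) as pool:
        # Fan in: sum the per-worker counts into one frequency table.
        for partial in pool.map(count_pairs, chunks):
            for key, value in partial.items():
                merged[key] = merged.get(key, 0) + value
    return merged

if __name__ == "__main__":
    print(parallel_round([[97, 98, 97], [97, 98]], 2))  # {(97, 98): 2, (98, 97): 1}

Because each chunk holds whole pieces, no pair is lost at a worker boundary. Note that summing the partial frequencies on the way back in matters: overwriting them instead is exactly the bug that patch 46 later fixes.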
diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py new file mode 100644 index 0000000..167b433 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -0,0 +1,219 @@ +from collections import deque +import datetime +import itertools +from multiprocessing import Pool +import os +from pathlib import Path +import re +from ..Classes import ( + NanoSocratesBPE, + NanoSocratesChunker, + NanoSocratesSplitter, + NanoSocratesBatchMemoryBPE, +) +from ..Enums import TokenType +from ..Utils import ( + special_regex_maker, + iterator_with_checks, + save_nanos_vocabulary, + load_nanos_vocabulary, + save_json, + load_json, +) + +def split(a, n): + k, m = divmod(len(a), n) + return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + +def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): + + bpe, data = object + + NEW_DATA: list[list[int]]= [] + + memory = NanoSocratesBatchMemoryBPE({}, 0) + + while len(data) > 0: + + piece = data.pop() + + bpe, memory, output = bpe.fit(piece, memory, False) + + if len(output) < 2: + continue + + # We are sure of its type + NEW_DATA.append(output) # type: ignore + + return (bpe, NEW_DATA, memory) + + +class NanoSocraTrainerPool: + + def __init__( + self, + max_vocabulary: int, + special_vocabulary: list[str], + merge_treshold: int = 0, + max_iterations: int = 0, + print_after_iterations: int = 1, + ) -> None: + # Bytes + BYTE_RESERVED_TOKENS = 256 + SPECIAL_RESERVED_TOKENS = len(special_vocabulary) + RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS + + self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS + self.__max_iterations = max_iterations + self.__merge_treshold = merge_treshold + self.__special_token_regex = special_regex_maker(special_vocabulary) + self.__print_after_iterations = print_after_iterations + + # TODO: add a resume function + def trainBPE( + self, + path: Path, + cache_file: Path, + bpe: NanoSocratesBPE | None = None, + ) -> NanoSocratesBPE: + + if not path.is_file(): + raise FileNotFoundError() + + if not cache_file.is_file(): + file = cache_file.open("w") + file.close() + + if bpe is None: + bpe = NanoSocratesBPE() + BPE = bpe + + if BPE.vocabulary_size > self.__max_vocabulary: + return BPE + + exit = False + current_iteration = 0 + data = self.__gather_data_from_file(path) + + while not exit: + + current_iteration = self.__increment_counter(current_iteration) + + LAST_VOC_SIZE = BPE.vocabulary_size + + last_memory = None + + _, data, last_memory = self.__round_train(BPE, data) + + NEW_VOC_SIZE = BPE.vocabulary_size + + VOCABULARY = BPE.vocabulary + + save_nanos_vocabulary(VOCABULARY, cache_file) + + if current_iteration % self.__print_after_iterations == 0: + + DELIMITER = "===============" + + DEBUG = "\n".join( + [ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "", + ] + ) + print(DEBUG) + + if LAST_VOC_SIZE == NEW_VOC_SIZE: + exit = True + continue + + if current_iteration == self.__max_iterations: + exit = True + continue + + if BPE.vocabulary_size == self.__max_vocabulary: + exit = True + continue + + return BPE + + def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + + NEW_DATA : list[list[int]] = [] + + MEMORY = NanoSocratesBatchMemoryBPE({}, 0) + + fit_funct = split_fit + CPU_COUNT = os.process_cpu_count() + + if CPU_COUNT is None: + raise Exception() + + VOCABULARY = 
bpe.vocabulary + + data_chunks = split(data, CPU_COUNT) + JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] + + JOB_RESULTS: list[tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]] + + with Pool() as pool: + JOB_RESULTS = pool.map(fit_funct, JOBS) + + for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS): + _, job_output, job_memory = res + NEW_DATA.extend(job_output) + + for key, value in job_memory.frequencies.items(): + MEMORY.frequencies[key] = value + + del job_output + del job_memory + + print(f"Joined {i + 1} out of {CPU_COUNT}") + + + # Get new token + bpe.fit([], MEMORY, True) + + print(f"Sentences from {len(data)} to {len(NEW_DATA)}") + + return (bpe, NEW_DATA, MEMORY) + + def __gather_data_from_file(self, path: Path) -> list[list[int]]: + + SPLITTER = NanoSocratesSplitter(self.__special_token_regex) + + DATA: list[list[int]] = [] + + FILE = open(path, "r", encoding="utf-8") + file_string = FILE.read() + FILE.close() + + for piece, type in SPLITTER.split_text(file_string): + + if type != TokenType.BPE: + continue + + int_list = self.__make_list_ids(piece) + DATA.append(int_list) + + return DATA + + def __increment_counter(self, counter: int): + + # What if overflows??? + try: + counter += 1 + except: + print("Integer overflow") + counter = 1 + + return counter + + def __make_list_ids(self, corpus: str): + return list(corpus.encode("utf-8")) diff --git a/Scripts/Training/bpe_trainer_pool.py b/Scripts/Training/bpe_trainer_pool.py new file mode 100644 index 0000000..5c7ab6e --- /dev/null +++ b/Scripts/Training/bpe_trainer_pool.py @@ -0,0 +1,90 @@ +import argparse +import json +from pathlib import Path +import sys +# TODO: make relative imports +import Project_Model.Libs.BPE as BPE +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +DEFAULT_DEBUG_AFTER_ITER = 1 +DEFAULT_MAX_VOCABULARY = int(32E3) +DEFAULT_MERGE_TRESHOLD = 1 +DEFAULT_MAX_ITERATIONS = 0 +TOKEN_LIST = [token.value for token in SpecialToken] + + +class ProgramArgs: + + def __init__( + self, + input_file: str, + output_file: str, + cache_file: str, + max_vocabulary: int, + max_iterations: int, + merge_treshold: int, + debug_after: int, + ) -> None: + self.input_file = input_file + self.output_file = output_file + self.cache_file = cache_file + self.max_vocabulary = max_vocabulary + self.max_iterations = max_iterations + self.merge_treshold = merge_treshold + self.debug_after = debug_after + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) + PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str) + PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) + PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) + PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) + PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int) + + parsed_args, _ = PARSER.parse_known_args(args) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.output_file, + parsed_args.cache_file, + parsed_args.max_vocabulary, + parsed_args.max_iterations, + parsed_args.merge_treshold, + parsed_args.debug_after, + ) # type: ignore + + +def train(args: ProgramArgs): + + TRAINER = BPE.NanoSocraTrainerPool( + args.max_vocabulary, 
TOKEN_LIST, + args.merge_treshold, + args.max_iterations, + args.debug_after + ) + + DATASET_PATH = Path(args.input_file) + VOCABULARY_PATH = Path(args.output_file) + CACHE_PATH = Path(args.cache_file) + + print(f"Training BPE") + + BPE_ENCODER = TRAINER.trainBPE( + DATASET_PATH, + CACHE_PATH + ) + + VOCABULARY = BPE_ENCODER.vocabulary + + print(f"Saving Vocabulary in {VOCABULARY_PATH}") + + BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH) + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + train(ARGS) From d19426fa625023ee56e9f077b409f0cbad3e0ef8 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:31:05 +0200 Subject: [PATCH 45/75] added multithreaded training to package --- Project_Model/Libs/BPE/Classes/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index 32e958a..d3b93b6 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -3,11 +3,13 @@ from .NanoSocratesSplitter import NanoSocratesSplitter from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE from .NanoSocraTrainer import NanoSocraTrainer from .NanoSocraTraineRam import NanoSocraTraineRam +from .NanoSocraTrainerPool import NanoSocraTrainerPool __all__ = [ "NanoSocratesChunker", "NanoSocratesSplitter", "NanoSocratesBPE", "NanoSocraTrainer", - "NanoSocraTraineRam" + "NanoSocraTraineRam", + "NanoSocraTrainerPool" ] \ No newline at end of file From 3fe4e45ceb32c842d782e2347cbc0b03f09362ef Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:50:37 +0200 Subject: [PATCH 46/75] Fixed a bug while joining frequencies --- .../Libs/BPE/Classes/NanoSocraTrainerPool.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 167b433..74a596f 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -21,15 +21,17 @@ from ..Utils import ( load_json, ) + def split(a, n): k, m = divmod(len(a), n) - return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): bpe, data = object - NEW_DATA: list[list[int]]= [] + NEW_DATA: list[list[int]] = [] memory = NanoSocratesBatchMemoryBPE({}, 0) @@ -144,7 +146,7 @@ class NanoSocraTrainerPool: def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): - NEW_DATA : list[list[int]] = [] + NEW_DATA: list[list[int]] = [] MEMORY = NanoSocratesBatchMemoryBPE({}, 0) @@ -159,7 +161,9 @@ class NanoSocraTrainerPool: data_chunks = split(data, CPU_COUNT) JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] - JOB_RESULTS: list[tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]] + JOB_RESULTS: list[ + tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE] + ] with Pool() as pool: JOB_RESULTS = pool.map(fit_funct, JOBS) @@ -169,14 +173,20 @@ class NanoSocraTrainerPool: NEW_DATA.extend(job_output) for key, value in job_memory.frequencies.items(): - MEMORY.frequencies[key] = value + frequency = MEMORY.frequencies.get(key) + + if frequency is None: + frequency = 0 + 
MEMORY.frequencies[key] = 0 + + frequency += value + MEMORY.frequencies[key] = frequency del job_output del job_memory print(f"Joined {i + 1} out of {CPU_COUNT}") - + # Get new token bpe.fit([], MEMORY, True) From 0975c19e69f78cd225cc832ee8815ad3141c3063 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:13 +0200 Subject: [PATCH 47/75] Added new method to encode from a list of tokens --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 4245936..4d44884 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -52,7 +52,7 @@ class NanoSocratesBPE(Encoder): last_batch: bool ): - ENCODED_CHUNK = self.__round_encode(chunk_data) + ENCODED_CHUNK = self.encode_intermediate(chunk_data) DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1 for i in range(0, DATA_LEN_BEFORE_LAST): @@ -99,6 +99,17 @@ class NanoSocratesBPE(Encoder): return current_piece + def encode_intermediate(self, piece: list[int]): + current_piece = piece + new_piece = self.__round_encode(current_piece) + + while len(current_piece) != len(new_piece): + current_piece = new_piece + new_piece = self.__round_encode(current_piece) + + return current_piece + + def __round_encode(self, piece: list[int]): if len(piece) == 1: From 17d82f0a4ece9560600a1f65d46cb70c7c4ed72c Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:28 +0200 Subject: [PATCH 48/75] Added support to resume workload --- Scripts/Training/bpe_trainer_pool.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Scripts/Training/bpe_trainer_pool.py b/Scripts/Training/bpe_trainer_pool.py index 5c7ab6e..966816d 100644 --- a/Scripts/Training/bpe_trainer_pool.py +++ b/Scripts/Training/bpe_trainer_pool.py @@ -72,11 +72,17 @@ def train(args: ProgramArgs): VOCABULARY_PATH = Path(args.output_file) CACHE_PATH = Path(args.cache_file) + start_bpe = BPE.NanoSocratesBPE() + if CACHE_PATH.is_file(): + voc = BPE.load_nanos_vocabulary(CACHE_PATH) + start_bpe = BPE.NanoSocratesBPE(voc) + print(f"Training BPE") BPE_ENCODER = TRAINER.trainBPE( DATASET_PATH, - CACHE_PATH + CACHE_PATH, + start_bpe ) VOCABULARY = BPE_ENCODER.vocabulary From aa765b4555c7a447b06d688b515b93a71330b9a7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:45 +0200 Subject: [PATCH 49/75] Added time checking --- .../Libs/BPE/Classes/NanoSocraTrainerPool.py | 57 ++++++++++++++++++- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 74a596f..cdd7a95 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -5,6 +5,7 @@ from multiprocessing import Pool import os from pathlib import Path import re +import time from ..Classes import ( NanoSocratesBPE, NanoSocratesChunker, NanoSocratesSplitter, NanoSocratesBatchMemoryBPE, ) from ..Enums import TokenType from ..Utils import ( special_regex_maker, iterator_with_checks, save_nanos_vocabulary, load_nanos_vocabulary, save_json, load_json, ) @@ -49,6 +50,22 @@ def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): return (bpe, NEW_DATA, memory) +def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]): + + bpe, data = object + + NEW_DATA: list[list[int]] = [] + + for piece in data: + output = bpe.encode_intermediate(piece)
+ + if len(output) < 2: + continue + + # We are sure of its type + NEW_DATA.append(output) # type: ignore + + return NEW_DATA class NanoSocraTrainerPool: @@ -96,6 +113,8 @@ class NanoSocraTrainerPool: exit = False current_iteration = 0 data = self.__gather_data_from_file(path) + data = self.__encode_from_cache(BPE, data) + while not exit: @@ -105,8 +124,9 @@ class NanoSocraTrainerPool: last_memory = None + start = time.time_ns() _, data, last_memory = self.__round_train(BPE, data) - + end = time.time_ns() NEW_VOC_SIZE = BPE.vocabulary_size VOCABULARY = BPE.vocabulary @@ -122,8 +142,8 @@ class NanoSocraTrainerPool: DELIMITER, f"ITERATION: {current_iteration}", DELIMITER, - f"\tVocabulary size: {BPE.vocabulary_size}\n", - f"\tvocabulary:\n{BPE.vocabulary}", + f"\tVocabulary size: {BPE.vocabulary_size - 256}\n", + f"\tTime elapsed: {(end - start)/1E9}s", DELIMITER, "", ] @@ -214,6 +234,37 @@ class NanoSocraTrainerPool: return DATA + def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]): + + NEW_DATA : list[list[int]]= [] + + CPU_COUNT = os.process_cpu_count() + + if CPU_COUNT is None: + raise Exception() + + VOCABULARY = bpe.vocabulary + + data_chunks = split(data, CPU_COUNT) + JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] + + JOB_RESULTS: list[list[list[int]]] + + with Pool() as pool: + JOB_RESULTS = pool.map(split_encode, JOBS) + + for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS): + job_output = res + NEW_DATA.extend(job_output) + + del job_output + + print(f"Joined {i + 1} out of {CPU_COUNT}") + + print(f"Sentences from {len(data)} to {len(NEW_DATA)}") + + return NEW_DATA + def __increment_counter(self, counter: int): # What if overflows??? From eadba1fb82d996947a830193a4ca05cedf374c76 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:33:47 +0200 Subject: [PATCH 50/75] Corrected test to reflect changes in NanoSocratesBPE --- Project_Model/Tests/bpe_test.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py index 7332f65..e6c8f31 100644 --- a/Project_Model/Tests/bpe_test.py +++ b/Project_Model/Tests/bpe_test.py @@ -29,7 +29,7 @@ class TestBPE: def test_bpe_decoding_simple(self): - INPUT = 258 + INPUT = [258] # ab = 256 # 256, 256 = 257 @@ -47,6 +47,27 @@ class TestBPE: for encoded, expected in zip(DECODED, EXPECTED): assert encoded == expected + def test_bpe_decoding_edge_1(self): + + + INPUT = [258, ord("c")] + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} + EXPECTED = "ababababc" + + BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY) + + DECODED = BPE_ENCODER.decode(INPUT) + + assert len(DECODED) == len(EXPECTED) + + for encoded, expected in zip(DECODED, EXPECTED): + assert encoded == expected + # Useful to debug weird cases if __name__ == "__main__": TestBPE().test_bpe_decoding_simple() From 1eae8582b2157375f8961db142a0617496412c09 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:33:58 +0200 Subject: [PATCH 51/75] Fixed decoding phase --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 4d44884..6428cb7 100644 --- 
a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -1,3 +1,4 @@ +from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException @@ -140,31 +141,30 @@ class NanoSocratesBPE(Encoder): return NEW_PIECE # TODO: Remake decode to take a list of token IDs - def decode(self, token_id: int) -> str: + def decode(self, token_ids: list[int]) -> str: - token_stack: list[int] = [token_id] - DECODED_STRING_ARR: list[str] = [] + token_stack: deque[int] = deque(token_ids) + UTF_8_STRING_ARR: bytearray = bytearray() while len(token_stack) > 0: - TOKEN_ID = token_stack.pop() + TOKEN_ID = token_stack.popleft() if TOKEN_ID < 256: - DECODED_CHAR = chr(TOKEN_ID) - DECODED_STRING_ARR.append( - DECODED_CHAR + UTF_8_STRING_ARR.append( + TOKEN_ID ) continue left_token, right_token = self.__token_decode(TOKEN_ID) - token_stack.append( + token_stack.appendleft( right_token ) - token_stack.append( + token_stack.appendleft( left_token ) - return "".join(DECODED_STRING_ARR) + return UTF_8_STRING_ARR.decode("utf-8") def __token_decode(self, token_id: int) -> tuple[int, int]: From 2194cc7b4fb70b3397d220e09b01f53822003fd6 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:56:05 +0200 Subject: [PATCH 52/75] Changed test to use pool trainer --- Project_Model/Tests/bpe_trainer_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project_Model/Tests/bpe_trainer_test.py b/Project_Model/Tests/bpe_trainer_test.py index 2e1fa08..69ac4bb 100644 --- a/Project_Model/Tests/bpe_trainer_test.py +++ b/Project_Model/Tests/bpe_trainer_test.py @@ -4,13 +4,13 @@ import Project_Model.Libs.BPE as BPE import re -CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache") +CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json") class TestTrainBPE: def test_bpe_train_encoding_simple(self): - TRAINER = BPE.NanoSocraTrainer( + TRAINER = BPE.NanoSocraTrainerPool( int(32E3), ["", ""], 40 From 2e595a3a23eaa0a7381e7dbf7ed9ae1ff1694254 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:56:44 +0200 Subject: [PATCH 53/75] Changed training phase to take the data directly instead of its encoding --- Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index cdd7a95..4dd4f4f 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -46,7 +46,7 @@ def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): continue # We are sure of its type - NEW_DATA.append(output) # type: ignore + NEW_DATA.append(piece) # type: ignore return (bpe, NEW_DATA, memory) @@ -56,14 +56,14 @@ def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]): NEW_DATA: list[list[int]] = [] - for piece in data: + for index, piece in zip(range(0, len(data)), data): output = bpe.encode_intermediate(piece) if len(output) < 2: continue # We are sure of its type - NEW_DATA.append(output) # type: ignore + NEW_DATA.append(data[index]) # type: ignore return NEW_DATA From 856bd8909c27599444c8c31558e171f1a65de9cd Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 11:02:03 
+0200 Subject: [PATCH 54/75] Added threshold --- Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 4dd4f4f..4e88802 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -168,7 +168,7 @@ class NanoSocraTrainerPool: NEW_DATA: list[list[int]] = [] - MEMORY = NanoSocratesBatchMemoryBPE({}, 0) + MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold) fit_funct = split_fit CPU_COUNT = os.process_cpu_count() From 0eef2148a9e15619491993a41af586ad7b06fd25 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Thu, 2 Oct 2025 12:12:44 +0200 Subject: [PATCH 55/75] in NanoSocratesBPE: encode() method rewritten and tested --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 6428cb7..132217e 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -4,12 +4,15 @@ from ..Errors import OutOfDictionaryException, DuplicateWordException class NanoSocratesBatchMemoryBPE: + """ Memory for batched training. Keeps token-pair frequencies and the merge_treshold. + """ def __init__( self, frequencies: dict[tuple[int, int], int], merge_treshold: int ) -> None: + self.frequencies = frequencies self.merge_treshold = merge_treshold @@ -42,7 +45,12 @@ class NanoSocratesBPE(Encoder): return self.__vocabulary @property - def __next_id(self): + def __next_id(self) -> int: + """ + Gets the next free token ID + Returns: + int: + """ return self.vocabulary_size + 1 # TODO: implement fit @@ -90,20 +98,26 @@ class NanoSocratesBPE(Encoder): def encode(self, piece: str) -> list[int]: + """Encode a string into token IDs: it first converts the string into UTF-8 bytes, then passes the list of integers to encode_intermediate() + Args: + piece (str): + Returns: + list[int]: + """ + converted_piece = list(piece.encode("utf-8")) + return self.encode_intermediate(converted_piece) - current_piece = list(piece.encode("utf-8")) - new_piece = self.__round_encode(current_piece) - - while len(current_piece) != len(new_piece): - current_piece = new_piece - new_piece = self.__round_encode(current_piece) - - return current_piece - - def encode_intermediate(self, piece: list[int]): + def encode_intermediate(self, piece: list[int]) -> list[int]: + """ Encode a piece (as a list of integers) until no further merges apply + Args: + piece (list[int]): piece to encode + Returns: + list[int]: the encoded piece + """ current_piece = piece new_piece = self.__round_encode(current_piece) + # keep encoding while the piece is still shrinking while len(current_piece) != len(new_piece): current_piece = new_piece new_piece = self.__round_encode(current_piece) @@ -112,6 +126,14 @@ class NanoSocratesBPE(Encoder): def __round_encode(self, piece: list[int]): + """_summary_ + + Args: + piece (list[int]): _description_ + + Returns: + _type_: _description_ + """ if len(piece) == 1: return piece @@ -143,6 +165,7 @@ class NanoSocratesBPE(Encoder): # TODO: Remake decode to take a list of token IDs def decode(self, token_ids: list[int]) -> str: + # deque: double ended queue token_stack: deque[int] = deque(token_ids) UTF_8_STRING_ARR: bytearray = bytearray() From a1d143187dbfad8ff90c4338f3260ad88a75030c Mon 
Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 20:11:43 +0200 Subject: [PATCH 56/75] corrected test to reflect changes in BPE trainer --- Project_Model/Tests/bpe_trainer_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Project_Model/Tests/bpe_trainer_test.py b/Project_Model/Tests/bpe_trainer_test.py index 69ac4bb..1f9f7fb 100644 --- a/Project_Model/Tests/bpe_trainer_test.py +++ b/Project_Model/Tests/bpe_trainer_test.py @@ -12,8 +12,7 @@ class TestTrainBPE: TRAINER = BPE.NanoSocraTrainerPool( int(32E3), - ["", ""], - 40 + ["", ""] ) TEXT = "abababab" From 7c935d27008106b056a33a9828102f82a82f7236 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 3 Oct 2025 00:57:19 +0200 Subject: [PATCH 57/75] Update NanoSocratesBPE: corrected a minor bug about dictionary length, added some comments to make the code clearer --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 132217e..baa5efd 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -2,7 +2,10 @@ from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException - +# ABOUT THE DICTIONARY: +# the string is converted into UTF-8 bytes, that is: each char is represented by 1 to 4 bytes. +# each byte gets cast to an integer, such that an integer with a value lower than 256 +# represents a raw UTF-8 byte, while any other value is a token ID. class NanoSocratesBatchMemoryBPE: """ Memory for batched training. Keeps token-pair frequencies and the merge_treshold. """ @@ -31,6 +34,7 @@ class NanoSocratesBPE(Encoder): for key, value in vocabulary.items(): if value < 256: raise OutOfDictionaryException() + # values under 256 are reserved for single raw bytes # TODO: check if they are in order self.__vocabulary[key] = value self.__reverse_vocabulary[value] = key @@ -38,7 +42,7 @@ class NanoSocratesBPE(Encoder): @property def vocabulary_size(self): - return len(self.__vocabulary) + 255 + return len(self.__vocabulary) + 256 @property def vocabulary(self): @@ -51,7 +55,7 @@ class NanoSocratesBPE(Encoder): Returns: int: """ - return self.vocabulary_size + 1 + return self.vocabulary_size # TODO: implement fit def fit( @@ -64,6 +68,7 @@ class NanoSocratesBPE(Encoder): ENCODED_CHUNK = self.encode_intermediate(chunk_data) DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1 + # update the frequency of each pair of elements for i in range(0, DATA_LEN_BEFORE_LAST): CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1]) @@ -77,6 +82,7 @@ class NanoSocratesBPE(Encoder): frequency += 1 memory.frequencies[CANDIDATE_COUPLE] = frequency + if not last_batch: return (self, memory, ENCODED_CHUNK) @@ -126,13 +132,14 @@ class NanoSocratesBPE(Encoder): def __round_encode(self, piece: list[int]): - """_summary_ - + """ A single round of encode that traverses the whole piece. 
From a5b8692a77cec0f61b7f9cc7f5fd92f914ba33bf Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 00:59:15 +0200
Subject: [PATCH 58/75] Updated NanoSocratesSpecial to work with TokeNano

---
 .../Libs/BPE/Classes/NanoSocratesSpecial.py   | 45 ++++++++++++-------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
index e551d6c..8fe81bb 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -5,34 +5,43 @@ class NanoSocratesSpecial(Encoder):
 
     def __init__(
         self,
-        initial_vocabulary: list[str] | None = None
-    ) -> None:
+        vocabulary_index: int ,
+        vocabulary: dict[str, int] | None = None
+    ) -> None:
+
         super().__init__()
 
-        self.__vocabulary: dict[str, int] = {}
+        if vocabulary is None:
+            self.__vocabulary: dict[str, int] = {}
+        else:
+            self.__vocabulary: dict[str, int] = vocabulary
+
         self.__reverse_vocabulary: dict[int, str] = {}
-        self.__current_index = 0
 
-        if initial_vocabulary is None:
-            return
+        if vocabulary_index is None:
+            self.__vocabulary_index = 0
+        else:
+            self.__vocabulary_index = vocabulary_index
 
-        for word in initial_vocabulary:
+        # self.__build_reverse_vocabulary()
 
-            CURRENT_INDEX = self.__current_index
-            self.__vocabulary[word] = CURRENT_INDEX
-            self.__reverse_vocabulary[CURRENT_INDEX] = word
-            self.__current_index += 1
 
-    @property
-    def vocabulary_size(self):
-        return self.__current_index
+    def build_reverse_vocabulary(self):
+        self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()}
 
-    def add_special_word(self, word:str):
-        CURRENT_INDEX = self.__current_index
+    # @property
+    # def vocabulary_size(self):
+    #     return self.__current_index
+
+    def set_vocabulary_index(self, vocabulary_index: int):
+        self.__vocabulary_index = vocabulary_index
+
+    def add_special_word_to_vocabulary(self, word:str):
+        self.__vocabulary_index = self.__vocabulary_index + 1
+        CURRENT_INDEX = self.__vocabulary_index
         self.__vocabulary[word] = CURRENT_INDEX
         self.__reverse_vocabulary[CURRENT_INDEX] = word
-        self.__current_index += 1
 
     def encode(self, word: str) -> list[int]:
         ID = self.__vocabulary.get(word)
@@ -52,3 +61,5 @@
 
         return WORD
 
+    def get_reverse_vocabulary(self)-> dict[int, str]:
+        return self.__reverse_vocabulary
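Numerically, this patch arranges for special tokens to receive IDs strictly above the whole BPE range, so the two vocabularies can never collide. A toy restatement of the indexing; the function name and token strings are illustrative placeholders, not the class's API:

```python
def build_special_vocabulary(bpe_vocabulary_size: int, tokens: list[str]) -> dict[str, int]:
    # the first special ID starts right after the last BPE ID
    return {token: bpe_vocabulary_size + i + 1 for i, token in enumerate(tokens)}


specials = build_special_vocabulary(259, ["<ABS>", "<EOT>"])
assert specials == {"<ABS>": 260, "<EOT>": 261}

# the reverse map used for decoding is just the inversion
reverse = {token_id: token for token, token_id in specials.items()}
assert reverse[260] == "<ABS>"
```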
From 8121c75a09c95687c33d01aa6539b733e031737f Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:00:36 +0200
Subject: [PATCH 59/75] Updated NanoSocratesSplitter to also split tokens in
 the decode phase

---
 .../Libs/BPE/Classes/NanoSocratesSplitter.py  | 60 ++++++++++++++++---
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index ccca300..399fa77 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -1,40 +1,82 @@
 import re
+from collections import deque
 from typing import Generator
 from ..Enums import TokenType
 
+
 class NanoSocratesSplitter:
 
     def __init__(
         self,
-        special_token_regex: re.Pattern
+        special_token_regex: re.Pattern,
+        max_bpe_token_id: int = 255
     ) -> None:
+        # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
+        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
 
     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
+        """ Split a text using a regex given
+        Args:
+            corpus (str): all the corpus string to split
+        Yields:
+            Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
+            TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
+        """
 
         bpe_start = 0
-        bpe_end = len(corpus)
+        bpe_end = len(corpus) # this can be deleted!
 
-        for bound_start, bound_end in self.__find_boundaries(corpus):
+        for special_token_start, special_token_end in self.__find_boundaries(corpus):
 
-            bpe_end = bound_start
+            # FIND BPE
+            bpe_end = special_token_start
 
             BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
-
             if BPE_TOKEN_TEXT != "":
                 yield (BPE_TOKEN_TEXT, TokenType.BPE)
 
-            bpe_start = bound_end
-            SPECIAL_TOKEN_TEXT = corpus[bound_start:bound_end]
-
+            # FIND SPECIAL TOKEN
+            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
             if SPECIAL_TOKEN_TEXT != "":
                 yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
 
-    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+            # now save the new bpe start point
+            # it will used in the next interaction
+            bpe_start = special_token_end
+
+    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+        """
+        Find each time the start and end (not included) of the special token
+        Args:
+            corpus (str): the string where the special token will be searched
+        Yields:
+            Generator[tuple[int, int]]: Note the end is not included
+        """
 
         for match in self.__special_token_regex.finditer(corpus):
             start = match.start()
             end = match.end()
 
             yield (start, end)
+
+        # make the last boundary be the end of corpus
+        # eof = len(corpus)
+        # yield(eof,eof)
+
+
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
+
+        not_special_token_list : list[int]= []
+        for token in corpus:
+            if token > self.__max_bpe_token_id:
+
+                if len(not_special_token_list) > 0:
+                    yield (not_special_token_list, TokenType.BPE)
+                    not_special_token_list = []
+
+                yield (token, TokenType.SPECIAL)
+                continue
+
+            not_special_token_list.append(token)
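The new split_tokens() is the decode-side counterpart of split_text(): it partitions a token stream by comparing each ID against max_bpe_token_id. A condensed sketch with plain strings standing in for the TokenType enum. Two details are worth noting: the special ID is yielded as a one-element list, matching the fix a later patch in this series applies, and a trailing BPE run after the last special token is dropped, exactly as the "malformed" splitter test below expects.

```python
from typing import Generator


def split_tokens(
    corpus: list[int], max_bpe_token_id: int
) -> Generator[tuple[list[int], str], None, None]:
    buffer: list[int] = []
    for token in corpus:
        if token > max_bpe_token_id:  # special token: flush the BPE run first
            if buffer:
                yield (buffer, "BPE")
                buffer = []
            yield ([token], "SPECIAL")
            continue
        buffer.append(token)
    # no flush here: a trailing BPE run is treated as malformed and dropped


assert list(split_tokens([100, 101, 1477, 100], 1473)) == [
    ([100, 101], "BPE"),
    ([1477], "SPECIAL"),
]
```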
From 070dc1b744b462b53ee89d2dc59f7fd98e201eba Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:04:06 +0200
Subject: [PATCH 60/75] implemented token nano for the BPE encoding/decoding

---
 Project_Model/Libs/BPE/Classes/TokeNano.py    |  8 ++
 .../Libs/BPE/Classes/TokeNanoCore.py          | 79 +++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/Project_Model/Libs/BPE/Classes/TokeNano.py b/Project_Model/Libs/BPE/Classes/TokeNano.py
index e69de29..1088f7d 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNano.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNano.py
@@ -0,0 +1,8 @@
+
+from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
+
+class TokeNano:
+
+    def __init__(self):
+
+        pass
\ No newline at end of file
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index e69de29..c719219 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+
+from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
+from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
+from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
+
+from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Project_Model.Libs.BPE.Enums import TokenType
+from Project_Model.Libs.BPE.Utils.json_utils import load_json
+class TokeNanoCore:
+    def __init__(self,
+                 bpe_vocabulary: dict[tuple[int, int], int]
+                 # special_vocabulary: dict[str, int]
+                 ):
+        self._bpe = NanoSocratesBPE(bpe_vocabulary)
+
+        # special_vocabulary = [token.value for token in SpecialToken]
+        special_token_list = [token.value for token in SpecialToken]
+        self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list),self._bpe.vocabulary_size)
+
+        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None) # technically its not a bpe but more something like an "autoencoder"
+        self.prepare_special_token_vocabulary()
+
+
+    def encode(self, corpus : str) -> list[int]:
+        output : list[int] = []
+        for piece, token_type in self._splitter.split_text(corpus):
+
+            if token_type == TokenType.SPECIAL:
+                output.extend(self._special_bpe.encode(piece))
+
+            # slow but clear
+            if token_type == TokenType.BPE:
+                output.extend(self._bpe.encode(piece))
+
+        return output
+
+
+
+    def decode(self, corpus : list[int])-> str:
+        output_str = ''
+        for token, token_type in self._splitter.split_tokens(corpus):
+            # token is an integer if special, a list of integer otherwise
+            if token_type == TokenType.SPECIAL:
+                output_str += self._special_bpe.decode(token) # it accept an integer
+
+            # slow but clear
+            if token_type == TokenType.BPE:
+                output_str += self._bpe.decode(token) # it accept a list of integer
+        return output_str
+
+
+
+    def prepare_special_token_vocabulary(self):
+        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
+
+        for special_token in [token.value for token in SpecialToken]:
+            self._special_bpe.add_special_word_to_vocabulary(special_token)
+
+        self._special_bpe.build_reverse_vocabulary()
+
+
+if __name__ == "__main__":
+    dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json"
+    dictionary = load_json(Path(dictionary_path))
+
+    tokenano = TokeNanoCore(dictionary)
+
+    corpus = "dbp-dbr:How_It_Should_Have_Ended"
+    print(corpus)
+
+    encoded_list = tokenano.encode(corpus)
+    print(encoded_list)
+
+    decoded_string = tokenano.decode(encoded_list)
+    print(decoded_string)
+
+# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
\ No newline at end of file
From 09f7b39512a72de432afd245b5efc9d87ccd6207 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:04:47 +0200
Subject: [PATCH 61/75] test files updated

---
 Project_Model/Tests/bpe_test.py      |  3 +-
 Project_Model/Tests/splitter_test.py | 49 ++++++++++++++++++++++++++++
 Project_Model/Tests/tokenano_test.py | 21 ++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 Project_Model/Tests/tokenano_test.py

diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py
index e6c8f31..0acae46 100644
--- a/Project_Model/Tests/bpe_test.py
+++ b/Project_Model/Tests/bpe_test.py
@@ -70,4 +70,5 @@ class TestBPE:
 
 # Useful to debug weird cases
 if __name__ == "__main__":
-    TestBPE().test_bpe_decoding_simple()
+    # TestBPE().test_bpe_decoding_simple()
+    TestBPE().test_bpe_encoding_simple()
diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py
index eda95b6..154e24e 100644
--- a/Project_Model/Tests/splitter_test.py
+++ b/Project_Model/Tests/splitter_test.py
@@ -45,6 +45,7 @@ class TestSplitter:
             ("", TokenType.SPECIAL),
             ("m d", TokenType.BPE),
             ("", TokenType.SPECIAL),
+            #("olor", TokenType.BPE)
         ]
 
         CHUNKS = list(SPLITTER.split_text(TEXT))
@@ -129,3 +130,51 @@
 
             assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
             assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
+
+    def test_split_token_decode_simple(self):
+        # to test the token split into special and bpe
+        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
+        token_list = [100,101,1477]
+
+        CHUNKS = 
list(SPLITTER.split_tokens(token_list)) + EXPECTED_CHUNKS = [ + ([100,101], TokenType.BPE), + (1477, TokenType.SPECIAL), + ] + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_token_decode_simple_malformed(self): + # to test the token split into special and bpe + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473) + token_list = [100,101,1477,100] + + CHUNKS = list(SPLITTER.split_tokens(token_list)) + EXPECTED_CHUNKS = [ + ([100,101], TokenType.BPE), + (1477, TokenType.SPECIAL), + ] + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + + +# Useful to debug weird cases +if __name__ == "__main__": + TestSplitter().test_split_trailing_text() \ No newline at end of file diff --git a/Project_Model/Tests/tokenano_test.py b/Project_Model/Tests/tokenano_test.py new file mode 100644 index 0000000..2dc7779 --- /dev/null +++ b/Project_Model/Tests/tokenano_test.py @@ -0,0 +1,21 @@ + +from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore + +class TestTokeNano: + + def test_decode_encode_simple(self): + TEXT = "abababab" + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} + # EXPECTED = [258] + + TOKE_NANO = TokeNanoCore(VOCABULARY) + + ENCODED = TOKE_NANO.encode(TEXT) + DECODED = TOKE_NANO.decode(ENCODED) + + assert TEXT == DECODED From 845d6453488a0f65c30af664725dd623a40c9c09 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 3 Oct 2025 10:38:35 +0200 Subject: [PATCH 62/75] added some stubs on special_regex_maker --- Project_Model/Libs/BPE/Utils/special_regex_maker.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Project_Model/Libs/BPE/Utils/special_regex_maker.py b/Project_Model/Libs/BPE/Utils/special_regex_maker.py index 414eabf..fd4ac28 100644 --- a/Project_Model/Libs/BPE/Utils/special_regex_maker.py +++ b/Project_Model/Libs/BPE/Utils/special_regex_maker.py @@ -2,6 +2,13 @@ import re def special_regex_maker(special_tokens: list[str]) -> re.Pattern: + """ compile a regex for the special token + Args: + special_tokens (list[str]): the list of special token + + Returns: + re.Pattern: + """ REGEX_STR = "|".join(special_tokens) From e8894504c60d648698cf29d2cb72cd7a1a1edebd Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 11:44:44 +0200 Subject: [PATCH 63/75] Fixed a bug where a token (int) was yielded instead of a list of int --- .../Libs/BPE/Classes/NanoSocratesSplitter.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py index 399fa77..6e0abc2 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py +++ 
b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py @@ -1,23 +1,20 @@ import re -from collections import deque +from collections import deque from typing import Generator from ..Enums import TokenType - class NanoSocratesSplitter: def __init__( - self, - special_token_regex: re.Pattern, - max_bpe_token_id: int = 255 + self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255 ) -> None: # attention the regex got already compiled self.__special_token_regex = special_token_regex - self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding + self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]: - """ Split a text using a regex given + """Split a text using a regex given Args: corpus (str): all the corpus string to split Yields: @@ -26,7 +23,7 @@ class NanoSocratesSplitter: """ bpe_start = 0 - bpe_end = len(corpus) # this can be deleted! + bpe_end = len(corpus) # this can be deleted! for special_token_start, special_token_end in self.__find_boundaries(corpus): @@ -45,7 +42,6 @@ class NanoSocratesSplitter: # it will used in the next interaction bpe_start = special_token_end - def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]: """ Find each time the start and end (not included) of the special token @@ -53,21 +49,20 @@ class NanoSocratesSplitter: corpus (str): the string where the special token will be searched Yields: Generator[tuple[int, int]]: Note the end is not included - """ + """ for match in self.__special_token_regex.finditer(corpus): start = match.start() end = match.end() yield (start, end) - + # make the last boundary be the end of corpus # eof = len(corpus) # yield(eof,eof) + def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]: - def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] : - - not_special_token_list : list[int]= [] + not_special_token_list: list[int] = [] for token in corpus: if token > self.__max_bpe_token_id: @@ -75,8 +70,7 @@ class NanoSocratesSplitter: yield (not_special_token_list, TokenType.BPE) not_special_token_list = [] - yield (token, TokenType.SPECIAL) + yield ([token], TokenType.SPECIAL) continue - - not_special_token_list.append(token) + not_special_token_list.append(token) From 6b9cb7cd352e2a297a80816734ffe3ff21cd674c Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:26:42 +0200 Subject: [PATCH 64/75] Modified imports --- Project_Model/Libs/BPE/Classes/__init__.py | 1 + Project_Model/Libs/BPE/Utils/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index d3b93b6..bab5bd8 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -4,6 +4,7 @@ from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE from .NanoSocraTrainer import NanoSocraTrainer from .NanoSocraTraineRam import NanoSocraTraineRam from .NanoSocraTrainerPool import NanoSocraTrainerPool +from .NanoSocratesSpecial import NanoSocratesSpecial __all__ = [ "NanoSocratesChunker", diff --git a/Project_Model/Libs/BPE/Utils/__init__.py b/Project_Model/Libs/BPE/Utils/__init__.py index 3eb9eb3..f9213c6 100644 --- a/Project_Model/Libs/BPE/Utils/__init__.py +++ b/Project_Model/Libs/BPE/Utils/__init__.py @@ -2,6 +2,7 @@ from .special_regex_maker import special_regex_maker from 
.lag_checker_iterator import iterator_with_checks from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary from .json_utils import save_json, load_json +from .special_regex_maker import special_regex_maker __all__ = [ "special_regex_maker", From c5c0c61f797773a96f1a3fe582e8998c5d5254cd Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:26:58 +0200 Subject: [PATCH 65/75] Fix of bugs and semantics --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 56 ++++------- .../Libs/BPE/Classes/NanoSocratesSpecial.py | 65 ++++++------- .../Libs/BPE/Classes/NanoSocratesSplitter.py | 24 ++++- .../Libs/BPE/Classes/TokeNanoCore.py | 97 +++++++------------ Project_Model/Libs/BPE/Enums/SpecialToken.py | 21 ++++ 5 files changed, 134 insertions(+), 129 deletions(-) create mode 100644 Project_Model/Libs/BPE/Enums/SpecialToken.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index baa5efd..d517f04 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -2,20 +2,18 @@ from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException + # ABOUT THE DICTIONARY: # the string is converted into utf-char bytes, that is: each char is rappresented with a set of bytes from 1 to 4. # each bytes get casted into an integer; such that, if an integer has its value lower then 256, # then it is rappresenting an utf-char-byte, otherwise it is a token-ID. class NanoSocratesBatchMemoryBPE: - """ Memory to batch training. Keeps token couple frequencies, and merge_treshold - """ + """Memory to batch training. Keeps token couple frequencies, and merge_treshold""" def __init__( - self, - frequencies: dict[tuple[int, int], int], - merge_treshold: int + self, frequencies: dict[tuple[int, int], int], merge_treshold: int ) -> None: - + self.frequencies = frequencies self.merge_treshold = merge_treshold @@ -39,7 +37,6 @@ class NanoSocratesBPE(Encoder): self.__vocabulary[key] = value self.__reverse_vocabulary[value] = key - @property def vocabulary_size(self): return len(self.__vocabulary) + 256 @@ -62,7 +59,7 @@ class NanoSocratesBPE(Encoder): self, chunk_data: list[int], memory: NanoSocratesBatchMemoryBPE, - last_batch: bool + last_batch: bool, ): ENCODED_CHUNK = self.encode_intermediate(chunk_data) @@ -70,7 +67,7 @@ class NanoSocratesBPE(Encoder): # update frequency of each couple of element for i in range(0, DATA_LEN_BEFORE_LAST): - CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1]) + CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1]) frequency = memory.frequencies.get(CANDIDATE_COUPLE) @@ -82,7 +79,6 @@ class NanoSocratesBPE(Encoder): frequency += 1 memory.frequencies[CANDIDATE_COUPLE] = frequency - if not last_batch: return (self, memory, ENCODED_CHUNK) @@ -100,9 +96,6 @@ class NanoSocratesBPE(Encoder): return (self, memory, ENCODED_CHUNK) - - - def encode(self, piece: str) -> list[int]: """Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate() Args: @@ -114,12 +107,12 @@ class NanoSocratesBPE(Encoder): return self.encode_intermediate(converted_piece) def encode_intermediate(self, piece: list[int]) -> list[int]: - """ Encode a piece (as list of integer) till its maximum + """Encode a piece (as list of integer) till its maximum Args: piece (list[int]): piece to encode Returns: - list[int]: piece 
encoded - """ + list[int]: piece encoded + """ current_piece = piece new_piece = self.__round_encode(current_piece) @@ -130,9 +123,8 @@ class NanoSocratesBPE(Encoder): return current_piece - def __round_encode(self, piece: list[int]): - """ A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n + """A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n 1) "ABAB" -> "XX" 2) "XX" -> "Y" Args: @@ -146,22 +138,25 @@ class NanoSocratesBPE(Encoder): return piece PIECE_LENGTH = len(piece) - 1 - NEW_PIECE : list[int]= [] + NEW_PIECE: list[int] = [] index = 0 while index < PIECE_LENGTH: - CANDIDATE_WORD = (piece[index], piece[index + 1]) # take a tuple of consecutive element [int] + CANDIDATE_WORD = ( + piece[index], + piece[index + 1], + ) # take a tuple of consecutive element [int] CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD) # if no token to substitute the tuple, append the first element if CANDIDATE_TOKEN is None: - NEW_PIECE.append(piece[index]) + NEW_PIECE.append(piece[index]) index += 1 # if the latter element of the tuple is the last element of the piece, append it if index == PIECE_LENGTH: - NEW_PIECE.append(piece[index]) + NEW_PIECE.append(piece[index]) continue @@ -169,13 +164,10 @@ class NanoSocratesBPE(Encoder): NEW_PIECE.append(CANDIDATE_TOKEN) index += 2 - return NEW_PIECE - # TODO: Remake decode to take a list of token IDs def decode(self, token_ids: list[int]) -> str: - # deque: double ended queue token_stack: deque[int] = deque(token_ids) @@ -185,19 +177,13 @@ class NanoSocratesBPE(Encoder): TOKEN_ID = token_stack.popleft() if TOKEN_ID < 256: - UTF_8_STRING_ARR.append( - TOKEN_ID - ) + UTF_8_STRING_ARR.append(TOKEN_ID) continue left_token, right_token = self.__token_decode(TOKEN_ID) - token_stack.appendleft( - right_token - ) - token_stack.appendleft( - left_token - ) + token_stack.appendleft(right_token) + token_stack.appendleft(left_token) return UTF_8_STRING_ARR.decode("utf-8") @@ -211,7 +197,7 @@ class NanoSocratesBPE(Encoder): return CANDIDATE_DECODED def __learn_word(self, words: tuple[int, int]): - """ learn a new couple of object in the vocabulary + """learn a new couple of object in the vocabulary Args: words (tuple[int, int]): the Pair of element to substitute with a new tokenID diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py index 8fe81bb..61d4741 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py @@ -1,47 +1,46 @@ from .Encoder import Encoder from ..Errors import OutOfDictionaryException + class NanoSocratesSpecial(Encoder): def __init__( - self, - vocabulary_index: int , - vocabulary: dict[str, int] | None = None - ) -> None: - + self, bpe_vocabulary_size: int, special_tokens: list[str] = [] + ) -> None: + super().__init__() - if vocabulary is None: - self.__vocabulary: dict[str, int] = {} - else: - self.__vocabulary: dict[str, int] = vocabulary - + self.__bpe_offset = bpe_vocabulary_size + self.__vocabulary: dict[str, int] = {} self.__reverse_vocabulary: dict[int, str] = {} - if vocabulary_index is None: - self.__vocabulary_index = 0 - else: - self.__vocabulary_index = vocabulary_index + if len(special_tokens) == 0: + return - # self.__build_reverse_vocabulary() + for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens): + CANDIDATE_ID = self.__bpe_offset + index + 1 + self.__vocabulary[TOKEN] = 
CANDIDATE_ID + self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN + @property + def __next_id(self): + BPE_OFFSET = self.__bpe_offset + VOC_LENGTH = len(self.__vocabulary) + return BPE_OFFSET + VOC_LENGTH + 1 - def build_reverse_vocabulary(self): - self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()} + @property + def vocabulary(self) -> dict[str, int]: + return self.__vocabulary - # @property - # def vocabulary_size(self): - # return self.__current_index + @property + def reverse_vocabulary(self) -> dict[int, str]: + return self.__reverse_vocabulary - def set_vocabulary_index(self, vocabulary_index: int): - self.__vocabulary_index = vocabulary_index - - def add_special_word_to_vocabulary(self, word:str): - self.__vocabulary_index = self.__vocabulary_index + 1 - CURRENT_INDEX = self.__vocabulary_index - self.__vocabulary[word] = CURRENT_INDEX - self.__reverse_vocabulary[CURRENT_INDEX] = word + def add_special_word_to_vocabulary(self, word: str): + CANDIDATE_INDEX = self.__next_id + self.__vocabulary[word] = CANDIDATE_INDEX + self.__reverse_vocabulary[CANDIDATE_INDEX] = word def encode(self, word: str) -> list[int]: ID = self.__vocabulary.get(word) @@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder): return [ID] - def decode(self, token_id: int) -> str: + def decode(self, token_id: list[int]) -> str: - ID = token_id + if len(token_id) != 1: + raise OutOfDictionaryException() + + ID = token_id[0] WORD = self.__reverse_vocabulary.get(ID) if WORD is None: raise OutOfDictionaryException() return WORD - - def get_reverse_vocabulary(self)-> dict[int, str]: - return self.__reverse_vocabulary diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py index 6e0abc2..02a8ccf 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py @@ -31,7 +31,8 @@ class NanoSocratesSplitter: bpe_end = special_token_start BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end] if BPE_TOKEN_TEXT != "": - yield (BPE_TOKEN_TEXT, TokenType.BPE) + for WORD in self.__split_words(BPE_TOKEN_TEXT): + yield (WORD, TokenType.BPE) # FIND SPECIAL TOKEN SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end] @@ -60,6 +61,27 @@ class NanoSocratesSplitter: # eof = len(corpus) # yield(eof,eof) + def __split_words(self, bpe_piece: str) -> Generator[str]: + + END_OF_STRING = len(bpe_piece) + bound_start = 0 + bound_end = END_OF_STRING + 1 + for i in range(0, END_OF_STRING): + + CANDIDATE_CHAR = bpe_piece[i] + + if CANDIDATE_CHAR != " ": + continue + + bound_end = i + + yield bpe_piece[bound_start:bound_end] + + bound_start = bound_end + bound_end = END_OF_STRING + 1 + + yield bpe_piece[bound_start:bound_end] + def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]: not_special_token_list: list[int] = [] diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py index c719219..f726a95 100644 --- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py +++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py @@ -1,79 +1,56 @@ from pathlib import Path -from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter -from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE -from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial +from ..Classes import NanoSocratesSplitter +from ..Classes import NanoSocratesBPE +from ..Classes import NanoSocratesSpecial + 
+from ..Utils import special_regex_maker +from ..Enums import TokenType + -from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker -from Scripts.Libs.CleaningPipeline.special_token import SpecialToken -from Project_Model.Libs.BPE.Enums import TokenType -from Project_Model.Libs.BPE.Utils.json_utils import load_json class TokeNanoCore: - def __init__(self, - bpe_vocabulary: dict[tuple[int, int], int] - # special_vocabulary: dict[str, int] - ): - self._bpe = NanoSocratesBPE(bpe_vocabulary) - - # special_vocabulary = [token.value for token in SpecialToken] - special_token_list = [token.value for token in SpecialToken] - self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list),self._bpe.vocabulary_size) + def __init__( + self, + bpe_vocabulary: dict[tuple[int, int], int], + special_token_list: list[str], + # special_vocabulary: dict[str, int] + ): - self._special_bpe = NanoSocratesSpecial(vocabulary_index=None) # technically its not a bpe but more something like an "autoencoder" - self.prepare_special_token_vocabulary() - - - def encode(self, corpus : str) -> list[int]: - output : list[int] = [] - for piece, token_type in self._splitter.split_text(corpus): + self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary) + + SPECIAL_REGEX = special_regex_maker(special_token_list) + BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size + + self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE) + self.__special_encoder = NanoSocratesSpecial( + BPE_VOCABULARY_SIZE, special_token_list + ) + + def encode(self, corpus: str) -> list[int]: + output: list[int] = [] + for piece, token_type in self.__splitter.split_text(corpus): if token_type == TokenType.SPECIAL: - output.extend(self._special_bpe.encode(piece)) + output.extend(self.__special_encoder.encode(piece)) # slow but clear if token_type == TokenType.BPE: - output.extend(self._bpe.encode(piece)) + output.extend(self.__bpe_encoder.encode(piece)) return output - - - def decode(self, corpus : list[int])-> str: - output_str = '' - for token, token_type in self._splitter.split_tokens(corpus): + def decode(self, corpus: list[int]) -> str: + output_str = "" + for token, token_type in self.__splitter.split_tokens(corpus): # token is an integer if special, a list of integer otherwise if token_type == TokenType.SPECIAL: - output_str += self._special_bpe.decode(token) # it accept an integer + output_str += self.__special_encoder.decode( + token + ) # it accept an integer # slow but clear if token_type == TokenType.BPE: - output_str += self._bpe.decode(token) # it accept a list of integer + output_str += self.__bpe_encoder.decode( + token + ) # it accept a list of integer return output_str - - - - def prepare_special_token_vocabulary(self): - self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size) - - for special_token in [token.value for token in SpecialToken]: - self._special_bpe.add_special_word_to_vocabulary(special_token) - - self._special_bpe.build_reverse_vocabulary() - - -if __name__ == "__main__": - dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json" - dictionary = load_json(Path(dictionary_path)) - - tokenano = TokeNanoCore(dictionary) - - corpus = "dbp-dbr:How_It_Should_Have_Ended" - print(corpus) - - encoded_list = tokenano.encode(corpus) - print(encoded_list) - - decoded_string = tokenano.decode(encoded_list) - print(decoded_string) - -# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 
95, 69, 110, 100, 101, 100, 1478] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Enums/SpecialToken.py b/Project_Model/Libs/BPE/Enums/SpecialToken.py new file mode 100644 index 0000000..3f25a2d --- /dev/null +++ b/Project_Model/Libs/BPE/Enums/SpecialToken.py @@ -0,0 +1,21 @@ +from enum import Enum + + +class SpecialToken(Enum): + # (Enum, str) -> throws an error + START_TRIPLE_LIST = "" + START_TRIPLE = "" + END_TRIPLE = "" + SUBJECT = "" + RELATIONSHIP = "" + OBJECT = "" + ABSTRACT = "" + CORPUS_END = "" + + ## Tasks' Token + RDF_TO_TEXT = "" + TEXT_TO_RDF = "" + CONTINUE_RDF = "" + MASK = "" + + # BPE Training: From 51f491d0334c0f70972eee4fb986e706db53877d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:27:17 +0200 Subject: [PATCH 66/75] fixed typos --- Project_Model/Libs/BPE/Utils/special_regex_maker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Utils/special_regex_maker.py b/Project_Model/Libs/BPE/Utils/special_regex_maker.py index fd4ac28..c2d3add 100644 --- a/Project_Model/Libs/BPE/Utils/special_regex_maker.py +++ b/Project_Model/Libs/BPE/Utils/special_regex_maker.py @@ -2,15 +2,14 @@ import re def special_regex_maker(special_tokens: list[str]) -> re.Pattern: - """ compile a regex for the special token + """compile a regex for the special token Args: special_tokens (list[str]): the list of special token Returns: re.Pattern: - """ + """ REGEX_STR = "|".join(special_tokens) return re.compile(REGEX_STR) - From c74689d01d0b8c3c5217cc15a806200c58d6eef0 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:27:38 +0200 Subject: [PATCH 67/75] Fixed tests to reflect new version of tokenizer --- Project_Model/Tests/splitter_test.py | 26 ++++++++++++++------------ Project_Model/Tests/tokenano_test.py | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py index 154e24e..2bf9a0f 100644 --- a/Project_Model/Tests/splitter_test.py +++ b/Project_Model/Tests/splitter_test.py @@ -18,7 +18,8 @@ class TestSplitter: EXPECTED_CHUNKS = [ ("", TokenType.SPECIAL), - ("Lorem ", TokenType.BPE), + ("Lorem", TokenType.BPE), + (" ", TokenType.BPE), ("", TokenType.SPECIAL), ] @@ -43,9 +44,10 @@ class TestSplitter: EXPECTED_CHUNKS = [ ("ipsu", TokenType.BPE), ("", TokenType.SPECIAL), - ("m d", TokenType.BPE), + ("m", TokenType.BPE), + (" d", TokenType.BPE), ("", TokenType.SPECIAL), - #("olor", TokenType.BPE) + # ("olor", TokenType.BPE) ] CHUNKS = list(SPLITTER.split_text(TEXT)) @@ -69,7 +71,8 @@ class TestSplitter: EXPECTED_CHUNKS = [ ("ipsu", TokenType.BPE), ("", TokenType.SPECIAL), - ("m d", TokenType.BPE), + ("m", TokenType.BPE), + (" d", TokenType.BPE), ("", TokenType.SPECIAL), ("", TokenType.SPECIAL), ("", TokenType.SPECIAL), @@ -134,12 +137,12 @@ class TestSplitter: def test_split_token_decode_simple(self): # to test the token split into special and bpe SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473) - token_list = [100,101,1477] + token_list = [100, 101, 1477] CHUNKS = list(SPLITTER.split_tokens(token_list)) EXPECTED_CHUNKS = [ - ([100,101], TokenType.BPE), - (1477, TokenType.SPECIAL), + ([100, 101], TokenType.BPE), + ([1477], TokenType.SPECIAL), ] assert len(CHUNKS) == len(EXPECTED_CHUNKS) @@ -155,12 +158,12 @@ class TestSplitter: def test_split_token_decode_simple_malformed(self): # to test the 
token split into special and bpe SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473) - token_list = [100,101,1477,100] + token_list = [100, 101, 1477, 100] CHUNKS = list(SPLITTER.split_tokens(token_list)) EXPECTED_CHUNKS = [ - ([100,101], TokenType.BPE), - (1477, TokenType.SPECIAL), + ([100, 101], TokenType.BPE), + ([1477], TokenType.SPECIAL), ] assert len(CHUNKS) == len(EXPECTED_CHUNKS) @@ -174,7 +177,6 @@ class TestSplitter: assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE - # Useful to debug weird cases if __name__ == "__main__": - TestSplitter().test_split_trailing_text() \ No newline at end of file + TestSplitter().test_split_trailing_text() diff --git a/Project_Model/Tests/tokenano_test.py b/Project_Model/Tests/tokenano_test.py index 2dc7779..c8f0d88 100644 --- a/Project_Model/Tests/tokenano_test.py +++ b/Project_Model/Tests/tokenano_test.py @@ -13,7 +13,7 @@ class TestTokeNano: VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} # EXPECTED = [258] - TOKE_NANO = TokeNanoCore(VOCABULARY) + TOKE_NANO = TokeNanoCore(VOCABULARY, ["", ""]) ENCODED = TOKE_NANO.encode(TEXT) DECODED = TOKE_NANO.decode(ENCODED) From 9c5f42153f836abc6e7331a04c47b7fbe4c6d9aa Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:17:44 +0200 Subject: [PATCH 68/75] fixed typos --- Project_Model/Libs/BPE/Utils/json_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Utils/json_utils.py b/Project_Model/Libs/BPE/Utils/json_utils.py index 716e93a..b98ac2f 100644 --- a/Project_Model/Libs/BPE/Utils/json_utils.py +++ b/Project_Model/Libs/BPE/Utils/json_utils.py @@ -2,15 +2,15 @@ import json from pathlib import Path -def save_json(vocabulary: dict, path: Path): +def save_json(dictionary: dict, path: Path): - json_string = json.dumps(vocabulary) + json_string = json.dumps(dictionary) FILE = open(path, "w") FILE.write(json_string) FILE.close() -def load_json(path: Path) -> dict[tuple[int, int], int]: +def load_json(path: Path) -> dict: FILE = open(path, "r") json_string = FILE.read() FILE.close() From 55e0d2ac23412680d03933d5968a9a12b09fbf6d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:08:11 +0200 Subject: [PATCH 69/75] Fixed a encoding bug --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index d517f04..a74412d 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -162,7 +162,11 @@ class NanoSocratesBPE(Encoder): # in this case there was a candidate token to substitute the couple of element NEW_PIECE.append(CANDIDATE_TOKEN) - index += 2 + + index += 1 + if index == PIECE_LENGTH: + NEW_PIECE.append(piece[index]) + index += 1 return NEW_PIECE From 0ee6e480044ce6e4fc5acebaf3756dc587416f97 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:09:53 +0200 Subject: [PATCH 70/75] Fixed the same bug as before, but this time is correct --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index a74412d..a5dab9b 100644 --- 
a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -163,10 +163,10 @@ class NanoSocratesBPE(Encoder):
 
             # in this case there was a candidate token to substitute the couple of element
             NEW_PIECE.append(CANDIDATE_TOKEN)
-            index += 1
+            index += 2
+
             if index == PIECE_LENGTH:
                 NEW_PIECE.append(piece[index])
-                index += 1
 
         return NEW_PIECE
 

From 0f95aeb1224051bf462e3dc3d4c7662a1e72665d Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 16:26:01 +0200
Subject: [PATCH 71/75] toy dictionary for bpe implemented

---
 Assets/Model/toy_10/README.md           | 3 +++
 Assets/Model/toy_10/toy_dictionary.json | 3 +++
 2 files changed, 6 insertions(+)
 create mode 100644 Assets/Model/toy_10/README.md
 create mode 100644 Assets/Model/toy_10/toy_dictionary.json

diff --git a/Assets/Model/toy_10/README.md b/Assets/Model/toy_10/README.md
new file mode 100644
index 0000000..b97981a
--- /dev/null
+++ b/Assets/Model/toy_10/README.md
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:806baf1def1f5b785775ae8e4bcf028d897206da2edd76b6702b1838f5880923
+size 283
diff --git a/Assets/Model/toy_10/toy_dictionary.json b/Assets/Model/toy_10/toy_dictionary.json
new file mode 100644
index 0000000..5f47d51
--- /dev/null
+++ b/Assets/Model/toy_10/toy_dictionary.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b091b3b15bfc49b470bb9da158bc167aa797897f1ed11c012268eb4d520654b
+size 183342

From d2a3dfe90fced99aaf2d5a1802b5cb3e35b6eab7 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 17:59:46 +0200
Subject: [PATCH 72/75] Fixed bug

---
 Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
index a5dab9b..bcb0c0f 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -164,9 +164,9 @@ class NanoSocratesBPE(Encoder):
             NEW_PIECE.append(CANDIDATE_TOKEN)
 
             index += 2
-
+
             if index == PIECE_LENGTH:
-                NEW_PIECE.append(piece[index]) 
+                NEW_PIECE.append(piece[index])
 
         return NEW_PIECE
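Patches 69, 70 and 72 above all circle the same off-by-one: after a merge, the loop can stop one element short and silently drop the trailing element. A condensed sketch of the loop in its final, fixed state, with a toy merge table (illustrative, not the class's exact code):

```python
TOY_MERGES: dict[tuple[int, int], int] = {(97, 98): 256}  # "ab" -> 256


def round_encode(piece: list[int]) -> list[int]:
    if len(piece) == 1:
        return piece

    last = len(piece) - 1
    out: list[int] = []
    index = 0
    while index < last:
        token = TOY_MERGES.get((piece[index], piece[index + 1]))

        if token is None:
            out.append(piece[index])
            index += 1
            if index == last:  # unmerged trailing element
                out.append(piece[index])
            continue

        out.append(token)
        index += 2
        if index == last:  # the fix: a merge can also leave a trailer
            out.append(piece[index])
    return out


# "aba": the merge consumes positions 0-1 and index lands on `last`,
# so the trailing "a" is appended instead of being lost.
assert round_encode([97, 98, 97]) == [256, 97]
```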
From 8a21cb1b73d46c51c07e1c9f0ce0f565649ae2ef Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:00:52 +0200
Subject: [PATCH 73/75] added python analysis

---
 .vscode/settings.json | 59 +++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index cae6d86..7f479da 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,35 +1,34 @@
 {
-  // Always treat the project root as the working dir for Jupyter
-  "jupyter.notebookFileRoot": "${workspaceFolder}",
-
-  // When you click "Run Python File in Terminal", DON'T cd into the file's folder
-  "python.terminal.executeInFileDir": false,
-
-  // Start new integrated terminals at the project root
-  "terminal.integrated.cwd": "${workspaceFolder}",
-
-  // Make pytest run from the root without needing a pytest.ini
-  "python.testing.pytestEnabled": true,
-  "python.testing.cwd": "${workspaceFolder}",
-  "python.testing.pytestArgs": ["src/test"],
-
-  // Help Pylance resolve imports like `from src...` without red squiggles
-  "python.analysis.extraPaths": ["${workspaceFolder}"],
-
-  // For linux
-  "terminal.integrated.env.linux": {
-    "PYTHONPATH": "${workspaceFolder}"
-  },
-  // For OSX
-  "terminal.integrated.env.osx": {
-    "PYTHONPATH": "${workspaceFolder}"
-  },
-  // For Windows
-  "terminal.integrated.env.windows": {
-    "PYTHONPATH": "${workspaceFolder}"
-  }
+    // Always treat the project root as the working dir for Jupyter
+    "jupyter.notebookFileRoot": "${workspaceFolder}",
+    // When you click "Run Python File in Terminal", DON'T cd into the file's folder
+    "python.terminal.executeInFileDir": false,
+    // Start new integrated terminals at the project root
+    "terminal.integrated.cwd": "${workspaceFolder}",
+    // Make pytest run from the root without needing a pytest.ini
+    "python.testing.pytestEnabled": true,
+    "python.testing.cwd": "${workspaceFolder}",
+    "python.testing.pytestArgs": [
+        "src/test"
+    ],
+    // Help Pylance resolve imports like `from src...` without red squiggles
+    "python.analysis.extraPaths": [
+        "${workspaceFolder}"
+    ],
+    // For linux
+    "terminal.integrated.env.linux": {
+        "PYTHONPATH": "${workspaceFolder}"
+    },
+    // For OSX
+    "terminal.integrated.env.osx": {
+        "PYTHONPATH": "${workspaceFolder}"
+    },
+    // For Windows
+    "terminal.integrated.env.windows": {
+        "PYTHONPATH": "${workspaceFolder}"
+    },
+    "python.analysis.typeCheckingMode": "standard"
 }
-
 // {
 //     // Always treat the project root as the working dir for Jupyter
 //     "jupyter.notebookFileRoot": "${workspaceFolder}",

From 149deb407db18fd053b8916879e290a0855da0bf Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:01:05 +0200
Subject: [PATCH 74/75] added cache directories

---
 .gitignore                                       | 1 +
 Project_Model/Tests/trainer_files/cache/.gitkeep | 0
 2 files changed, 1 insertion(+)
 create mode 100644 Project_Model/Tests/trainer_files/cache/.gitkeep

diff --git a/.gitignore b/.gitignore
index 0797ef4..314d94c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -254,4 +254,5 @@ $RECYCLE.BIN/
 
 # ---> Custom
 **/Tmp/**
+**/cache/**
 !**/.gitkeep
diff --git a/Project_Model/Tests/trainer_files/cache/.gitkeep b/Project_Model/Tests/trainer_files/cache/.gitkeep
new file mode 100644
index 0000000..e69de29

From 8e095ebb7a637de17d06c5fce4a1162e2c671b0a Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:02:27 +0200
Subject: [PATCH 75/75] Added papers stub

---
 docs/PAPERS.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 docs/PAPERS.md

diff --git a/docs/PAPERS.md b/docs/PAPERS.md
new file mode 100644
index 0000000..957d00a
--- /dev/null
+++ b/docs/PAPERS.md
@@ -0,0 +1,56 @@
+# Research Material
+
+## BPE
+
+- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
+- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
+- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
+- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
+- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
+- [Implementing a byte pair encoding(BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
+- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
+- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
+- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
+- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
+- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
+- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
+- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
+- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
+- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
+- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
+
+## Embedder
+
+- [ROFORMER: ENHANCED TRANSFORMER WITH ROTARY POSITION EMBEDDING](https://arxiv.org/pdf/2104.09864)
+- [You could have designed state of the art positional encoding](https://huggingface.co/blog/designing-positional-encoding)
+- [Rotary Embeddings: A Relative Revolution](https://blog.eleuther.ai/rotary-embeddings/)
+- [Round and Round We Go! What makes Rotary Positional Encodings useful?](https://arxiv.org/html/2410.06205v1)
+- [Inside RoPE: Rotary Magic into Position Embeddings](https://learnopencv.com/rope-position-embeddings/)
+- [What Rotary Position Embedding Can Tell Us: Identifying Query and Key Weights Corresponding to Basic Syntactic or High-level Semantic Information](https://openreview.net/pdf?id=e5Mv7iWfVW)
+- [A gentle introduction to Rotary Position Embedding](https://krasserm.github.io/2022/12/13/rotary-position-embedding/)
+- [Context-aware Rotary Position Embedding](https://arxiv.org/pdf/2507.23083)
+- [LIERE: GENERALIZING ROTARY POSITION ENCODINGS TO HIGHER DIMENSIONAL INPUTS](https://openreview.net/pdf?id=xHMMt7r3GW)
+- [Rotary Positional Embeddings (RoPE)](https://nn.labml.ai/transformers/rope/index.html)
+- [Decoding Llama3: An explainer for tinkerers](https://hasgeek.com/simrathanspal/the-llama3-guide/sub/decoding-llama3-part-4-rotary-positional-embedding-3K8ZHpdLi6E56N8ejnaWzm)
+
+## Attention
+
+- [Standard Self-Attention (Attention is all you need)](https://arxiv.org/pdf/1706.03762)
+- [TransMLA: Multi-Head Latent Attention Is All You Need](https://arxiv.org/pdf/2502.07864)
+- [A Gentle Introduction to Multi-Head Latent Attention (MLA)](https://machinelearningmastery.com/a-gentle-introduction-to-multi-head-latent-attention-mla/)
+- [Understanding Multi-Head Latent Attention](https://planetbanatt.net/articles/mla.html)
+- [DeepSeek's Multi-Head Latent Attention](https://liorsinai.github.io/machine-learning/2025/02/22/mla.html)
+- [MatchFormer: Interleaving Attention in Transformers for Feature Matching](https://arxiv.org/pdf/2203.09645)
+- [FIT: Far-reaching Interleaved Transformers](https://arxiv.org/pdf/2305.12689)
+- [Gemma explained: What’s new in Gemma 3](https://developers.googleblog.com/en/gemma-explained-whats-new-in-gemma-3/)
+- [The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)
+- [Attention was never enough: Tracing the rise of hybrid LLMs](https://www.ai21.com/blog/rise-of-hybrid-llms/)
+
+## Spanned Masking
+
+- [Salient Span Masking for Temporal Understanding](https://arxiv.org/pdf/2303.12860)
+- [PMI-MASKING: PRINCIPLED MASKING OF CORRELATED SPANS](https://arxiv.org/pdf/2010.01825)
+
+## Models
+
+- [What Language Model Architecture and Pretraining Objective Work Best for Zero-Shot Generalization?](https://arxiv.org/pdf/2204.05832)