From 0a698e9837367de4e42d5b7506ed2a84b4e8f440 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:09:52 +0200 Subject: [PATCH 01/75] Added schema to extract from DB for BPE --- .../bpe-pipeline.excalidraw.json | 897 ++++++++++++++++++ 1 file changed, 897 insertions(+) create mode 100644 Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json diff --git a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json new file mode 100644 index 0000000..0edf3cf --- /dev/null +++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json @@ -0,0 +1,897 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "3zbCui3XtIGozHXTVAGRp", + "type": "rectangle", + "x": 316.5, + "y": 123, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1698427950, + "version": 35, + "versionNonce": 601575602, + "isDeleted": false, + "boundElements": [ + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + } + ], + "updated": 1758818588814, + "link": null, + "locked": false + }, + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text", + "x": 480.98004150390625, + "y": 183.25, + "width": 107.5399169921875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 910769774, + "version": 31, + "versionNonce": 1120989938, + "isDeleted": false, + "boundElements": null, + "updated": 1758818416720, + "link": null, + "locked": false, + "text": "dataset.db", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3zbCui3XtIGozHXTVAGRp", + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "87-MeaiZGT1wln0nggYPZ", + "type": "rectangle", + "x": 339.5, + "y": 309.5, + "width": 392, + "height": 156, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 655550318, + "version": 77, + "versionNonce": 1103939826, + "isDeleted": false, + "boundElements": null, + "updated": 1758818339000, + "link": null, + "locked": false + }, + { + "id": "EjUxEhZqEBzwvlw0VE9eJ", + "type": "rectangle", + "x": 355.5, + "y": 327, + "width": 162, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": { + "type": 3 + }, + "seed": 1739846638, + "version": 64, + "versionNonce": 1594290034, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "ogRkV0neHrhEKTE6zlggl" + } + ], + "updated": 1758818391415, + "link": null, + "locked": false + 
}, + { + "id": "ogRkV0neHrhEKTE6zlggl", + "type": "text", + "x": 378.7100524902344, + "y": 377.25, + "width": 115.57989501953125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 2037675630, + "version": 12, + "versionNonce": 1286472046, + "isDeleted": false, + "boundElements": null, + "updated": 1758818399222, + "link": null, + "locked": false, + "text": "RDF_String", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "EjUxEhZqEBzwvlw0VE9eJ", + "originalText": "RDF_String", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hoIRMNiMJZl4YDo-hovWy", + "type": "rectangle", + "x": 542.5, + "y": 327, + "width": 173, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 1189796530, + "version": 99, + "versionNonce": 1071057006, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "rsapATFAT5YSBCXzLupgZ" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "rsapATFAT5YSBCXzLupgZ", + "type": "text", + "x": 585.6800384521484, + "y": 377.25, + "width": 86.63992309570312, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 829619694, + "version": 12, + "versionNonce": 713902318, + "isDeleted": false, + "boundElements": null, + "updated": 1758818405150, + "link": null, + "locked": false, + "text": "Abstract", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "hoIRMNiMJZl4YDo-hovWy", + "originalText": "Abstract", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "jSx8ApfhtRs_nk37VvDMb", + "type": "rectangle", + "x": 316.5, + "y": 511, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 492582894, + "version": 132, + "versionNonce": 893797614, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "6E23g-rgowNqHsBxX-LuM" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "6E23g-rgowNqHsBxX-LuM", + "type": "text", + "x": 499.9100341796875, + "y": 571.25, + "width": 69.679931640625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 267696178, + "version": 132, + "versionNonce": 1668243186, + "isDeleted": false, + "boundElements": null, + "updated": 1758818543211, + "link": null, + "locked": false, + "text": "Pandas", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "jSx8ApfhtRs_nk37VvDMb", + "originalText": "Pandas", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "ohj18N4AOTDz5lJNcV9gi", + "type": "rectangle", + "x": 261, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1446207150, + "version": 279, + "versionNonce": 317375026, + "isDeleted": false, + "boundElements": [ + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text", + "x": 297.0800323486328, + "y": 796.5, + "width": 84.83993530273438, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a9", + "roundness": null, + "seed": 435116270, + "version": 199, + "versionNonce": 1282911218, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "train.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ohj18N4AOTDz5lJNcV9gi", + "originalText": "train.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "A4Y54Y26fe257U_QU9lxX", + "type": "rectangle", + "x": 464, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": { + "type": 3 + }, + "seed": 186148850, + "version": 232, + "versionNonce": 997119858, + "isDeleted": false, + "boundElements": [ + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text", + "x": 476.3500442504883, + "y": 796.5, + "width": 132.29991149902344, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": null, + "seed": 1131059634, + "version": 171, + "versionNonce": 239540530, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "validation.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "A4Y54Y26fe257U_QU9lxX", + "originalText": "validation.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": 
"mPaYpJ9Xn7tlJPmKPqJKJ", + "type": "rectangle", + "x": 674.5, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": { + "type": 3 + }, + "seed": 1049323314, + "version": 235, + "versionNonce": 330560690, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "kg9nm2rpud6cax5aNPSnu" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "kg9nm2rpud6cax5aNPSnu", + "type": "text", + "x": 711.4300231933594, + "y": 796.5, + "width": 83.13995361328125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": null, + "seed": 522572142, + "version": 193, + "versionNonce": 1920372338, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "test.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "originalText": "test.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 195.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": null, + "seed": 873266098, + "version": 71, + "versionNonce": 541154738, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + -195.25, + 49.5 + ], + [ + -195.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "ohj18N4AOTDz5lJNcV9gi", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 218.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 1210817582, + "version": 77, + "versionNonce": 1483392370, + "isDeleted": false, + "boundElements": null, + "updated": 1758818580594, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + 218.25, + 49.5 + ], + [ + 218.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": 
"mPaYpJ9Xn7tlJPmKPqJKJ", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 0.5719232650604908, + "height": 99.07394122590165, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aK", + "roundness": null, + "seed": 1205316658, + "version": 96, + "versionNonce": 1748050674, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -0.5719232650604908, + 99.07394122590165 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "A4Y54Y26fe257U_QU9lxX", + "fixedPoint": [ + 0.44635717665566554, + -0.056621365219521276 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow", + "x": 539, + "y": 271.5, + "width": 0, + "height": 33.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": null, + "seed": 763990258, + "version": 17, + "versionNonce": 1028811378, + "isDeleted": false, + "boundElements": null, + "updated": 1758818588814, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 33.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "3zbCui3XtIGozHXTVAGRp", + "focus": -0.019473081328751418, + "gap": 3 + }, + "endBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": -1.0404624277456647, + "gap": 30.7545797799829 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow", + "x": 536.5, + "y": 468.5, + "width": 0, + "height": 39, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1489771054, + "version": 33, + "versionNonce": 1828178606, + "isDeleted": false, + "boundElements": null, + "updated": 1758818593647, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 39 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": 1.0693641618497107, + "gap": 27.157190169432425 + }, + "endBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "focus": 0.008018327605956525, + "gap": 3.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From 
ee0aa583d53d0e23daf60754586058be1ecf6c1d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:10:45 +0200 Subject: [PATCH 02/75] Added Docs for BPE research --- docs/BPE.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 docs/BPE.md diff --git a/docs/BPE.md b/docs/BPE.md new file mode 100644 index 0000000..02dca0a --- /dev/null +++ b/docs/BPE.md @@ -0,0 +1,21 @@ +# BPE + +## Research Material + +- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding) +- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5) +- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/) +- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0) +- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples) +- [Implementing a byte pair encoding (BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html) +- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671) +- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf) +- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720) +- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf) +- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837) +- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571) +- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633) +- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796) +- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343) + + From e521b0704e1941ede504f58a615d8a20fa77461b Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Thu, 25 Sep 2025 19:19:11 +0200 Subject: [PATCH 03/75] deleted TODO in path_splitter_tree, as it was already resolved --- Scripts/DataCleaning/path_splitter_tree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py index e7f6f9e..9c0914a 100644 --- a/Scripts/DataCleaning/path_splitter_tree.py +++ b/Scripts/DataCleaning/path_splitter_tree.py @@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str): FILE = open(file, "r", encoding="utf-8") - # TODO: Change here so it takes single URI from a CSV file # The CSV header name is required for row in csv.DictReader(FILE): From 1bbb4a0999ef289d7f17cb1231f12e95576eaae6 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:17:48 +0200 Subject: [PATCH 04/75] Added new paper --- docs/BPE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/BPE.md b/docs/BPE.md index 02dca0a..eee3bac 100644 --- a/docs/BPE.md +++ b/docs/BPE.md @@ -17,5 +17,6 @@
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633) +- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796) +- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343) +- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2) + + From 90012285b5473c078c5fa28457054a7954ac167a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:18:21 +0200 Subject: [PATCH 05/75] UML Diagram to explain bpe workflows --- Projec-Model/UML/bpe.excalidraw.json | 362 +++++++ .../bpe-pipeline.excalidraw.json | 897 ++++++++++++++++++ 2 files changed, 1259 insertions(+) create mode 100644 Projec-Model/UML/bpe.excalidraw.json create mode 100644 Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json diff --git a/Projec-Model/UML/bpe.excalidraw.json b/Projec-Model/UML/bpe.excalidraw.json new file mode 100644 index 0000000..1400c25 --- /dev/null +++ b/Projec-Model/UML/bpe.excalidraw.json @@ -0,0 +1,362 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "EcT-dGsjmfW571ov8Gg4F", + "type": "text", + "x": 425.5, + "y": 130, + "width": 506, + "height": 550, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 523521109, + "version": 758, + "versionNonce": 383976373, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758823931674, + "link": null, + "locked": false, + "text": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "74i4oK-JpcM4CgAqhz_x_", + "type": "rectangle", + "x": 382.5, + "y": 104, + "width": 592.5, + "height": 555, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 50827893, + "version": 212, + "versionNonce": 692313525, + "isDeleted": false, + "boundElements": null, +
"updated": 1758822941942, + "link": null, + "locked": false + }, + { + "id": "s8I1JoKulE3Vnti9a374p", + "type": "text", + "x": 1113, + "y": 128, + "width": 440, + "height": 250, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 2091174261, + "version": 442, + "versionNonce": 1108352309, + "isDeleted": false, + "boundElements": null, + "updated": 1758822765308, + "link": null, + "locked": false, + "text": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "BY_Why7XDNftdMzPcwjVZ", + "type": "rectangle", + "x": 1086.5, + "y": 104, + "width": 504.5, + "height": 260.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 153939611, + "version": 153, + "versionNonce": 1903356469, + "isDeleted": false, + "boundElements": [ + { + "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow" + } + ], + "updated": 1758822805382, + "link": null, + "locked": false + }, + { + "id": "JCPDhuTKRx4MN950Q3jL-", + "type": "text", + "x": 1116.411067193676, + "y": 535.1519268774704, + "width": 427.72826086956525, + "height": 99.70355731225297, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 1326854235, + "version": 345, + "versionNonce": 592556603, + "isDeleted": false, + "boundElements": null, + "updated": 1758822845014, + "link": null, + "locked": false, + "text": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", + "fontSize": 19.940711462450594, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "l-O0rMS3SruV22_MPX9Jz", + "type": "rectangle", + "x": 1086.5, + "y": 509.22900197628456, + "width": 504.49999999999994, + "height": 154.04199604743084, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1490898171, + "version": 186, + "versionNonce": 1953870555, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758822845014, + "link": null, + "locked": false + }, + { 
+ "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow", + "x": 773.5, + "y": 167, + "width": 298.5, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": { + "type": 2 + }, + "seed": 1681364149, + "version": 205, + "versionNonce": 1154753851, + "isDeleted": false, + "boundElements": [], + "updated": 1758823291274, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 144.5, + -1.5 + ], + [ + 177.5, + -30 + ], + [ + 298.5, + -29.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": { + "elementId": "BY_Why7XDNftdMzPcwjVZ", + "focus": 0.7285094931977862, + "gap": 14.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow", + "x": 941, + "y": 440.7646462573778, + "width": 132.9833600541258, + "height": 105.33206183359624, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": { + "type": 2 + }, + "seed": 1871768059, + "version": 402, + "versionNonce": 462603541, + "isDeleted": false, + "boundElements": [], + "updated": 1758823931675, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 53, + 8.23535374262218 + ], + [ + 63, + 97.73535374262218 + ], + [ + 132.9833600541258, + 105.33206183359624 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "EcT-dGsjmfW571ov8Gg4F", + "focus": -0.01598303536344995, + "gap": 9.500000000000114 + }, + "endBinding": { + "elementId": "l-O0rMS3SruV22_MPX9Jz", + "focus": 0.10931526948750278, + "gap": 13.22003639101672 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file diff --git a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json new file mode 100644 index 0000000..0edf3cf --- /dev/null +++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json @@ -0,0 +1,897 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "3zbCui3XtIGozHXTVAGRp", + "type": "rectangle", + "x": 316.5, + "y": 123, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1698427950, + "version": 35, + "versionNonce": 601575602, + "isDeleted": false, + "boundElements": [ + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + } + ], + "updated": 1758818588814, + "link": null, + "locked": false + }, + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text", + "x": 480.98004150390625, + "y": 183.25, + "width": 107.5399169921875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": 
"solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 910769774, + "version": 31, + "versionNonce": 1120989938, + "isDeleted": false, + "boundElements": null, + "updated": 1758818416720, + "link": null, + "locked": false, + "text": "dataset.db", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3zbCui3XtIGozHXTVAGRp", + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "87-MeaiZGT1wln0nggYPZ", + "type": "rectangle", + "x": 339.5, + "y": 309.5, + "width": 392, + "height": 156, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 655550318, + "version": 77, + "versionNonce": 1103939826, + "isDeleted": false, + "boundElements": null, + "updated": 1758818339000, + "link": null, + "locked": false + }, + { + "id": "EjUxEhZqEBzwvlw0VE9eJ", + "type": "rectangle", + "x": 355.5, + "y": 327, + "width": 162, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": { + "type": 3 + }, + "seed": 1739846638, + "version": 64, + "versionNonce": 1594290034, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "ogRkV0neHrhEKTE6zlggl" + } + ], + "updated": 1758818391415, + "link": null, + "locked": false + }, + { + "id": "ogRkV0neHrhEKTE6zlggl", + "type": "text", + "x": 378.7100524902344, + "y": 377.25, + "width": 115.57989501953125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 2037675630, + "version": 12, + "versionNonce": 1286472046, + "isDeleted": false, + "boundElements": null, + "updated": 1758818399222, + "link": null, + "locked": false, + "text": "RDF_String", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "EjUxEhZqEBzwvlw0VE9eJ", + "originalText": "RDF_String", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hoIRMNiMJZl4YDo-hovWy", + "type": "rectangle", + "x": 542.5, + "y": 327, + "width": 173, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 1189796530, + "version": 99, + "versionNonce": 1071057006, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "rsapATFAT5YSBCXzLupgZ" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "rsapATFAT5YSBCXzLupgZ", + "type": "text", + "x": 585.6800384521484, + "y": 377.25, + "width": 86.63992309570312, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + 
"backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 829619694, + "version": 12, + "versionNonce": 713902318, + "isDeleted": false, + "boundElements": null, + "updated": 1758818405150, + "link": null, + "locked": false, + "text": "Abstract", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "hoIRMNiMJZl4YDo-hovWy", + "originalText": "Abstract", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "jSx8ApfhtRs_nk37VvDMb", + "type": "rectangle", + "x": 316.5, + "y": 511, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 492582894, + "version": 132, + "versionNonce": 893797614, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "6E23g-rgowNqHsBxX-LuM" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "6E23g-rgowNqHsBxX-LuM", + "type": "text", + "x": 499.9100341796875, + "y": 571.25, + "width": 69.679931640625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 267696178, + "version": 132, + "versionNonce": 1668243186, + "isDeleted": false, + "boundElements": null, + "updated": 1758818543211, + "link": null, + "locked": false, + "text": "Pandas", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "jSx8ApfhtRs_nk37VvDMb", + "originalText": "Pandas", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "ohj18N4AOTDz5lJNcV9gi", + "type": "rectangle", + "x": 261, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1446207150, + "version": 279, + "versionNonce": 317375026, + "isDeleted": false, + "boundElements": [ + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text", + "x": 297.0800323486328, + "y": 796.5, + "width": 84.83993530273438, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a9", + "roundness": null, + "seed": 435116270, + "version": 199, + "versionNonce": 1282911218, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": 
null, + "locked": false, + "text": "train.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ohj18N4AOTDz5lJNcV9gi", + "originalText": "train.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "A4Y54Y26fe257U_QU9lxX", + "type": "rectangle", + "x": 464, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": { + "type": 3 + }, + "seed": 186148850, + "version": 232, + "versionNonce": 997119858, + "isDeleted": false, + "boundElements": [ + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text", + "x": 476.3500442504883, + "y": 796.5, + "width": 132.29991149902344, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": null, + "seed": 1131059634, + "version": 171, + "versionNonce": 239540530, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "validation.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "A4Y54Y26fe257U_QU9lxX", + "originalText": "validation.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "mPaYpJ9Xn7tlJPmKPqJKJ", + "type": "rectangle", + "x": 674.5, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": { + "type": 3 + }, + "seed": 1049323314, + "version": 235, + "versionNonce": 330560690, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "kg9nm2rpud6cax5aNPSnu" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "kg9nm2rpud6cax5aNPSnu", + "type": "text", + "x": 711.4300231933594, + "y": 796.5, + "width": 83.13995361328125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": null, + "seed": 522572142, + "version": 193, + "versionNonce": 1920372338, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "test.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "originalText": "test.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 195.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + 
"roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": null, + "seed": 873266098, + "version": 71, + "versionNonce": 541154738, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + -195.25, + 49.5 + ], + [ + -195.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "ohj18N4AOTDz5lJNcV9gi", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 218.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 1210817582, + "version": 77, + "versionNonce": 1483392370, + "isDeleted": false, + "boundElements": null, + "updated": 1758818580594, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + 218.25, + 49.5 + ], + [ + 218.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 0.5719232650604908, + "height": 99.07394122590165, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aK", + "roundness": null, + "seed": 1205316658, + "version": 96, + "versionNonce": 1748050674, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -0.5719232650604908, + 99.07394122590165 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "A4Y54Y26fe257U_QU9lxX", + "fixedPoint": [ + 0.44635717665566554, + -0.056621365219521276 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow", + "x": 539, + "y": 271.5, + "width": 0, + "height": 33.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": 
null, + "index": "aL", + "roundness": null, + "seed": 763990258, + "version": 17, + "versionNonce": 1028811378, + "isDeleted": false, + "boundElements": null, + "updated": 1758818588814, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 33.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "3zbCui3XtIGozHXTVAGRp", + "focus": -0.019473081328751418, + "gap": 3 + }, + "endBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": -1.0404624277456647, + "gap": 30.7545797799829 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow", + "x": 536.5, + "y": 468.5, + "width": 0, + "height": 39, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1489771054, + "version": 33, + "versionNonce": 1828178606, + "isDeleted": false, + "boundElements": null, + "updated": 1758818593647, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 39 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": 1.0693641618497107, + "gap": 27.157190169432425 + }, + "endBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "focus": 0.008018327605956525, + "gap": 3.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From 650b37c586fe07d9bb83d4471a727c12cd717dfb Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 26 Sep 2025 11:24:34 +0200 Subject: [PATCH 06/75] Added vscode setting to execute jupyter notebook from root dir --- .vscode/settings.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..17ae78b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "jupyter.notebookFileRoot": "${workspaceFolder}" +} \ No newline at end of file From 9972ab8a511e785bbd14148023808dc6a329e09f Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:48:23 +0200 Subject: [PATCH 07/75] Added imports --- Project_Model/Libs/BPE/Classes/__init__.py | 5 +++++ Project_Model/Libs/BPE/Enums/__init__.py | 0 Project_Model/Libs/BPE/Errors/__init__.py | 5 +++++ Project_Model/Libs/BPE/__init__.py | 3 +++ Project_Model/Libs/__init__.py | 1 + 5 files changed, 14 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/__init__.py create mode 100644 Project_Model/Libs/BPE/Enums/__init__.py create mode 100644 Project_Model/Libs/BPE/Errors/__init__.py create mode 100644 Project_Model/Libs/BPE/__init__.py create mode 100644 Project_Model/Libs/__init__.py diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py new file mode 100644 index 0000000..e8e65e5 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -0,0 +1,5 @@ +from .NanoSocratesChunker import NanoSocratesChunker + +__all__ = [ + "NanoSocratesChunker" +] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Enums/__init__.py b/Project_Model/Libs/BPE/Enums/__init__.py new file mode 100644 index
0000000..e69de29 diff --git a/Project_Model/Libs/BPE/Errors/__init__.py b/Project_Model/Libs/BPE/Errors/__init__.py new file mode 100644 index 0000000..0aab0ad --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/__init__.py @@ -0,0 +1,5 @@ +from .DelimiterNotFoundException import DelimiterNotFoundException + +__all__ = [ + "DelimiterNotFoundException" +] diff --git a/Project_Model/Libs/BPE/__init__.py b/Project_Model/Libs/BPE/__init__.py new file mode 100644 index 0000000..2292a87 --- /dev/null +++ b/Project_Model/Libs/BPE/__init__.py @@ -0,0 +1,3 @@ +from .Classes import * +from .Enums import * +from .Errors import * diff --git a/Project_Model/Libs/__init__.py b/Project_Model/Libs/__init__.py new file mode 100644 index 0000000..39fcdff --- /dev/null +++ b/Project_Model/Libs/__init__.py @@ -0,0 +1 @@ +from . import BPE \ No newline at end of file From 3f48b5c4286be537d2d6a5fa80aabee6849faba6 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:48:44 +0200 Subject: [PATCH 08/75] Added text files to test a chunker --- Project_Model/Tests/chunker_files/edge-1.txt | 4 ++++ Project_Model/Tests/chunker_files/simple.txt | 2 ++ Project_Model/Tests/chunker_files/stress.txt | 3 +++ 3 files changed, 9 insertions(+) create mode 100644 Project_Model/Tests/chunker_files/edge-1.txt create mode 100644 Project_Model/Tests/chunker_files/simple.txt create mode 100644 Project_Model/Tests/chunker_files/stress.txt diff --git a/Project_Model/Tests/chunker_files/edge-1.txt b/Project_Model/Tests/chunker_files/edge-1.txt new file mode 100644 index 0000000..d93fc54 --- /dev/null +++ b/Project_Model/Tests/chunker_files/edge-1.txt @@ -0,0 +1,4 @@ +Lorem ipsum dolor sit amet, +consectetur adipiscing elit. +Aenean at dui hendrerit ante sollicitud +in scelerisque \ No newline at end of file diff --git a/Project_Model/Tests/chunker_files/simple.txt b/Project_Model/Tests/chunker_files/simple.txt new file mode 100644 index 0000000..fbc222a --- /dev/null +++ b/Project_Model/Tests/chunker_files/simple.txt @@ -0,0 +1,2 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. +Aenean at dui hendrerit ante sollicitudin scelerisque \ No newline at end of file diff --git a/Project_Model/Tests/chunker_files/stress.txt b/Project_Model/Tests/chunker_files/stress.txt new file mode 100644 index 0000000..b3cf4c7 --- /dev/null +++ b/Project_Model/Tests/chunker_files/stress.txt @@ -0,0 +1,3 @@ +Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
+Aenean at dui hendrerit an te sollicitudin scelerisque +dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf \ No newline at end of file From 5801a819e9e058e360e0ee983c688e87e0fa778e Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:49:06 +0200 Subject: [PATCH 09/75] Added vars to make it easier to work here --- .vscode/settings.json | 39 +++++++++++++++++++++++++++++++++++++++ README.md | 21 ++++++++++++++++++++- 2 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..1d34b01 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,39 @@ +{ + // For linux + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For OSX + "terminal.integrated.env.osx": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For Windows + "terminal.integrated.env.windows": { + "PYTHONPATH": "${workspaceFolder}" + } +} + +// { +// // Always treat the project root as the working dir for Jupyter +// "jupyter.notebookFileRoot": "${workspaceFolder}", +// +// // When you click "Run Python File in Terminal", DON'T cd into the file's folder +// "python.terminal.executeInFileDir": false, +// +// // Start new integrated terminals at the project root +// "terminal.integrated.cwd": "${workspaceFolder}", +// +// // Ensure Python can import from the project root no matter which file you run +// // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed. +// "terminal.integrated.env.windows": { +// "PYTHONPATH": "${workspaceFolder}" +// }, +// +// // Make pytest run from the root without needing a pytest.ini +// "python.testing.pytestEnabled": true, +// "python.testing.cwd": "${workspaceFolder}", +// "python.testing.pytestArgs": ["src/test"], +// +// // Help Pylance resolve imports like `from src...` without red squiggles +// "python.analysis.extraPaths": ["${workspaceFolder}"] +// } \ No newline at end of file diff --git a/README.md b/README.md index 1789589..1aec207 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,30 @@ Create and activate your Conda environment with: conda env create -f environment.yaml conda activate deep_learning - + Now install dependencies with pip: pip install -r requirements.txt +Add the following to .vscode/settings.json: + + ```json + { + // For linux + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For OSX + "terminal.integrated.env.osx": { + "PYTHONPATH": "${workspaceFolder}" + }, + // For Windows + "terminal.integrated.env.windows": { + "PYTHONPATH": "${workspaceFolder}" + } + } + ``` + ## Troubleshooting Sometimes when uploading a really large batch of data, git can stop the uploads due to the timeout.
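Patch 10 below reworks the BPE class diagram into a `NanoSocratesBPE(Encoder)` / `Vocabulary` / `NanoSocratesBPE_BatchMemory` split. As a reading aid, here is a minimal, hypothetical sketch of the interface those diagrams describe. Only the class and method names come from the UML; the merge loop is the textbook greedy byte-pair algorithm, the 512-token budget is an arbitrary placeholder, and the streaming arguments drawn in the diagram (`memory`, `last_sentence_chunk`, `last_batch`) are omitted, so this is an illustration rather than the project's implementation.

```python
# Illustrative sketch only: names follow the UML diagrams, the logic is
# plain byte-level BPE. Requires Python 3.9+ for the builtin generic hints.
from collections import Counter


class Vocabulary:
    """Bidirectional map between a merged byte pair and its token id."""

    def __init__(self) -> None:
        self.vocabulary: dict[tuple[int, int], int] = {}
        self.reverse_vocabulary: dict[int, tuple[int, int]] = {}
        self._next_id = 256  # ids 0-255 stay reserved for raw bytes

    def add_word(self, pair: tuple[int, int]) -> int:
        token = self.vocabulary.setdefault(pair, self._next_id)
        if token == self._next_id:  # the pair was new
            self.reverse_vocabulary[token] = pair
            self._next_id += 1
        return token


class NanoSocratesBPE:
    def __init__(self, max_vocabulary_size: int = 512) -> None:
        self.vocabulary = Vocabulary()
        self.max_vocabulary_size = max_vocabulary_size

    def fit(self, data: list[list[int]]) -> "NanoSocratesBPE":
        while self.get_vocabulary_size() < self.max_vocabulary_size:
            # Count every adjacent token pair across the corpus.
            pairs = Counter(
                (seq[i], seq[i + 1]) for seq in data for i in range(len(seq) - 1)
            )
            if not pairs:
                break
            best, count = pairs.most_common(1)[0]
            if count < 2:  # nothing repeats; plays the role of a merge threshold
                break
            token = self.vocabulary.add_word(best)
            data = [self._merge(seq, best, token) for seq in data]
        return self

    @staticmethod
    def _merge(seq: list[int], pair: tuple[int, int], token: int) -> list[int]:
        out: list[int] = []
        i = 0
        while i < len(seq):
            if i + 1 < len(seq) and (seq[i], seq[i + 1]) == pair:
                out.append(token)  # replace the pair with its merged token
                i += 2
            else:
                out.append(seq[i])
                i += 1
        return out

    def encode(self, word: bytes) -> list[int]:
        seq = list(word)
        # Replay the learned merges in creation order (dicts keep insertion order).
        for pair, token in self.vocabulary.vocabulary.items():
            seq = self._merge(seq, pair, token)
        return seq

    def decode(self, tokens: list[int]) -> bytes:
        out = bytearray()
        for t in tokens:
            if t < 256:
                out.append(t)
            else:  # recursively expand a merged token into its two halves
                left, right = self.vocabulary.reverse_vocabulary[t]
                out += self.decode([left, right])
        return bytes(out)

    def get_vocabulary_size(self) -> int:
        return 256 + len(self.vocabulary.vocabulary)
```

Under these assumptions the round trip holds by construction: `bpe = NanoSocratesBPE(); bpe.fit([list(b"low lower lowest")])` learns merges for the repeated `low` prefix, and `bpe.decode(bpe.encode(b"lower"))` returns `b"lower"`, because every merged token expands back into exactly the pair it was created from.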
From be8a87ce0165dba6ab79793967b7767eaa21629d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:49:29 +0200 Subject: [PATCH 10/75] Modified the architecture for BPE --- Projec-Model/UML/bpe.excalidraw.json | 362 -------------- Project_Model/UML/bpe.excalidraw.json | 658 ++++++++++++++++++++++++++ 2 files changed, 658 insertions(+), 362 deletions(-) delete mode 100644 Projec-Model/UML/bpe.excalidraw.json create mode 100644 Project_Model/UML/bpe.excalidraw.json diff --git a/Projec-Model/UML/bpe.excalidraw.json b/Projec-Model/UML/bpe.excalidraw.json deleted file mode 100644 index 1400c25..0000000 --- a/Projec-Model/UML/bpe.excalidraw.json +++ /dev/null @@ -1,362 +0,0 @@ -{ - "type": "excalidraw", - "version": 2, - "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", - "elements": [ - { - "id": "EcT-dGsjmfW571ov8Gg4F", - "type": "text", - "x": 425.5, - "y": 130, - "width": 506, - "height": 550, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "4rCC2-N1thmII8_dwNhe1" - ], - "frameId": null, - "index": "a3V", - "roundness": null, - "seed": 523521109, - "version": 758, - "versionNonce": 383976373, - "isDeleted": false, - "boundElements": [ - { - "id": "OA_NKjb3n3NLtUo_tKmPS", - "type": "arrow" - } - ], - "updated": 1758823931674, - "link": null, - "locked": false, - "text": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", - "fontSize": 20, - "fontFamily": 8, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "class NanoSocratesBPE:\n - vocabulary: Vocabulary\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int\n\n - reserve_capacity: float\n - token_length: int\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n \n", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "74i4oK-JpcM4CgAqhz_x_", - "type": "rectangle", - "x": 382.5, - "y": 104, - "width": 592.5, - "height": 555, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "4rCC2-N1thmII8_dwNhe1" - ], - "frameId": null, - "index": "a4", - "roundness": { - "type": 3 - }, - "seed": 50827893, - "version": 212, - "versionNonce": 692313525, - "isDeleted": false, - "boundElements": null, - "updated": 1758822941942, - "link": null, - "locked": false - }, - { - "id": "s8I1JoKulE3Vnti9a374p", - "type": "text", - "x": 1113, - "y": 128, - "width": 440, - "height": 250, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "M6w9efVFwOZHkJGgwkyEw" - ], - "frameId": null, - "index": "a5", - "roundness": null, - "seed": 2091174261, - "version": 442, 
- "versionNonce": 1108352309, - "isDeleted": false, - "boundElements": null, - "updated": 1758822765308, - "link": null, - "locked": false, - "text": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", - "fontSize": 20, - "fontFamily": 8, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "class Vocabulary:\n\n - vocabulary: dict\n - reverse_vocabulary: dict\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n\n", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "BY_Why7XDNftdMzPcwjVZ", - "type": "rectangle", - "x": 1086.5, - "y": 104, - "width": 504.5, - "height": 260.5, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "M6w9efVFwOZHkJGgwkyEw" - ], - "frameId": null, - "index": "a6", - "roundness": { - "type": 3 - }, - "seed": 153939611, - "version": 153, - "versionNonce": 1903356469, - "isDeleted": false, - "boundElements": [ - { - "id": "WcDks9DR8UqeZEaxAcRf9", - "type": "arrow" - } - ], - "updated": 1758822805382, - "link": null, - "locked": false - }, - { - "id": "JCPDhuTKRx4MN950Q3jL-", - "type": "text", - "x": 1116.411067193676, - "y": 535.1519268774704, - "width": 427.72826086956525, - "height": 99.70355731225297, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "DbtlKVF_9SjH2-9iMq9zy" - ], - "frameId": null, - "index": "a7", - "roundness": null, - "seed": 1326854235, - "version": 345, - "versionNonce": 592556603, - "isDeleted": false, - "boundElements": null, - "updated": 1758822845014, - "link": null, - "locked": false, - "text": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", - "fontSize": 19.940711462450594, - "fontFamily": 8, - "textAlign": "left", - "verticalAlign": "top", - "containerId": null, - "originalText": "class NanoSocrateBPE_BatchMemory:\n\n + max_word_length: int\n + frequency: dict<(int, int), int> ", - "autoResize": true, - "lineHeight": 1.25 - }, - { - "id": "l-O0rMS3SruV22_MPX9Jz", - "type": "rectangle", - "x": 1086.5, - "y": 509.22900197628456, - "width": 504.49999999999994, - "height": 154.04199604743084, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [ - "DbtlKVF_9SjH2-9iMq9zy" - ], - "frameId": null, - "index": "a8", - "roundness": { - "type": 3 - }, - "seed": 1490898171, - "version": 186, - "versionNonce": 1953870555, - "isDeleted": false, - "boundElements": [ - { - "id": "OA_NKjb3n3NLtUo_tKmPS", - "type": "arrow" - } - ], - "updated": 1758822845014, - "link": null, - "locked": false - }, - { - "id": "WcDks9DR8UqeZEaxAcRf9", - "type": "arrow", - "x": 773.5, - "y": 167, - "width": 298.5, - "height": 30, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aB", - "roundness": { - "type": 2 - }, - "seed": 1681364149, - "version": 205, - "versionNonce": 1154753851, - "isDeleted": false, - "boundElements": [], - 
"updated": 1758823291274, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 144.5, - -1.5 - ], - [ - 177.5, - -30 - ], - [ - 298.5, - -29.5 - ] - ], - "lastCommittedPoint": null, - "startBinding": null, - "endBinding": { - "elementId": "BY_Why7XDNftdMzPcwjVZ", - "focus": 0.7285094931977862, - "gap": 14.5 - }, - "startArrowhead": null, - "endArrowhead": "triangle", - "elbowed": false - }, - { - "id": "OA_NKjb3n3NLtUo_tKmPS", - "type": "arrow", - "x": 941, - "y": 440.7646462573778, - "width": 132.9833600541258, - "height": 105.33206183359624, - "angle": 0, - "strokeColor": "#1e1e1e", - "backgroundColor": "transparent", - "fillStyle": "solid", - "strokeWidth": 2, - "strokeStyle": "solid", - "roughness": 1, - "opacity": 100, - "groupIds": [], - "frameId": null, - "index": "aD", - "roundness": { - "type": 2 - }, - "seed": 1871768059, - "version": 402, - "versionNonce": 462603541, - "isDeleted": false, - "boundElements": [], - "updated": 1758823931675, - "link": null, - "locked": false, - "points": [ - [ - 0, - 0 - ], - [ - 53, - 8.23535374262218 - ], - [ - 63, - 97.73535374262218 - ], - [ - 132.9833600541258, - 105.33206183359624 - ] - ], - "lastCommittedPoint": null, - "startBinding": { - "elementId": "EcT-dGsjmfW571ov8Gg4F", - "focus": -0.01598303536344995, - "gap": 9.500000000000114 - }, - "endBinding": { - "elementId": "l-O0rMS3SruV22_MPX9Jz", - "focus": 0.10931526948750278, - "gap": 13.22003639101672 - }, - "startArrowhead": null, - "endArrowhead": "triangle", - "elbowed": false - } - ], - "appState": { - "gridSize": 20, - "gridStep": 5, - "gridModeEnabled": false, - "viewBackgroundColor": "#ffffff" - }, - "files": {} -} \ No newline at end of file diff --git a/Project_Model/UML/bpe.excalidraw.json b/Project_Model/UML/bpe.excalidraw.json new file mode 100644 index 0000000..d706222 --- /dev/null +++ b/Project_Model/UML/bpe.excalidraw.json @@ -0,0 +1,658 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "EcT-dGsjmfW571ov8Gg4F", + "type": "text", + "x": 425.5, + "y": 132, + "width": 506, + "height": 425, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 523521109, + "version": 883, + "versionNonce": 1590682729, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758881654155, + "link": null, + "locked": false, + "text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n", + "autoResize": true, + "lineHeight": 1.25 
+ }, + { + "id": "74i4oK-JpcM4CgAqhz_x_", + "type": "rectangle", + "x": 382.5, + "y": 104.5, + "width": 592.5, + "height": 421, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "4rCC2-N1thmII8_dwNhe1" + ], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 50827893, + "version": 319, + "versionNonce": 704459557, + "isDeleted": false, + "boundElements": [], + "updated": 1758878226277, + "link": null, + "locked": false + }, + { + "id": "s8I1JoKulE3Vnti9a374p", + "type": "text", + "x": 1113.5, + "y": 127, + "width": 517, + "height": 325, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 2091174261, + "version": 480, + "versionNonce": 1964948039, + "isDeleted": false, + "boundElements": [], + "updated": 1758881941367, + "link": null, + "locked": false, + "text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "BY_Why7XDNftdMzPcwjVZ", + "type": "rectangle", + "x": 1086.5, + "y": 105.5, + "width": 593.0000000000001, + "height": 325.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "M6w9efVFwOZHkJGgwkyEw" + ], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 153939611, + "version": 234, + "versionNonce": 2068149129, + "isDeleted": false, + "boundElements": [ + { + "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow" + } + ], + "updated": 1758881945661, + "link": null, + "locked": false + }, + { + "id": "JCPDhuTKRx4MN950Q3jL-", + "type": "text", + "x": 1116.411067193676, + "y": 477.3809288774704, + "width": 416.74578857421875, + "height": 99.70355731225297, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 1326854235, + "version": 479, + "versionNonce": 595084597, + "isDeleted": false, + "boundElements": [], + "updated": 1758902358518, + "link": null, + "locked": false, + "text": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int", + "fontSize": 19.940711462450594, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "l-O0rMS3SruV22_MPX9Jz", + "type": "rectangle", + "x": 1086.5, + "y": 
451.4580039762846, + "width": 593, + "height": 208.0419960474308, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [ + "DbtlKVF_9SjH2-9iMq9zy" + ], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1490898171, + "version": 305, + "versionNonce": 587306139, + "isDeleted": false, + "boundElements": [ + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow" + } + ], + "updated": 1758902358518, + "link": null, + "locked": false + }, + { + "id": "WcDks9DR8UqeZEaxAcRf9", + "type": "arrow", + "x": 773.5, + "y": 167, + "width": 297.17936724485867, + "height": 30, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": { + "type": 2 + }, + "seed": 1681364149, + "version": 303, + "versionNonce": 1262492265, + "isDeleted": false, + "boundElements": [], + "updated": 1758881945661, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 144.5, + -1.5 + ], + [ + 177.5, + -30 + ], + [ + 297.17936724485867, + -29.020420978562214 + ] + ], + "lastCommittedPoint": null, + "startBinding": null, + "endBinding": { + "elementId": "BY_Why7XDNftdMzPcwjVZ", + "focus": 0.77319587628866, + "gap": 18.25 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "OA_NKjb3n3NLtUo_tKmPS", + "type": "arrow", + "x": 946.0000000000002, + "y": 274.95951048200493, + "width": 130.016707976343, + "height": 209.36808480159067, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": { + "type": 2 + }, + "seed": 1871768059, + "version": 1039, + "versionNonce": 213535035, + "isDeleted": false, + "boundElements": [], + "updated": 1758902358519, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 54.99999999999977, + 12.54048951799507 + ], + [ + 69.49999999999977, + 188.54048951799507 + ], + [ + 130.016707976343, + 209.36808480159067 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "EcT-dGsjmfW571ov8Gg4F", + "focus": -0.48312180762055096, + "gap": 14.500000000000114 + }, + "endBinding": { + "elementId": "l-O0rMS3SruV22_MPX9Jz", + "focus": -0.16742658425737647, + "gap": 11.194126334166185 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "snZ__VDsIlri6NTp8M2Gf", + "type": "text", + "x": -245.25, + "y": 103, + "width": 330, + "height": 125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 1758461093, + "version": 265, + "versionNonce": 1069481861, + "isDeleted": false, + "boundElements": [], + "updated": 1758879566916, + "link": null, + "locked": false, + "text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class 
NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "PnbmqwEWYkP8oXElKFyTp", + "type": "text", + "x": -237.75, + "y": 544, + "width": 561, + "height": 125, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 501304683, + "version": 241, + "versionNonce": 1306401003, + "isDeleted": false, + "boundElements": [], + "updated": 1758878748210, + "link": null, + "locked": false, + "text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "xR_11IzgXX5O-m6WoRfCL", + "type": "text", + "x": -233.25, + "y": 366.5, + "width": 165, + "height": 75, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aI", + "roundness": null, + "seed": 2025585125, + "version": 395, + "versionNonce": 1799178985, + "isDeleted": false, + "boundElements": [], + "updated": 1758883940168, + "link": null, + "locked": false, + "text": "enum TokenType:\n + SPECIAL\n + BPE", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "enum TokenType:\n + SPECIAL\n + BPE", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "lgKSd9qCb94-5e8rd9I3r", + "type": "text", + "x": -219.75, + "y": 764.5, + "width": 462, + "height": 275, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aJ", + "roundness": null, + "seed": 1963214021, + "version": 422, + "versionNonce": 903841927, + "isDeleted": false, + "boundElements": [], + "updated": 1758879973600, + "link": null, + "locked": false, + "text": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "DwFJoUpVT2YAEe9qPYAXa", + "type": "text", + "x": 496.75, + "y": 666, + "width": 440, + "height": 100, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": null, + "seed": 1317596203, + 
"version": 152, + "versionNonce": 1840679687, + "isDeleted": false, + "boundElements": [], + "updated": 1758880107704, + "link": null, + "locked": false, + "text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict\n + reverse_vocabulary: dict", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict\n + reverse_vocabulary: dict", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "78gC46xatoO1_cRtaN8EC", + "type": "text", + "x": 396.375, + "y": -107.75, + "width": 346.3997802734375, + "height": 100, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1187595241, + "version": 128, + "versionNonce": 1487192455, + "isDeleted": false, + "boundElements": [], + "updated": 1758879825591, + "link": null, + "locked": false, + "text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "3j50Ds74uU7oXoJ9kMOYJ", + "type": "text", + "x": 457.375, + "y": 903.75, + "width": 949.7594604492188, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aN", + "roundness": null, + "seed": 1994335529, + "version": 198, + "versionNonce": 1492696519, + "isDeleted": false, + "boundElements": [], + "updated": 1758882694747, + "link": null, + "locked": false, + "text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "yg-TvQvz4MwJZ0y8K7Ix0", + "type": "text", + "x": 435.375, + "y": 1026.25, + "width": 352, + "height": 250, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aP", + "roundness": null, + "seed": 1877486407, + "version": 344, + "versionNonce": 25830153, + "isDeleted": false, + "boundElements": [], + "updated": 1758883468886, + "link": null, + "locked": false, + "text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str", + "autoResize": true, + "lineHeight": 1.25 + } + ], + "appState": { + 
"gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From 9552d61f8d3a54bdf97c640d33075e595a43011b Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:49:56 +0200 Subject: [PATCH 11/75] Added Excetption for when we don't find a delimiter --- Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py diff --git a/Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py b/Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py new file mode 100644 index 0000000..2823d5d --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py @@ -0,0 +1,4 @@ +class DelimiterNotFoundException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file From 8db35732f9e9c3a10fb0301cc8991bdca5e18399 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:50:23 +0200 Subject: [PATCH 12/75] Added Chunker to restrict our domains --- .../Libs/BPE/Classes/NanoSocratesChunker.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py new file mode 100644 index 0000000..6821151 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py @@ -0,0 +1,66 @@ +from pathlib import Path +import re +from ..Errors import DelimiterNotFoundException + + +class NanoSocratesChunker: + + def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None: + self.__max_size: int = max_size + self.__special_token_regex: re.Pattern = special_token_regex + self.__residual: str = "" + + def chunk(self, file_path: Path): + # read_file + FILE = open(file_path, "r", encoding="utf-8") + exit = False + + while not exit: + REMAINING_SIZE = self.__max_size - len(self.__residual) + READ_SIZE = min(self.__max_size, REMAINING_SIZE) + FILE_CHUNK = FILE.read(READ_SIZE) + + if len(FILE_CHUNK) == 0: + exit = True + continue + + CHUNK = self.__append_residuals(FILE_CHUNK) + + boundaries = self.__identify_boudaries(CHUNK) + + if boundaries is None: + + # boundaries not found in 2 chunks, + if len(CHUNK) > self.__max_size - 1: + raise DelimiterNotFoundException() + + if exit: + yield CHUNK + + self.__set_residual(0, CHUNK) + continue + + start, end = boundaries + self.__set_residual(end, CHUNK) + yield CHUNK[start:end] + + def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None: + + end = 0 + + for match in self.__special_token_regex.finditer(corpus): + # print(match) + end = match.end() + + if end == 0: + return None + + return (0, end) + + def __append_residuals(self, corpus: str) -> str: + RESIDUAL = self.__residual + self.__residual = "" + return RESIDUAL + corpus + + def __set_residual(self, index: int, corpus: str): + self.__residual = corpus[index:] From 3e8b5c55796963ba9e8db3dde9dbbf241c1819b2 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 26 Sep 2025 18:50:32 +0200 Subject: [PATCH 13/75] Added test for chunker --- Project_Model/Tests/chunker_test.py | 89 +++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 
Project_Model/Tests/chunker_test.py diff --git a/Project_Model/Tests/chunker_test.py b/Project_Model/Tests/chunker_test.py new file mode 100644 index 0000000..7bac3bc --- /dev/null +++ b/Project_Model/Tests/chunker_test.py @@ -0,0 +1,89 @@ +from pathlib import Path +import re +import pytest +import Project_Model.Libs.BPE as BPE + +PATTERN = "<(TOKEN|SOT|SEP|EOT)>" +SYMBOL_REGEX = re.compile(PATTERN) + +class TestChunker: + + def test_correct_simple(self): + + FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt") + LEAST_EXPECTED_CHUNKS = 3 + ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8") + + CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX) + + CHUNKS = [] + + for chunk in CHUNKER.chunk(FILE_PATH): + print(chunk) + CHUNKS.append( + chunk + ) + + NANO_TEXT = "".join(CHUNKS) + + assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1) + assert NANO_TEXT == ORIG_TEXT + + + + def test_correct_edge_1(self): + + FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt") + LEAST_EXPECTED_CHUNKS = 3 + ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8") + + CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX) + + CHUNKS = [] + + for chunk in CHUNKER.chunk(FILE_PATH): + print(chunk) + CHUNKS.append( + chunk + ) + + NANO_TEXT = "".join(CHUNKS) + + assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1) + assert NANO_TEXT == ORIG_TEXT + + + + def test_throwing(self): + + FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt") + + CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX) + + with pytest.raises(BPE.DelimiterNotFoundException): + for chunk in CHUNKER.chunk(FILE_PATH): + print(chunk) + +if __name__ == "__main__": + + FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt") + LEAST_EXPECTED_CHUNKS = 3 + ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8") + + CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX) + + CHUNKS = [] + + try: + for chunk in CHUNKER.chunk(FILE_PATH): + print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n") + CHUNKS.append( + chunk + ) + except: + exit(0) + + NANO_TEXT = "".join(CHUNKS) + + assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1) + assert NANO_TEXT == ORIG_TEXT From ed0255e99babd60ee3277190f88dc99258f5e34f Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:01:35 +0200 Subject: [PATCH 14/75] Updated imports --- Project_Model/Libs/BPE/Classes/__init__.py | 6 +++++- Project_Model/Libs/BPE/Enums/__init__.py | 1 + Project_Model/Libs/BPE/Errors/__init__.py | 4 +++- Project_Model/Libs/BPE/__init__.py | 4 ++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index e8e65e5..d8a7364 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -1,5 +1,9 @@ from .NanoSocratesChunker import NanoSocratesChunker +from .NanoSocratesSplitter import NanoSocratesSplitter +from .NanoSocratesBPE import NanoSocratesBPE __all__ = [ - "NanoSocratesChunker" + "NanoSocratesChunker", + "NanoSocratesSplitter", + "NanoSocratesBPE" ] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Enums/__init__.py b/Project_Model/Libs/BPE/Enums/__init__.py index e69de29..8ef388a 100644 --- a/Project_Model/Libs/BPE/Enums/__init__.py +++ b/Project_Model/Libs/BPE/Enums/__init__.py @@ -0,0 +1 @@ +from .TokenType import TokenType diff --git a/Project_Model/Libs/BPE/Errors/__init__.py b/Project_Model/Libs/BPE/Errors/__init__.py index 0aab0ad..587873f 100644 --- 
a/Project_Model/Libs/BPE/Errors/__init__.py
+++ b/Project_Model/Libs/BPE/Errors/__init__.py
@@ -1,5 +1,7 @@
 from .DelimiterNotFoundException import DelimiterNotFoundException
+from .OutOfDictionaryException import OutOfDictionaryException
 
 __all__ = [
-    "DelimiterNotFoundException"
+    "DelimiterNotFoundException",
+    "OutOfDictionaryException"
 ]

diff --git a/Project_Model/Libs/BPE/__init__.py b/Project_Model/Libs/BPE/__init__.py
index 2292a87..6f7d1f2 100644
--- a/Project_Model/Libs/BPE/__init__.py
+++ b/Project_Model/Libs/BPE/__init__.py
@@ -1,3 +1,7 @@
 from .Classes import *
 from .Enums import *
 from .Errors import *
+
+from . import Classes
+from . import Enums
+from . import Errors

From b071145f6eff631bdc651182d7b93cf10f88d784 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sun, 28 Sep 2025 18:02:06 +0200
Subject: [PATCH 15/75] Added size-bound comment to Chunker

---
 Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
index 6821151..a81587c 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
@@ -10,6 +10,10 @@ class NanoSocratesChunker:
         self.__special_token_regex: re.Pattern = special_token_regex
         self.__residual: str = ""
 
+    # max theoretical size of chars
+    # between special tokens:
+    # - min: size - len(longest_token)
+    # - MAX: size - len(shortest_token)
     def chunk(self, file_path: Path):
         # read_file
         FILE = open(file_path, "r", encoding="utf-8")

From d179e0197109479bb06fea16788e1210aac9b27a Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sun, 28 Sep 2025 18:03:16 +0200
Subject: [PATCH 16/75] Added Splitter to divide tokens from text

---
 .../Libs/BPE/Classes/NanoSocratesSplitter.py  |  40 ++++++
 Project_Model/Tests/splitter_test.py          | 131 ++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
 create mode 100644 Project_Model/Tests/splitter_test.py

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
new file mode 100644
index 0000000..ccca300
--- /dev/null
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -0,0 +1,40 @@
+import re
+from typing import Generator
+from ..Enums import TokenType
+
+
+class NanoSocratesSplitter:
+
+    def __init__(
+        self,
+        special_token_regex: re.Pattern
+    ) -> None:
+        self.__special_token_regex = special_token_regex
+
+    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
+
+        bpe_start = 0
+        bpe_end = len(corpus)
+
+        for bound_start, bound_end in self.__find_boundaries(corpus):
+
+            bpe_end = bound_start
+            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
+
+            if BPE_TOKEN_TEXT != "":
+                yield (BPE_TOKEN_TEXT, TokenType.BPE)
+
+            bpe_start = bound_end
+            SPECIAL_TOKEN_TEXT = corpus[bound_start:bound_end]
+
+            if SPECIAL_TOKEN_TEXT != "":
+                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
+
+    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+
+        for match in self.__special_token_regex.finditer(corpus):
+            start = match.start()
+            end = match.end()
+
+            yield (start, end)
+
diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py
new file mode 100644
index 0000000..eda95b6
--- /dev/null
+++ b/Project_Model/Tests/splitter_test.py
@@ -0,0 +1,131 @@
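
A usage sketch of the splitter just added, illustrative only and not part of the patch series; it assumes the TokenType enum that a later patch in this series introduces.

import re
from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter

SYMBOL_REGEX = re.compile("<(TOKEN|SOT|SEP|EOT)>")
splitter = NanoSocratesSplitter(SYMBOL_REGEX)

for piece, kind in splitter.split_text("<SOT>Lorem ipsum<SEP>dolor<EOT>"):
    print(kind.name, repr(piece))

# expected interleaving:
#   SPECIAL '<SOT>', BPE 'Lorem ipsum', SPECIAL '<SEP>', BPE 'dolor', SPECIAL '<EOT>'
# note that trailing text after the last special token is not emitted
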
+from Project_Model.Libs.BPE.Enums import TokenType +import Project_Model.Libs.BPE as BPE + +import re + + +PATTERN = "<(TOKEN|SOT|SEP|EOT)>" +SYMBOL_REGEX = re.compile(PATTERN) + + +class TestSplitter: + + def test_split(self): + + TEXT = "Lorem " + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("", TokenType.SPECIAL), + ("Lorem ", TokenType.BPE), + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_trailing_text(self): + + TEXT = "ipsum dolor" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("ipsu", TokenType.BPE), + ("", TokenType.SPECIAL), + ("m d", TokenType.BPE), + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_multi_token(self): + + TEXT = "ipsum ddsgolor" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("ipsu", TokenType.BPE), + ("", TokenType.SPECIAL), + ("m d", TokenType.BPE), + ("", TokenType.SPECIAL), + ("", TokenType.SPECIAL), + ("", TokenType.SPECIAL), + ("dsg", TokenType.BPE), + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_malformed_1(self): + + TEXT = "lerisque" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [ + ("", TokenType.SPECIAL), + ] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_malformed_2(self): + + TEXT = "lerisque" + + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX) + + EXPECTED_CHUNKS = [] + + CHUNKS = list(SPLITTER.split_text(TEXT)) + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert 
RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE From b46df4f91aca930f83a93934c6bf2e6e8a684d2a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:03:47 +0200 Subject: [PATCH 17/75] Added Special Encoder --- Project_Model/Libs/BPE/Classes/Encoder.py | 4 ++ .../Libs/BPE/Classes/NanoSocratesSpecial.py | 54 +++++++++++++++++++ .../BPE/Errors/OutOfDictionaryException.py | 4 ++ 3 files changed, 62 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/Encoder.py create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py create mode 100644 Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py diff --git a/Project_Model/Libs/BPE/Classes/Encoder.py b/Project_Model/Libs/BPE/Classes/Encoder.py new file mode 100644 index 0000000..800772b --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/Encoder.py @@ -0,0 +1,4 @@ +from abc import ABC + +class Encoder(ABC): + pass \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py new file mode 100644 index 0000000..e551d6c --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py @@ -0,0 +1,54 @@ +from .Encoder import Encoder +from ..Errors import OutOfDictionaryException + +class NanoSocratesSpecial(Encoder): + + def __init__( + self, + initial_vocabulary: list[str] | None = None + ) -> None: + super().__init__() + + self.__vocabulary: dict[str, int] = {} + self.__reverse_vocabulary: dict[int, str] = {} + self.__current_index = 0 + + if initial_vocabulary is None: + return + + for word in initial_vocabulary: + + CURRENT_INDEX = self.__current_index + self.__vocabulary[word] = CURRENT_INDEX + self.__reverse_vocabulary[CURRENT_INDEX] = word + + self.__current_index += 1 + + @property + def vocabulary_size(self): + return self.__current_index + + def add_special_word(self, word:str): + CURRENT_INDEX = self.__current_index + self.__vocabulary[word] = CURRENT_INDEX + self.__reverse_vocabulary[CURRENT_INDEX] = word + self.__current_index += 1 + + def encode(self, word: str) -> list[int]: + ID = self.__vocabulary.get(word) + + if ID is None: + raise OutOfDictionaryException() + + return [ID] + + def decode(self, token_id: int) -> str: + + ID = token_id + WORD = self.__reverse_vocabulary.get(ID) + + if WORD is None: + raise OutOfDictionaryException() + + return WORD + diff --git a/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py b/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py new file mode 100644 index 0000000..2c4c440 --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py @@ -0,0 +1,4 @@ +class OutOfDictionaryException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file From e43394140577187ebd5818c975ba3ebfaff52b8d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:04:44 +0200 Subject: [PATCH 18/75] Added BPE TODO: - complete the fit method --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 106 ++++++++++++++++++ Project_Model/Libs/BPE/Enums/TokenType.py | 6 + Project_Model/Tests/bpe_test.py | 52 +++++++++ 3 files changed, 164 insertions(+) create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py create mode 100644 Project_Model/Libs/BPE/Enums/TokenType.py create mode 100644 Project_Model/Tests/bpe_test.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py 
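
A usage sketch of the special-token encoder from the patch above, illustrative only and not part of the patch series; the token strings are hypothetical placeholders.

from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial

special = NanoSocratesSpecial(["<SOT>", "<EOT>"])  # ids 0 and 1, assigned in list order
special.add_special_word("<SEP>")                  # id 2

print(special.encode("<SEP>"))     # [2]
print(special.decode(0))           # '<SOT>'
print(special.vocabulary_size)     # 3

# encode() and decode() raise OutOfDictionaryException for unregistered words and ids
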
b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
new file mode 100644
index 0000000..844e860
--- /dev/null
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -0,0 +1,106 @@
+from .Encoder import Encoder
+from ..Errors import OutOfDictionaryException
+
+
+class NanoSocratesBatchMemoryBPE:
+
+    def __init__(self) -> None:
+        pass
+
+
+class NanoSocratesBPE(Encoder):
+
+    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
+        super().__init__()
+
+        self.__vocabulary: dict[tuple[int, int], int] = {}
+        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}
+
+        if vocabulary is None:
+            return
+
+        for key, value in vocabulary.items():
+            if value < 256:
+                raise OutOfDictionaryException()
+            self.__vocabulary[key] = value
+            self.__reverse_vocabulary[value] = key
+
+    # TODO: implement fit
+    def fit(self):
+        pass
+
+    def encode(self, piece: str) -> list[int]:
+
+        current_piece = list(map(ord, piece))
+        new_piece = self.__round_encode(current_piece)
+
+        while len(current_piece) != len(new_piece):
+            current_piece = new_piece
+            new_piece = self.__round_encode(current_piece)
+
+        return current_piece
+
+    def __round_encode(self, piece: list[int]):
+
+        if len(piece) == 1:
+            return piece
+
+        LAST_INDEX = len(piece) - 1
+        NEW_PIECE = []
+
+        index = 0
+        while index <= LAST_INDEX:
+
+            # flush a trailing element that has no right-hand partner
+            if index == LAST_INDEX:
+                NEW_PIECE.append(piece[index])
+                break
+
+            CANDIDATE_WORD = (piece[index], piece[index + 1])
+            CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)
+
+            if CANDIDATE_TOKEN is None:
+                NEW_PIECE.append(piece[index])
+                index += 1
+                continue
+
+            NEW_PIECE.append(CANDIDATE_TOKEN)
+            index += 2
+
+        return NEW_PIECE
+
+    # iterative decode: a stack avoids recursion on deeply nested merges
+    def decode(self, token_id: int) -> str:
+
+        token_stack: list[int] = [token_id]
+        DECODED_STRING_ARR: list[str] = []
+
+        while len(token_stack) > 0:
+            TOKEN_ID = token_stack.pop()
+
+            if TOKEN_ID < 256:
+                DECODED_CHAR = chr(TOKEN_ID)
+                DECODED_STRING_ARR.append(
+                    DECODED_CHAR
+                )
+                continue
+
+            left_token, right_token = self.__token_decode(TOKEN_ID)
+
+            token_stack.append(
+                right_token
+            )
+            token_stack.append(
+                left_token
+            )
+
+        return "".join(DECODED_STRING_ARR)
+
+    def __token_decode(self, token_id: int) -> tuple[int, int]:
+
+        CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)
+
+        if CANDIDATE_DECODED is None:
+            raise OutOfDictionaryException()
+
+        return CANDIDATE_DECODED

diff --git a/Project_Model/Libs/BPE/Enums/TokenType.py b/Project_Model/Libs/BPE/Enums/TokenType.py
new file mode 100644
index 0000000..7a27c34
--- /dev/null
+++ b/Project_Model/Libs/BPE/Enums/TokenType.py
@@ -0,0 +1,6 @@
+from enum import Enum, auto
+
+class TokenType(Enum):
+
+    SPECIAL = auto()
+    BPE = auto()
\ No newline at end of file

diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py
new file mode 100644
index 0000000..7332f65
--- /dev/null
+++ b/Project_Model/Tests/bpe_test.py
@@ -0,0 +1,52 @@
+from Project_Model.Libs.BPE.Enums import TokenType
+import Project_Model.Libs.BPE as BPE
+
+import re
+
+
+class TestBPE:
+
+    def test_bpe_encoding_simple(self):
+
+        TEXT = "abababab"
+
+        # ab = 256
+        # 256, 256 = 257
+
# 257, 257 = 258 + + VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} + EXPECTED = "abababab" + + BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY) + + DECODED = BPE_ENCODER.decode(INPUT) + + assert len(DECODED) == len(EXPECTED) + + for encoded, expected in zip(DECODED, EXPECTED): + assert encoded == expected + +# Useful to debug weird cases +if __name__ == "__main__": + TestBPE().test_bpe_decoding_simple() From 564b0d712ec571d2846ab7bba5b2c88c972236e9 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Sun, 28 Sep 2025 18:05:03 +0200 Subject: [PATCH 19/75] Modified UML diagram --- Project_Model/UML/bpe.excalidraw.json | 57 ++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/Project_Model/UML/bpe.excalidraw.json b/Project_Model/UML/bpe.excalidraw.json index d706222..1a53327 100644 --- a/Project_Model/UML/bpe.excalidraw.json +++ b/Project_Model/UML/bpe.excalidraw.json @@ -482,20 +482,20 @@ "index": "aJ", "roundness": null, "seed": 1963214021, - "version": 422, - "versionNonce": 903841927, + "version": 464, + "versionNonce": 1104453739, "isDeleted": false, "boundElements": [], - "updated": 1758879973600, + "updated": 1759053302739, "link": null, "locked": false, - "text": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "text": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", "fontSize": 20, "fontFamily": 8, "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", + "originalText": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]", "autoResize": true, "lineHeight": 1.25 }, @@ -541,7 +541,7 @@ "type": "text", "x": 396.375, "y": -107.75, - "width": 346.3997802734375, + "width": 396, "height": 100, "angle": 0, "strokeColor": "#1e1e1e", @@ -556,16 +556,16 @@ "index": "aM", "roundness": null, "seed": 1187595241, - "version": 128, - "versionNonce": 1487192455, + "version": 130, + "versionNonce": 1273030504, "isDeleted": false, "boundElements": [], - "updated": 1758879825591, + "updated": 1759070012771, "link": null, "locked": false, "text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ", "fontSize": 20, - "fontFamily": 5, + "fontFamily": 8, "textAlign": "left", "verticalAlign": "top", "containerId": null, @@ -646,6 +646,43 @@ "originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str", "autoResize": true, "lineHeight": 1.25 + }, + { + "id": "2UXjWdE_jMcsCE2oQgTXn", + "type": "text", + "x": -334.75, + "y": 1112.5, + "width": 165, + "height": 25, + "angle": 0, + 
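
A worked sketch of the byte-pair merge rounds implemented by the patch above, illustrative only and not part of the patch series; the toy merge table is assumed.

import Project_Model.Libs.BPE as BPE

# toy vocabulary: (a, b) -> 256, then (256, 256) -> 257
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257}
bpe = BPE.NanoSocratesBPE(VOCABULARY)

# "ababc": round 1 -> [256, 256, 99], round 2 -> [257, 99], then stable
print(bpe.encode("ababc"))                          # [257, 99]
print("".join(bpe.decode(t) for t in [257, 99]))    # 'ababc'
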
"strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aQ", + "roundness": null, + "seed": 700532363, + "version": 76, + "versionNonce": 1671597672, + "isDeleted": false, + "boundElements": [], + "updated": 1759070020002, + "link": null, + "locked": false, + "text": "class TokeNano:", + "fontSize": 20, + "fontFamily": 8, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "class TokeNano:", + "autoResize": true, + "lineHeight": 1.25 } ], "appState": { From 6ddb7de9da1af4fad8d8bae265f0622f56ba6bec Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 15:19:19 +0200 Subject: [PATCH 20/75] Added sqlAlchemy to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e87882c..70a3169 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ tzdata==2025.2 urllib3==2.5.0 wheel==0.45.1 Wikipedia-API==0.8.1 +SQLAlchemy From bd72ad3571bf2710cd154c5cf08b448dc194f13d Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 15:21:26 +0200 Subject: [PATCH 21/75] Added file to execute the complete cleaning pipeline --- .../data_output_models/bpe_corpus.py | 21 ++ .../rdf_completation_task.py | 26 +++ .../data_output_models/rdf_mask_task.py | 58 ++++++ .../data_output_models/rdf_text_tasks.py | 26 +++ Scripts/DataCleaning/filter.py | 184 ++++++++++++++++++ Scripts/DataCleaning/pipeline.py | 107 ++++++++++ .../Libs/CleaningPipeline/special_token.py | 21 ++ Scripts/Libs/CleaningPipeline/sql_endpoint.py | 144 ++++++++++++++ Scripts/Libs/Utils/dataframe_interaction.py | 9 + 9 files changed, 596 insertions(+) create mode 100644 Scripts/DataCleaning/data_output_models/bpe_corpus.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_completation_task.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_mask_task.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_text_tasks.py create mode 100644 Scripts/DataCleaning/filter.py create mode 100644 Scripts/DataCleaning/pipeline.py create mode 100644 Scripts/Libs/CleaningPipeline/special_token.py create mode 100644 Scripts/Libs/CleaningPipeline/sql_endpoint.py create mode 100644 Scripts/Libs/Utils/dataframe_interaction.py diff --git a/Scripts/DataCleaning/data_output_models/bpe_corpus.py b/Scripts/DataCleaning/data_output_models/bpe_corpus.py new file mode 100644 index 0000000..a0348b6 --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/bpe_corpus.py @@ -0,0 +1,21 @@ +from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken +import pandas as pd + +class BPE_corpus(): + + def __init__(self, output_path :str): + self.output_handler = open(output_path, "w") + + def close(self): + # add corpus end before closing + self.output_handler.write(SpecialToken.CORPUS_END.value) + self.output_handler.close() + + def write_from_str(self, output: str): + if output == '': + return + self.output_handler.write(output) + + def write_from_df(self, df: pd.DataFrame): + self.write_from_str(get_raw_from_dataframe(df)) \ No newline at end of file diff --git a/Scripts/DataCleaning/data_output_models/rdf_completation_task.py b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py new file mode 100644 index 0000000..111b2b9 --- /dev/null +++ 
b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_completation_task_dataset():
+    """
+    Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
+    Each RDF is saved as a string.
+    CSV Composition: ["MovieID","RDF"]
+    """
+    def __init__(self, output_path:str):
+
+
+        self.output = open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","RDF"]
+        """
+
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file

diff --git a/Scripts/DataCleaning/data_output_models/rdf_mask_task.py b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
new file mode 100644
index 0000000..01b943d
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py
@@ -0,0 +1,58 @@
+import pandas as pd
+
+# do not worry about circular dependencies, this class will never call something else
+from Scripts.DataCleaning.filter import PipelineApplier
+
+class RDF_mask_task_dataset():
+    """
+    Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
+    For each RDF triple the CSV contains 3 rows, each one with a different component missing.
+    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
+    """
+    def __init__(self, output_path:str):
+
+        # these methods will only be used by this class, but they belong in a lower level
+        self._build_triple = PipelineApplier.build_triple
+        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
+
+        self.output = open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","IncompleteRDF","Missing","RDF"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        rdf_complete = self._build_triple(RDF)
+
+        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
+        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
+        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))
+        ####
+        df_subject = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_subject,
+            "Missing": RDF["SubjectURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_relationship = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_relationship,
+            "Missing": RDF["RelationshipURI"],
+            "RDF": rdf_complete,
+        })
+
+        df_object = pd.DataFrame({
+            "MovieID": RDF["MovieID"],
+            "IncompleteRDF": rdf_without_object,
+            "Missing": RDF["ObjectURI"],
+            "RDF": rdf_complete,
+        })
+
+
+        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
+        output_df.to_csv(self.output, index=False, header=False)
+
+
diff --git a/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
new file mode 100644
index 0000000..918e600
--- /dev/null
+++ b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
@@ -0,0 +1,26 @@
+import pandas as pd
+
+class RDF_text_task_dataset():
+    """
+    Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
+    In the CSV the RDFs are saved together as a single string.
+    CSV Composition: ["MovieID","RDFs","Abstract"]
+    """
+    def __init__(self, output_path:str):
+
+
+        self.output = open(output_path, "w")
+        # then the first row as header
+        header = ["MovieID","RDFs","Abstract"]
+        self.output.write(",".join(header) + "\n")
+
+    def close(self):
+        self.output.close()
+
+    def write(self, RDF: pd.DataFrame):
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+        """
+
+        RDF.to_csv(self.output, index=False, header=False)
\ No newline at end of file

diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py
new file mode 100644
index 0000000..50d6ead
--- /dev/null
+++ b/Scripts/DataCleaning/filter.py
@@ -0,0 +1,184 @@
+# This file deletes the unwanted relationships in the pipeline, following different rules
+import pandas as pd
+import sqlite3
+import numpy as np
+
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
+
+
+class PipelineApplier():
+
+    def __init__(self):
+
+        self.MOVIE_FILTER = pd.DataFrame()
+        self.REL_FILTER = pd.DataFrame()
+
+
+    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
+        return RDF[RDF["RelationshipURI"] != uri]
+
+    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
+        """Store RelationshipURI filters as a set """
+        self.relationship_filter_list: set[str] = set(filter_list)
+
+    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Remove rows whose RelationshipURI is in the stored filter. Generate it first by calling generate_list_relationship_filter"""
+        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
+
+
+    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
+        """
+        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
+        since this method creates that filter
+        Args:
+            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
+            min_threshold (int):
+            max_threshold (int):
+        """
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_threshold]
+        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_threshold]
+        self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]
+
+    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_threshold]
+        REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_threshold]
+        self.REL_FILTER = REL_COUNT #["RelationshipURI"]
+
+    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
+        return RDF
+
+    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
+        return RDF
+
+    def rdf_add_special_token(self, RDF: pd.DataFrame):
+        """
+        Adds the RDF special tokens to each element of the triple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
+        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
+        It only adds the special tokens of the three elements of the RDF, no other special token.
+        Args:
+            RDF (pd.DataFrame):
+        Returns:
+            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+        """
+        # if a filter run earlier sliced the RDF and created a View, copying here resolves the problem
+        # for more context: SettingWithCopyWarning
+        RDF = RDF.copy()
+        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
+        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
+        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
+        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
+        return RDF
+
+
+    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        # the dataset has SubjectURI RelationshipURI ObjectURI
+        # we want to drop the '' in them
+        # Replace empty strings with NaN
+        RDF = RDF.replace('', np.nan)
+        # Drop rows where any of the key columns are NaN
+        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
+        return RDF
+
+    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Collapse the triples of one movie into a single row.
+
+        Args:
+            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # to execute this method you must have iterated by movie_id,
+        # because by design we want one row for each movie at the end
+        # MovieID and abstract can be given as input for a more generic method
+        # movie_id = RDF["MovieID"].iloc(0)
+        # abstract = RDF["Abstract"].iloc(0)
+        # first let's combine each row, creating column Triple as the join of the rdf
+        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # special token
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add special token for: start of triple, end of triple and start of abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """
+        Args:
+            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
+
+        Returns:
+            pd.DataFrame: ["MovieID","Triple","Abstract"]
+        """
+        # combine rows into one
+        # MovieID and Abstract are unique for each other 1 <-> 1
+        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
+        # add special token for: start of triple, end of triple and start of abstract
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        return RDF[["MovieID","Triple","Abstract"]]
+
+
+    @staticmethod
+    def build_triple(RDF: pd.DataFrame):
+        """
+        Obtains the joined RDF triple as one element, together with the START and END special tokens
+        Args:
+            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
+        Returns:
+            pd.Series: RDF["Triple"] (just this column)
+        """
+        # let's combine each row, creating column Triple as the join of the rdf
+        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
+        # special token
+        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
+        return RDF["Triple"]
+
+    @staticmethod
+    def build_incomplete_triple(RDF: pd.DataFrame):
+        """
+        Method helper used for the third task: "Predicting
a masked component within an RDF triple". + Obtains joined RDF triple in one element, togheter with START and END special token. + The MISSING element will be replaced by the special token + Args: + RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"] + Returns: + RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME) + """ + # let's create a new column "Triple" with the joined RDF + + # the following creates a column of MASK token of the lenght of the dataframe, + # it is not needed since we expect to have a dataframe of just one column, but its more robust (AND SLOW) + MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index) + + RDF["Triple"] = ( + RDF.get("SubjectURI", MISSING) + + RDF.get("RelationshipURI", MISSING) + + RDF.get("ObjectURI", MISSING)) + # special token + RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value + return RDF["Triple"] + + @staticmethod + def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame: + # currently not used + """ + Method helper used for the third task: "Predicting a masked component within an RDF triple". + Given two Dataframe, the first containing the incompleted RDF and the other only the missing componment, + this methods applies the special token + Args: + RDF (pd.DataFrame): _description_ + + Returns: + pd.DataFrame: _description_ + """ + # take an example dataframe as ["SubjectURI",""] + # as input two dataframe, one with 2 column + return None + diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py new file mode 100644 index 0000000..e07294b --- /dev/null +++ b/Scripts/DataCleaning/pipeline.py @@ -0,0 +1,107 @@ +import re +from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint +from Scripts.DataCleaning.filter import PipelineApplier +# tasks dataset builder +from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset +from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus +from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset +from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset + +import pandas as pd + +class Pipeline(): + def __init__(self, output): + self.sql_endpoint = SqlEndpoint() + # classes to manage taskes' datasets + self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv") + self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt") + self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") + self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") + + # prepare the filter + # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset + self.filter_applier = PipelineApplier() + MOVIE_COUNT = self.sql_endpoint.get_movies_id_count() + REL_COUNT = self.sql_endpoint.get_relationship_count() + self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000) + self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) + # prepare the filter ot the relationshipURI you want to delete: + relationship_uri_banned_list = [ + "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract", + "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates", + "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", 
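A quick, self-contained illustration of the per-movie rebuild step above, with toy rows and placeholder marker strings (the real code uses the SpecialToken enum values, which are not reproduced here):

    import pandas as pd

    # Hypothetical stand-ins for the special tokens used by rebuild_by_movie.
    SOT, EOT, SOTL, ABS = "<sot>", "<eot>", "<sotl>", "<abs>"

    rdf = pd.DataFrame({
        "MovieID": [1, 1, 2],
        "SubjectURI": ["s1", "s1", "s2"],
        "RelationshipURI": ["r1", "r2", "r1"],
        "ObjectURI": ["o1", "o2", "o3"],
        "Abstract": ["abstract one", "abstract one", "abstract two"],
    })

    # Join each row into a single wrapped triple string.
    rdf["Triple"] = SOT + rdf["SubjectURI"] + rdf["RelationshipURI"] + rdf["ObjectURI"] + EOT

    # Collapse to one row per movie; MovieID <-> Abstract is 1:1, so both can key the groupby.
    out = rdf.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
    out["Triple"] = SOTL + out["Triple"]
    out["Abstract"] = ABS + out["Abstract"]
    print(out[["MovieID", "Triple", "Abstract"]])
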
"w3:2000/01/rdf-schema#comment", + "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"] + self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list) + + + def _end_file_handler(self): + self.task_bpe_corpus.close() + self.task_rdf_mask.close() + self.task_rdf_text.close() + self.task_rdf_completation.close() + + def _get_cleaned_movie_rows(self): + for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): + RDF = self.filter_applier.drop_na_from_dataset(RDF) + RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) + RDF = self.filter_applier.filter_by_frequency_relationship(RDF) + # other filter + # + RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + if RDF.empty: + continue + RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE + yield RDF + + def execute_task_bpe_corpus(self): + for RDF in self._get_cleaned_movie_rows(): + RDF = self.filter_applier.rebuild_by_movie(RDF) + RDF = RDF[["Triple","Abstract"]] + self.task_bpe_corpus.write_from_df(RDF) + self._end_file_handler() + + + def execute_task_rdf_mask(self): + for RDF in self._get_cleaned_movie_rows(): + self.task_rdf_mask.write(RDF) + self._end_file_handler() + + def execute_tasks_rdf_text(self): + for RDF in self._get_cleaned_movie_rows(): + RDF = self.filter_applier.rebuild_by_movie(RDF) + self.task_rdf_text.write(RDF) + self._end_file_handler() + + def execute_task_rdf_completation(self): + for RDF in self._get_cleaned_movie_rows(): + RDF["Triple"] = self.filter_applier.build_triple(RDF) + self.task_rdf_completation.write(RDF[["MovieID","Triple"]]) + self._end_file_handler() + + + def execute_all_task(self): + for RDF in self._get_cleaned_movie_rows(): + self.task_rdf_mask.write(RDF) + + RDF["Triple"] = self.filter_applier.build_triple(RDF) + self.task_rdf_completation.write(RDF[["MovieID","Triple"]]) + + RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]]) + + self.task_rdf_text.write(RDF) + self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]]) + + self._end_file_handler() + + + + + + + +pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt") +# pipeline.execute_task_bpe_corpus() +# pipeline.execute_task_rdf_mask() +# pipeline.execute_tasks_rdf_text() +# pipeline.execute_task_rdf_completation() +pipeline.execute_all_task() \ No newline at end of file diff --git a/Scripts/Libs/CleaningPipeline/special_token.py b/Scripts/Libs/CleaningPipeline/special_token.py new file mode 100644 index 0000000..644ad71 --- /dev/null +++ b/Scripts/Libs/CleaningPipeline/special_token.py @@ -0,0 +1,21 @@ +from enum import Enum + +class SpecialToken(str, Enum): + # (Enum, str) -> throws an error + START_TRIPLE_LIST = "" + START_TRIPLE = "" + END_TRIPLE = "" + SUBJECT = "" + RELATIONSHIP = "" + OBJECT = "" + ABSTRACT = "" + CORPUS_END = "" + + ## Tasks' Token + RDF_TO_TEXT = "" + TEXT_TO_RDF = "" + CONTINUE_RDF = "" + MASK = "" + + #BPE Training: + \ No newline at end of file diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py new file mode 100644 index 0000000..4e43528 --- /dev/null +++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py @@ -0,0 +1,144 @@ +####################################################### +# This file stand as endpoint to interact with DB # +####################################################### + +# import sqlite3 +import pandas as pd +from sqlalchemy import create_engine +from 
diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
new file mode 100644
index 0000000..4e43528
--- /dev/null
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -0,0 +1,144 @@
+#######################################################
+# This file stands as the endpoint to interact with DB#
+#######################################################
+
+# import sqlite3
+import pandas as pd
+from sqlalchemy import create_engine
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+
+
+class SqlEndpoint():
+
+    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
+        # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
+        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
+        # /// 3 slashes -> relative path
+        # //// 4 slashes -> absolute
+        # self.conn = self.sql_engine.connect().execution_options(stream_results=True)
+        # it seems that sqlite doenst support streamer cursor
+        # PRAGMA exeutes better in writing not reading
+        self.chunk_size_row = chunk_size_row
+        pass
+
+    def get_RDF(self) -> pd.DataFrame :
+
+        QUERY = """
+            SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
+            FROM RDFs
+            INNER JOIN Subjects USING (SubjectID)
+            INNER JOIN Relationships USING (RelationshipID)
+            INNER JOIN Objects USING (ObjectID);
+        """
+
+        return pd.read_sql_query(QUERY, self.sql_engine)  # was self.CONN, which is no longer defined
+
+    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
+        """
+        Returns:
+            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+        """
+
+        QUERY = """
+            SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+            FROM RDFs
+            INNER JOIN ParsedSubjects USING (SubjectID)
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            INNER JOIN WikipediaAbstracts USING (MovieID);
+        """
+
+        # return pd.read_sql_query(QUERY, self.CONN, chunksize=500)
+        # sqlite3
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
+
+
+    def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
+        # DEPRECATED !
+        start_token = SpecialToken()
+        QUERY = """
+            SELECT
+                MovieID,
+                ? || SubjectURI AS SubjectURI,
+                ? || RelationshipURI AS RelationshipURI,
+                ? || ObjectURI AS ObjectURI,
+                Abstract
+            FROM RDFs
+            INNER JOIN ParsedSubjects USING (SubjectID)
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            INNER JOIN WikipediaAbstracts USING (MovieID);
+        """
+        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)
+
+    def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
+        """
+        Yields one DataFrame per movie (with all of its rows in the dataset).
+        The retrieved RDFs are already abbreviated by the sql parser
+        Yields:
+            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
+        """
+        # chunk by movieId, abstract is the same and some intersting logic are appliable
+        movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
+        # CHOOSEN MOVIE:
+        # The Dark Knight : 117248
+        # Inception : 147074
+        # The Avengers : 113621
+        # Cast Away : 1123
+        # The Departed : 117586
+        # American Psycho : 90177
+        # Avatar : 71587
+        # Django Unchained : 138952
+        # Spirited Away : 144137
+        # Knives Out : 148025
+        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        movie_ids = movie_list
+
+        QUERY = """
+            SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
+            FROM RDFs
+            INNER JOIN ParsedSubjects USING (SubjectID)
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            INNER JOIN ParsedObjects USING (ObjectID)
+            INNER JOIN WikipediaAbstracts USING (MovieID)
+            WHERE MovieID = (?);
+        """
+
+        for movie_id in movie_ids:
+            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
+
+    def get_movies_id_count(self) -> pd.DataFrame:
+        """
+        Gets the count of each Movie in the Dataset
+        Returns:
+            Pandas.DataFrame: [MovieID, Count]
+        """
+        QUERY = """
+            SELECT MovieID, COUNT(*) AS Count
+            FROM RDFs
+            GROUP BY MovieID;
+        """
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+    def get_relationship_count(self) -> pd.DataFrame:
+        """
+        Gets the count of each Relationship in the Dataset
+        Returns:
+            Pandas.DataFrame: [RelationshipURI, Count]
+        """
+        QUERY = """
+            SELECT RelationshipURI, COUNT(*) AS Count
+            FROM RDFs
+            INNER JOIN ParsedRelationships USING (RelationshipID)
+            GROUP BY RelationshipURI;
+        """
+        return pd.read_sql_query(QUERY, self.sql_engine)
+
+
+
+if __name__ == "__main__" :
+    sql_endpoint = SqlEndpoint()
+    for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
+        print(pandas_row)
+    # sql_endpoint.get_RDF()
+    print("done")
\ No newline at end of file
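The per-movie streaming pattern of get_abbreviated_dataset_by_movie_id can be reproduced in miniature with the standard library alone; a rough sketch (in-memory sqlite3 instead of the SQLAlchemy engine on dataset.db, and a reduced schema):

    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({
        "MovieID": [1, 1, 2],
        "SubjectURI": ["s1", "s1", "s2"],
        "RelationshipURI": ["r1", "r2", "r1"],
        "ObjectURI": ["o1", "o2", "o3"],
    }).to_sql("RDFs", conn, index=False)

    QUERY = "SELECT * FROM RDFs WHERE MovieID = ?"

    def by_movie(movie_ids):
        # One parameterized query per movie, one DataFrame per yield.
        for movie_id in movie_ids:
            yield pd.read_sql_query(QUERY, conn, params=(movie_id,))

    for frame in by_movie([1, 2]):
        print(frame)

Chunking by MovieID keeps each DataFrame small and guarantees the Abstract is constant within a chunk, which is what the grouping logic downstream relies on.
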
diff --git a/Scripts/Libs/Utils/dataframe_interaction.py b/Scripts/Libs/Utils/dataframe_interaction.py
new file mode 100644
index 0000000..c4df33a
--- /dev/null
+++ b/Scripts/Libs/Utils/dataframe_interaction.py
@@ -0,0 +1,9 @@
+import pandas as pd
+
+
+
+def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
+    output = ''
+    for row in DF.itertuples(index=False, name=None):
+        output += "".join(map(str, row))
+    return output

From 8167c9d435b15a4f189d57b6644a376fed2f2e2c Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Mon, 29 Sep 2025 16:03:49 +0200
Subject: [PATCH 22/75] Added Toy Dataset entry point into the Pipeline class

Before, it was forced into the sql_endpoint; now the whole pipeline can be
managed in the Pipeline class
---
 Scripts/DataCleaning/pipeline.py              | 78 ++++++++++++-------
 Scripts/Libs/CleaningPipeline/sql_endpoint.py | 12 +--
 2 files changed, 57 insertions(+), 33 deletions(-)

diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
index e07294b..eb5b2f7 100644
--- a/Scripts/DataCleaning/pipeline.py
+++ b/Scripts/DataCleaning/pipeline.py
@@ -10,22 +10,22 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co
 import pandas as pd
 
 class Pipeline():
-    def __init__(self, output):
+    def __init__(self):
         self.sql_endpoint = SqlEndpoint()
         # classes to manage taskes' datasets
-        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv")
-        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt")
+ self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv") + self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt") self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") # prepare the filter - # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset + # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset self.filter_applier = PipelineApplier() MOVIE_COUNT = self.sql_endpoint.get_movies_id_count() REL_COUNT = self.sql_endpoint.get_relationship_count() self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000) self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) - # prepare the filter ot the relationshipURI you want to delete: + # prepare the filter on the relationshipURI you want to delete: relationship_uri_banned_list = [ "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract", "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates", @@ -34,25 +34,6 @@ class Pipeline(): self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list) - def _end_file_handler(self): - self.task_bpe_corpus.close() - self.task_rdf_mask.close() - self.task_rdf_text.close() - self.task_rdf_completation.close() - - def _get_cleaned_movie_rows(self): - for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): - RDF = self.filter_applier.drop_na_from_dataset(RDF) - RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) - RDF = self.filter_applier.filter_by_frequency_relationship(RDF) - # other filter - # - RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) - if RDF.empty: - continue - RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE - yield RDF - def execute_task_bpe_corpus(self): for RDF in self._get_cleaned_movie_rows(): RDF = self.filter_applier.rebuild_by_movie(RDF) @@ -66,12 +47,14 @@ class Pipeline(): self.task_rdf_mask.write(RDF) self._end_file_handler() + def execute_tasks_rdf_text(self): for RDF in self._get_cleaned_movie_rows(): RDF = self.filter_applier.rebuild_by_movie(RDF) self.task_rdf_text.write(RDF) self._end_file_handler() + def execute_task_rdf_completation(self): for RDF in self._get_cleaned_movie_rows(): RDF["Triple"] = self.filter_applier.build_triple(RDF) @@ -92,14 +75,55 @@ class Pipeline(): self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]]) self._end_file_handler() - + + + def _end_file_handler(self): + self.task_bpe_corpus.close() + self.task_rdf_mask.close() + self.task_rdf_text.close() + self.task_rdf_completation.close() + + + def _get_cleaned_movie_rows(self): + for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): + RDF = self.filter_applier.drop_na_from_dataset(RDF) + RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) + RDF = self.filter_applier.filter_by_frequency_relationship(RDF) + # other filter + # + RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + if RDF.empty: + continue + RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE + yield RDF + + + def use_toy_dataset(self): + # CHOOSEN MOVIE: + # The Dark Knight : 117248 + # Inception : 147074 + # The Avengers : 113621 + # Cast Away : 1123 + # The Departed : 117586 + # American 
Psycho : 90177 + # Avatar : 71587 + # Django Unchained : 138952 + # Spirited Away : 144137 + # Knives Out : 148025 + movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + self.sql_endpoint.movie_ids = movie_list +# there are a lot of settings to manage +# you only need to change settings: +# in the init for file paths, frequency filter limit, banned reletionshipURI +# in the use_toy_dataset , to change the toy dataset +# in _get_cleaned_movie_rows: to change how the pipeline behave +pipeline = Pipeline() - -pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt") +# pipeline.use_toy_dataset() # pipeline.execute_task_bpe_corpus() # pipeline.execute_task_rdf_mask() # pipeline.execute_tasks_rdf_text() diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py index 4e43528..66ba1ea 100644 --- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py +++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py @@ -18,8 +18,8 @@ class SqlEndpoint(): # self.conn = self.sql_engine.connect().execution_options(stream_results=True) # it seems that sqlite doenst support streamer cursor # PRAGMA exeutes better in writing not reading - self.chunk_size_row = chunk_size_row - pass + self.chunk_size_row = chunk_size_row # not used now, since each chunk is a movie + self.movie_ids = movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] def get_RDF(self) -> pd.DataFrame : @@ -79,7 +79,7 @@ class SqlEndpoint(): Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract] """ # chunk by movieId, abstract is the same and some intersting logic are appliable - movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] + # movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] # CHOOSEN MOVIE: # The Dark Knight : 117248 # Inception : 147074 @@ -91,8 +91,8 @@ class SqlEndpoint(): # Django Unchained : 138952 # Spirited Away : 144137 # Knives Out : 148025 - movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] - movie_ids = movie_list + # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + # movie_ids = movie_list QUERY = """ SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract @@ -104,7 +104,7 @@ class SqlEndpoint(): WHERE MovieID = (?); """ - for movie_id in movie_ids: + for movie_id in self.movie_ids: yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,)) def get_movies_id_count(self) -> pd.DataFrame: From 255d8a072d8e95920bbb723c4536f454a741ab02 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 16:59:52 +0200 Subject: [PATCH 23/75] First implementation of the cleaning pipeline UML --- .../cleaning-pipeline.excalidraw.json | 634 ++++++++++++++++++ 1 file changed, 634 insertions(+) create mode 100644 Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json new file mode 100644 index 0000000..1249185 --- /dev/null +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -0,0 +1,634 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "JNB9z-PeqZ4s8KDfWaoXe", + "type": "rectangle", + "x": 106, + "y": 27, + "width": 653, + "height": 263, + "angle": 0, + 
"strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 710740889, + "version": 326, + "versionNonce": 1107631703, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false + }, + { + "id": "e13wNTgUpn2flMpmMttqx", + "type": "text", + "x": 200.5943407656526, + "y": 44.07937975075269, + "width": 307.2781467269385, + "height": 23.3097531902191, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": null, + "seed": 1012740663, + "version": 444, + "versionNonce": 589551257, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false, + "text": "Libs/CleaningPipeline/sql_endpoint", + "fontSize": 18.64780255217528, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Libs/CleaningPipeline/sql_endpoint", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "CgxCElJkKBtIHv-5WQrbo", + "type": "text", + "x": 195, + "y": 80.44259472749451, + "width": 403.64997665852184, + "height": 186.4780255217528, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": null, + "seed": 1261951799, + "version": 507, + "versionNonce": 1922906999, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false, + "text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n", + "fontSize": 18.64780255217528, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "line", + "version": 4978, + "versionNonce": 2079525497, + "isDeleted": false, + "id": "sYReMTdYblr-oJtYYJALU", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -68.05426555317842, + "y": 87.19293561900287, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.09201683999922, + "height": 99.49948667804088, + "seed": 1263944119, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0.2542098813493443, + 75.20117273657175 + ], + [ + 0.011896425679918422, + 83.76249969444815 + ], + [ + 3.970409367559332, + 87.46174320643391 + ], + [ + 17.75573317066317, + 90.59250103325854 + ], + [ + 41.05683533152865, + 91.56737225214069 + ], + [ + 63.319497586673116, + 90.01084754868091 + ], + [ + 75.14781395923075, + 86.28844687220405 + ], + [ + 76.81603792670788, + 83.15042405259751 + ], + [ + 77.05033394391478, + 76.25776215104557 + ], + [ + 76.86643881413028, + 
6.3089586511537865 + ], + [ + 76.45188016352971, + -0.2999144698665015 + ], + [ + 71.50179495549581, + -3.9936571317850627 + ], + [ + 61.077971898861186, + -6.132877429442784 + ], + [ + 37.32348754161154, + -7.932114425900202 + ], + [ + 18.278415656797975, + -6.859225353587373 + ], + [ + 3.2995959613238286, + -3.2201165291205287 + ], + [ + -0.04168289608444441, + -0.045185660461322996 + ], + [ + 0, + 0 + ] + ], + "index": "a6", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 2683, + "versionNonce": 33379161, + "isDeleted": false, + "id": "0S6dEWQVqKUVkP6Z5IX1l", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -67.53033611490343, + "y": 144.31921927673278, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.17198221193564, + "height": 8.562348957853036, + "seed": 817033943, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 2.033150371639873, + 3.413095389435587 + ], + [ + 10.801287372573954, + 6.276651055277943 + ], + [ + 22.468666942209353, + 8.010803051612635 + ], + [ + 40.747074201802775, + 8.168828515515864 + ], + [ + 62.077348233027564, + 7.0647721921469495 + ], + [ + 74.53446931782398, + 3.04824021069218 + ], + [ + 77.17198221193564, + -0.3935204423371723 + ] + ], + "index": "a7", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 2769, + "versionNonce": 1703641145, + "isDeleted": false, + "id": "szGLND7J0nVOvRkNXX9AS", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -68.56219343740725, + "y": 115.35516394150972, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.17198221193564, + "height": 8.562348957853036, + "seed": 1704755191, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 2.033150371639873, + 3.413095389435587 + ], + [ + 10.801287372573954, + 6.276651055277943 + ], + [ + 22.468666942209353, + 8.010803051612635 + ], + [ + 40.747074201802775, + 8.168828515515864 + ], + [ + 62.077348233027564, + 7.0647721921469495 + ], + [ + 74.53446931782398, + 3.04824021069218 + ], + [ + 77.17198221193564, + -0.3935204423371723 + ] + ], + "index": "a8", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 5766, + "versionNonce": 344002841, + "isDeleted": false, + "id": "O3t2uGktJlDd1_OX_bpV4", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -69.6201424194893, + "y": 80.06066699332126, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 76.59753601865496, + "height": 15.49127539284798, + "seed": 471296279, + "groupIds": [ + 
"9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [ + "bxuMGTzXLn7H-uBCptINx" + ], + "index": "a9", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1176, + "versionNonce": 1951499769, + "isDeleted": false, + "id": "_SzKlOBOvJgBg7FX0JTTM", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -33.12815531426679, + "y": 104.53733467322485, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1368927799, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aA", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1464, + "versionNonce": 1879072473, + "isDeleted": false, + "id": "oJMl2Kxa3SPaiAY0kxo7A", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -32.77701353033319, + "y": 130.75394896028996, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1627606871, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aB", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1347, + "versionNonce": 1176574905, + "isDeleted": false, + "id": "fB6pJBSMA-pRHrpgYKaLL", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 6.239590202363168, + "x": -32.12815531426679, + "y": 159.52267553159635, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1420643447, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aC", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 845, + "versionNonce": 383204505, + "isDeleted": false, + "id": "9gZ3Yy1MeP9kEOTLODqLG", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -77.72012292771115, + "y": 181.11281713043917, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 95.63072204589844, + "height": 23.595161071904883, + "seed": 2019206551, + "groupIds": [ + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "fontSize": 17.4778970902999, + "fontFamily": 1, + "text": "dataset.db", + "baseline": 16.595161071904883, + "textAlign": "center", + "verticalAlign": "top", + "index": "aD", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false, + "containerId": null, + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.350000000000001 + }, + { + "id": "3eOw20xMhpB5jf_RMG24P", + "type": "text", + "x": 1131.3333333333335, + "y": 31.333333333333428, + 
"width": 508.3333333333333, + "height": 550, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 1535658041, + "version": 821, + "versionNonce": 1630266809, + "isDeleted": false, + "boundElements": null, + "updated": 1759157181677, + "link": null, + "locked": false, + "text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "Fbl1gpb5r7QrdRauGUWm2", + "type": "text", + "x": 158.23809523809535, + "y": 502.52380952380935, + "width": 484.2857142857143, + "height": 475, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aF", + "roundness": null, + "seed": 2066618807, + "version": 541, + "versionNonce": 7392153, + "isDeleted": false, + "boundElements": null, + "updated": 1759157954202, + "link": null, + "locked": false, + "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "autoResize": false, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file 
From c319398ca01f10f5a2099219146649390cfec4a9 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 17:03:31 +0200 Subject: [PATCH 24/75] little update to UML pipeline --- .../cleaning-pipeline.excalidraw.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json index 1249185..a3b4660 100644 --- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -592,7 +592,7 @@ "x": 158.23809523809535, "y": 502.52380952380935, "width": 484.2857142857143, - "height": 475, + "height": 500, "angle": 0, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", @@ -606,20 +606,20 @@ "index": "aF", "roundness": null, "seed": 2066618807, - "version": 541, - "versionNonce": 7392153, + "version": 552, + "versionNonce": 1269344823, "isDeleted": false, "boundElements": null, - "updated": 1759157954202, + "updated": 1759158199532, "link": null, "locked": false, - "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", "fontSize": 20, "fontFamily": 5, "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", "autoResize": false, "lineHeight": 1.25 } From 007f1e955405ba466ab68ac0c7da656c3edca905 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 18:53:33 +0200 Subject: [PATCH 25/75] minor updates --- .vscode/settings.json | 23 ++++++- .../cleaning-pipeline.excalidraw.json | 64 +++++++++---------- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 17ae78b..226939d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,24 @@ { - "jupyter.notebookFileRoot": "${workspaceFolder}" + // Always treat the project 
root as the working dir for Jupyter + "jupyter.notebookFileRoot": "${workspaceFolder}", + + // When you click "Run Python File in Terminal", DON'T cd into the file's folder + "python.terminal.executeInFileDir": false, + + // Start new integrated terminals at the project root + "terminal.integrated.cwd": "${workspaceFolder}", + + // Ensure Python can import from the project root no matter which file you run + // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed. + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + + // Make pytest run from the root without needing a pytest.ini + "python.testing.pytestEnabled": true, + "python.testing.cwd": "${workspaceFolder}", + "python.testing.pytestArgs": ["src/test"], + + // Help Pylance resolve imports like `from src...` without red squiggles + "python.analysis.extraPaths": ["${workspaceFolder}"] } \ No newline at end of file diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json index a3b4660..c7019f5 100644 --- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -109,8 +109,8 @@ }, { "type": "line", - "version": 4978, - "versionNonce": 2079525497, + "version": 4979, + "versionNonce": 1473849177, "isDeleted": false, "id": "sYReMTdYblr-oJtYYJALU", "fillStyle": "solid", @@ -119,7 +119,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -68.05426555317842, + "x": -67.14432426259049, "y": 87.19293561900287, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -221,14 +221,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "line", - "version": 2683, - "versionNonce": 33379161, + "version": 2684, + "versionNonce": 952947769, "isDeleted": false, "id": "0S6dEWQVqKUVkP6Z5IX1l", "fillStyle": "solid", @@ -237,7 +237,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -67.53033611490343, + "x": -66.6203948243155, "y": 144.31921927673278, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -295,14 +295,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "line", - "version": 2769, - "versionNonce": 1703641145, + "version": 2770, + "versionNonce": 477619481, "isDeleted": false, "id": "szGLND7J0nVOvRkNXX9AS", "fillStyle": "solid", @@ -311,7 +311,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -68.56219343740725, + "x": -67.65225214681931, "y": 115.35516394150972, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -369,14 +369,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 5766, - "versionNonce": 344002841, + "version": 5767, + "versionNonce": 2119031289, "isDeleted": false, "id": "O3t2uGktJlDd1_OX_bpV4", "fillStyle": "solid", @@ -385,7 +385,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -69.6201424194893, + "x": -68.71020112890136, "y": 80.06066699332126, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -404,14 +404,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1176, - "versionNonce": 1951499769, + "version": 1177, + "versionNonce": 525480665, "isDeleted": false, "id": 
"_SzKlOBOvJgBg7FX0JTTM", "fillStyle": "solid", @@ -420,7 +420,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -33.12815531426679, + "x": -32.218214023678854, "y": 104.53733467322485, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -437,14 +437,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1464, - "versionNonce": 1879072473, + "version": 1465, + "versionNonce": 1410887609, "isDeleted": false, "id": "oJMl2Kxa3SPaiAY0kxo7A", "fillStyle": "solid", @@ -453,7 +453,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -32.77701353033319, + "x": -31.867072239745255, "y": 130.75394896028996, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -470,14 +470,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1347, - "versionNonce": 1176574905, + "version": 1348, + "versionNonce": 314839193, "isDeleted": false, "id": "fB6pJBSMA-pRHrpgYKaLL", "fillStyle": "solid", @@ -486,7 +486,7 @@ "roughness": 1, "opacity": 100, "angle": 6.239590202363168, - "x": -32.12815531426679, + "x": -31.218214023678854, "y": 159.52267553159635, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -503,14 +503,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "text", - "version": 845, - "versionNonce": 383204505, + "version": 846, + "versionNonce": 1091081593, "isDeleted": false, "id": "9gZ3Yy1MeP9kEOTLODqLG", "fillStyle": "solid", @@ -519,7 +519,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -77.72012292771115, + "x": -76.81018163712321, "y": 181.11281713043917, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -541,7 +541,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false, "containerId": null, From 18fc2ba9d810602b77e7e3cc76eaeaa5cbc492f5 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:32:24 +0200 Subject: [PATCH 26/75] Added Exceptions --- Project_Model/Libs/BPE/Errors/DuplicateWordException.py | 4 ++++ Project_Model/Libs/BPE/Errors/SentenceTooLongException.py | 4 ++++ 2 files changed, 8 insertions(+) create mode 100644 Project_Model/Libs/BPE/Errors/DuplicateWordException.py create mode 100644 Project_Model/Libs/BPE/Errors/SentenceTooLongException.py diff --git a/Project_Model/Libs/BPE/Errors/DuplicateWordException.py b/Project_Model/Libs/BPE/Errors/DuplicateWordException.py new file mode 100644 index 0000000..885ff5f --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/DuplicateWordException.py @@ -0,0 +1,4 @@ +class DuplicateWordException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Errors/SentenceTooLongException.py b/Project_Model/Libs/BPE/Errors/SentenceTooLongException.py new file mode 100644 index 0000000..f2d7c9e --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/SentenceTooLongException.py @@ -0,0 +1,4 @@ +class SentenceTooLongException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file From 2fe1ce9e9aac43cf16ba390cec1eff6b66e97e52 Mon Sep 17 00:00:00 2001 From: 
Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:32:37 +0200 Subject: [PATCH 27/75] Updated Inits --- Project_Model/Libs/BPE/Classes/__init__.py | 6 ++++-- Project_Model/Libs/BPE/Errors/__init__.py | 6 +++++- Project_Model/Libs/BPE/Utils/__init__.py | 7 +++++++ Project_Model/Libs/BPE/__init__.py | 2 ++ 4 files changed, 18 insertions(+), 3 deletions(-) create mode 100644 Project_Model/Libs/BPE/Utils/__init__.py diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index d8a7364..a52b024 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -1,9 +1,11 @@ from .NanoSocratesChunker import NanoSocratesChunker from .NanoSocratesSplitter import NanoSocratesSplitter -from .NanoSocratesBPE import NanoSocratesBPE +from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE +from .NanoSocraTrainer import NanoSocraTrainer __all__ = [ "NanoSocratesChunker", "NanoSocratesSplitter", - "NanoSocratesBPE" + "NanoSocratesBPE", + "NanoSocraTrainer" ] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Errors/__init__.py b/Project_Model/Libs/BPE/Errors/__init__.py index 587873f..262c27d 100644 --- a/Project_Model/Libs/BPE/Errors/__init__.py +++ b/Project_Model/Libs/BPE/Errors/__init__.py @@ -1,7 +1,11 @@ from .DelimiterNotFoundException import DelimiterNotFoundException from .OutOfDictionaryException import OutOfDictionaryException +from .DuplicateWordException import DuplicateWordException +from .SentenceTooLongException import SentenceTooLongException __all__ = [ "DelimiterNotFoundException", - "OutOfDictionaryException" + "OutOfDictionaryException", + "DuplicateWordException", + "SentenceTooLongException" ] diff --git a/Project_Model/Libs/BPE/Utils/__init__.py b/Project_Model/Libs/BPE/Utils/__init__.py new file mode 100644 index 0000000..f2320fa --- /dev/null +++ b/Project_Model/Libs/BPE/Utils/__init__.py @@ -0,0 +1,7 @@ +from .special_regex_maker import special_regex_maker +from .lag_checker_iterator import iterator_with_checks + +__all__ = [ + "special_regex_maker", + "iterator_with_checks" +] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/__init__.py b/Project_Model/Libs/BPE/__init__.py index 6f7d1f2..074133b 100644 --- a/Project_Model/Libs/BPE/__init__.py +++ b/Project_Model/Libs/BPE/__init__.py @@ -1,7 +1,9 @@ from .Classes import * from .Enums import * from .Errors import * +from .Utils import * from . import Classes from . import Enums from . import Errors +from . 
import Utils

From 7020c9e68366e0adeabc8f09babfea784d0d7019 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:33:12 +0200
Subject: [PATCH 28/75] Added utils to make regexps and iterators that check for last element

---
 .../Libs/BPE/Utils/lag_checker_iterator.py    | 27 +++++++++++++++++++
 .../Libs/BPE/Utils/special_regex_maker.py     |  9 +++++++
 2 files changed, 36 insertions(+)
 create mode 100644 Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
 create mode 100644 Project_Model/Libs/BPE/Utils/special_regex_maker.py

diff --git a/Project_Model/Libs/BPE/Utils/lag_checker_iterator.py b/Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
new file mode 100644
index 0000000..28bbade
--- /dev/null
+++ b/Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
@@ -0,0 +1,27 @@
+from collections import deque
+from typing import Generator, TypeVar
+
+T1 = TypeVar("T1")
+T2 = TypeVar("T2")
+T3 = TypeVar("T3")
+
+
+def iterator_with_checks(
+    generator: Generator[T1, T2, T3],
+) -> Generator[tuple[T1, bool], T2, T3]:
+
+    # We do not catch the first StopIteration here:
+    # if the generator is empty, we propagate it
+    last_element = next(generator)
+
+    while True:
+
+        RETURN_ELEMENT = last_element
+        try:
+            element = next(generator)
+            last_element = element
+            yield (RETURN_ELEMENT, False)
+
+        except StopIteration:
+            yield (RETURN_ELEMENT, True)
+            break

diff --git a/Project_Model/Libs/BPE/Utils/special_regex_maker.py b/Project_Model/Libs/BPE/Utils/special_regex_maker.py
new file mode 100644
index 0000000..414eabf
--- /dev/null
+++ b/Project_Model/Libs/BPE/Utils/special_regex_maker.py
@@ -0,0 +1,9 @@
+import re
+
+
+def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
+
+    REGEX_STR = "|".join(special_tokens)
+
+    return re.compile(REGEX_STR)
+
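A short usage sketch for the iterator utility just added (assuming the Project_Model package is importable from the project root; the toy generator is illustrative):

    from Project_Model.Libs.BPE.Utils import iterator_with_checks

    def numbers():
        yield from (10, 20, 30)

    for value, is_last in iterator_with_checks(numbers()):
        print(value, is_last)
    # 10 False
    # 20 False
    # 30 True

One caveat on special_regex_maker: it joins the tokens with "|" without re.escape, so special tokens containing regex metacharacters would need escaping before being passed in.
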
From c9032cab093820600621a4215ff3ee312ea20752 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:33:28 +0200
Subject: [PATCH 29/75] Added fit method

---
 .../Libs/BPE/Classes/NanoSocratesBPE.py       | 81 +++++++++++++++++--
 1 file changed, 76 insertions(+), 5 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
index 844e860..c7f89ce 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -1,11 +1,16 @@
 from .Encoder import Encoder
-from ..Errors import OutOfDictionaryException
+from ..Errors import OutOfDictionaryException, DuplicateWordException
 
 
 class NanoSocratesBatchMemoryBPE:
 
-    def __init__(self) -> None:
-        pass
+    def __init__(
+        self,
+        frequencies: dict[tuple[int, int], int],
+        merge_treshold: int
+    ) -> None:
+        self.frequencies = frequencies
+        self.merge_treshold = merge_treshold
 
 
 class NanoSocratesBPE(Encoder):
@@ -22,12 +27,66 @@ class NanoSocratesBPE(Encoder):
         for key, value in vocabulary.items():
             if value < 256:
                 raise OutOfDictionaryException()
+            # TODO: check if they are in order
             self.__vocabulary[key] = value
             self.__reverse_vocabulary[value] = key
+
+    @property
+    def vocabulary_size(self):
+        return len(self.__vocabulary) + 255
+
+    @property
+    def vocabulary(self):
+        return self.__vocabulary
+
+    @property
+    def __next_id(self):
+        return self.vocabulary_size + 1
+
     # TODO: implement fit
-    def fit():
-        pass
+    def fit(
+        self,
+        chunk_data: list[int],
+        memory: NanoSocratesBatchMemoryBPE,
+        last_batch: bool
+    ):
+
+        ENCODED_CHUNK = self.__round_encode(chunk_data)
+        DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
+
+        for i in range(0, DATA_LEN_BEFORE_LAST):
+            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1])
+
+            frequency = memory.frequencies.get(CANDIDATE_COUPLE)
+
+            # Initialize frequency
+            if frequency is None:
+                frequency = 0
+                memory.frequencies[CANDIDATE_COUPLE] = 0
+
+            frequency += 1
+            memory.frequencies[CANDIDATE_COUPLE] = frequency
+
+        if not last_batch:
+            return (self, memory, ENCODED_CHUNK)
+
+        if len(memory.frequencies) < 1:
+            return (self, memory, ENCODED_CHUNK)
+
+        FREQUENCIES = memory.frequencies
+        MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
+        FREQUENCY = FREQUENCIES[MAX_COUPLE]
+
+        if FREQUENCY < memory.merge_treshold:
+            return (self, memory, ENCODED_CHUNK)
+
+        self.__learn_word(MAX_COUPLE)
+
+        return (self, memory, ENCODED_CHUNK)
+
+
+
     def encode(self, piece: str) -> list[int]:
@@ -104,3 +163,15 @@ class NanoSocratesBPE(Encoder):
             raise OutOfDictionaryException()
 
         return CANDIDATE_DECODED
+
+    def __learn_word(self, words: tuple[int, int]):
+
+        ID = self.__next_id
+
+        DUPLICATE = self.__vocabulary.get(words)
+
+        if DUPLICATE is not None:
+            raise DuplicateWordException()
+
+        self.__vocabulary[words] = ID
+        self.__reverse_vocabulary[ID] = words

From b09bd4acbaf079aef24a55bb61c399997b8256f2 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Tue, 30 Sep 2025 13:33:40 +0200
Subject: [PATCH 30/75] Created trainer to train BPE

---
 .../Libs/BPE/Classes/NanoSocraTrainer.py      | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py
new file mode 100644
index 0000000..1d6d429
--- /dev/null
+++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py
@@ -0,0 +1,164 @@
+from collections import deque
+from pathlib import Path
+import re
+from ..Classes import NanoSocratesBPE, NanoSocratesChunker, NanoSocratesSplitter, NanoSocratesBatchMemoryBPE
+from ..Enums import TokenType
+from ..Utils import special_regex_maker, iterator_with_checks
+
+
+class NanoSocraTrainer:
+
+    def __init__(
+        self,
+        max_vocabulary: int,
+        special_vocabulary: list[str],
+        chunk_size: int,
+        merge_treshold: int = 0,
+        max_iterations: int = 0,
+    ) -> None:
+        # Bytes
+        BYTE_RESERVED_TOKENS = 256
+        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
+        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
+
+        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
+        self.__max_iterations = max_iterations
+        self.__chunk_size = chunk_size
+        self.__merge_treshold = merge_treshold
+        self.__special_token_regex = special_regex_maker(special_vocabulary)
+
+    def trainBPE(
+        self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None
+    ) -> NanoSocratesBPE:
+
+        if not path.is_file():
+            raise FileNotFoundError()
+
+        if not cache_dir.is_dir():
+            raise NotADirectoryError()
+
+        if bpe is None:
+            bpe = NanoSocratesBPE()
+        BPE = bpe
+
+        if BPE.vocabulary_size > self.__max_vocabulary:
+            return BPE
+
+        exit = False
+        cached = False
+        current_iteration = 0
+
+        PATH_GEN = self.__switch_paths(path, cache_dir)
+
+        input_path = next(PATH_GEN)
+
+        while not exit:
+
+
+            out_path = next(PATH_GEN)
+            current_iteration = self.__increment_counter(current_iteration)
+            LAST_VOC_SIZE = BPE.vocabulary_size
+
+            FILE = open(out_path, "w")
+
+            for _, _, output in self.__round_train(input_path, BPE, cached):
+                FILE.write(output)
+
+            FILE.close()
+
+            cached = True
+            input_path = out_path
+
+            
NEW_VOC_SIZE = BPE.vocabulary_size + + if LAST_VOC_SIZE == NEW_VOC_SIZE: + exit = True + continue + + if current_iteration == self.__max_iterations: + exit = True + continue + + if BPE.vocabulary_size == self.__max_vocabulary: + exit = True + continue + + return BPE + + def __round_train( + self, + path: Path, + bpe: NanoSocratesBPE, + cached: bool + ): + + CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex) + SPLITTER = NanoSocratesSplitter(self.__special_token_regex) + + BPE = bpe + memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold) + + CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path)) + + for chunk, last_chunk in CHUNKER_GENERATOR: + + PIECE_GENERATOR = iterator_with_checks( + SPLITTER.split_text(chunk) + ) + + for piece, last_piece in PIECE_GENERATOR: + + LAST_BATCH = last_chunk and last_piece + PIECE, TOKEN_TYPE = piece + + if TOKEN_TYPE != TokenType.BPE: + _, _, out = BPE.fit([], memory, LAST_BATCH) + yield (BPE, memory, PIECE) + continue + + PIECE_DATA = self.__make_list_ids(PIECE, cached) + + _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH) + + OUT_STRING = f"{out}" + yield (BPE, memory, OUT_STRING) + + def __increment_counter(self, counter: int): + + # What if overflows??? + try: + counter += 1 + except: + print("Integer overflow") + counter = 1 + + return counter + + def __make_list_ids(self, corpus: str, cached: bool): + + if not cached: + return list(map(ord, corpus)) + + REDUCED_CORPUS_LEN = len(corpus) -1 + + # Skip these cars "[" "]" + INTS = corpus[1:REDUCED_CORPUS_LEN] + INT_LIST = list(map(int,INTS.split(","))) + return INT_LIST + + def __switch_paths(self, path: Path, cache_path: Path): + + yield path + + TMP_1 = cache_path / "tmp1.txt" + TMP_2 = cache_path / "tmp2.txt" + + switch = True + + while True: + if switch: + yield TMP_1 + else: + yield TMP_2 + switch = not switch + From ccacea18d8db1709d3a4a48d89dcebbf73b02b87 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 13:33:54 +0200 Subject: [PATCH 31/75] Created files to test BPE training --- Project_Model/Tests/bpe_trainer.py | 42 +++++++++++++++++++ .../Tests/trainer_files/train_simple.txt | 1 + 2 files changed, 43 insertions(+) create mode 100644 Project_Model/Tests/bpe_trainer.py create mode 100644 Project_Model/Tests/trainer_files/train_simple.txt diff --git a/Project_Model/Tests/bpe_trainer.py b/Project_Model/Tests/bpe_trainer.py new file mode 100644 index 0000000..2e1fa08 --- /dev/null +++ b/Project_Model/Tests/bpe_trainer.py @@ -0,0 +1,42 @@ +from pathlib import Path +from Project_Model.Libs.BPE.Enums import TokenType +import Project_Model.Libs.BPE as BPE + +import re + +CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache") + +class TestTrainBPE: + + def test_bpe_train_encoding_simple(self): + + TRAINER = BPE.NanoSocraTrainer( + int(32E3), + ["", ""], + 40 + ) + + TEXT = "abababab" + TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt") + + EXPECTED = [258] + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + BPE_ENCODER = TRAINER.trainBPE( + TEXT_PATH, + CACHE_DIR_PATH + ) + + ENCODED = BPE_ENCODER.encode(TEXT) + + assert len(ENCODED) == len(EXPECTED) + + for encoded, expected in zip(ENCODED, EXPECTED): + assert encoded == expected + +# Useful to debug weird cases +if __name__ == "__main__": + TestTrainBPE().test_bpe_train_encoding_simple() diff --git a/Project_Model/Tests/trainer_files/train_simple.txt b/Project_Model/Tests/trainer_files/train_simple.txt new file 
mode 100644 index 0000000..19f4c70 --- /dev/null +++ b/Project_Model/Tests/trainer_files/train_simple.txt @@ -0,0 +1 @@ +abababab \ No newline at end of file From 89a0a1f4bb4f4cb6cc4ad08fb992ec2b7bd187d2 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:58:31 +0200 Subject: [PATCH 32/75] Fixed bug for utf-8 conversion --- .../Libs/BPE/Classes/NanoSocraTrainer.py | 23 +++++++++++++++++-- .../Libs/BPE/Classes/NanoSocratesBPE.py | 4 ++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py index 1d6d429..0e043c7 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py @@ -15,6 +15,7 @@ class NanoSocraTrainer: chunk_size: int, merge_treshold: int = 0, max_iterations: int = 0, + print_after_iterations: int = 1 ) -> None: # Bytes BYTE_RESERVED_TOKENS = 256 @@ -26,6 +27,7 @@ class NanoSocraTrainer: self.__chunk_size = chunk_size self.__merge_treshold = merge_treshold self.__special_token_regex = special_regex_maker(special_vocabulary) + self.__print_after_iterations = print_after_iterations def trainBPE( self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None @@ -61,7 +63,9 @@ class NanoSocraTrainer: FILE = open(out_path, "w") - for _, _, output in self.__round_train(input_path, BPE, cached): + last_memory = None + for _, memory, output in self.__round_train(input_path, BPE, cached): + last_memory = memory FILE.write(output) FILE.close() @@ -71,6 +75,21 @@ class NanoSocraTrainer: NEW_VOC_SIZE = BPE.vocabulary_size + if current_iteration % self.__print_after_iterations == 0: + DELIMITER = "===============" + + DEBUG = "\n".join([ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tFrequencies:\n{last_memory.frequencies}\n", + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "" + ]) + print(DEBUG) + if LAST_VOC_SIZE == NEW_VOC_SIZE: exit = True continue @@ -137,7 +156,7 @@ class NanoSocraTrainer: def __make_list_ids(self, corpus: str, cached: bool): if not cached: - return list(map(ord, corpus)) + return list(corpus.encode("utf-8")) REDUCED_CORPUS_LEN = len(corpus) -1 diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index c7f89ce..3238522 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -90,7 +90,7 @@ class NanoSocratesBPE(Encoder): def encode(self, piece: str) -> list[int]: - current_piece = list(map(ord, piece)) + current_piece = list(piece.encode("utf-8")) new_piece = self.__round_encode(current_piece) while len(current_piece) != len(new_piece): @@ -128,7 +128,7 @@ class NanoSocratesBPE(Encoder): return NEW_PIECE - # TODO: decode + # TODO: Remake decode to take a list of token IDs def decode(self, token_id: int) -> str: token_stack: list[int] = [token_id] From 76f24d4eb0da854470001a7f7768e1b5d1b9d662 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:58:43 +0200 Subject: [PATCH 33/75] Renamed file --- Project_Model/Tests/{bpe_trainer.py => bpe_trainer_test.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Project_Model/Tests/{bpe_trainer.py => bpe_trainer_test.py} (100%) diff --git a/Project_Model/Tests/bpe_trainer.py b/Project_Model/Tests/bpe_trainer_test.py 
similarity index 100% rename from Project_Model/Tests/bpe_trainer.py rename to Project_Model/Tests/bpe_trainer_test.py From 30c2938d29def625bb5c1afd064644698424fd27 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:58:54 +0200 Subject: [PATCH 34/75] Fixed typing --- Scripts/Libs/CleaningPipeline/special_token.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Scripts/Libs/CleaningPipeline/special_token.py b/Scripts/Libs/CleaningPipeline/special_token.py index 644ad71..3f25a2d 100644 --- a/Scripts/Libs/CleaningPipeline/special_token.py +++ b/Scripts/Libs/CleaningPipeline/special_token.py @@ -1,6 +1,7 @@ from enum import Enum -class SpecialToken(str, Enum): + +class SpecialToken(Enum): # (Enum, str) -> throws an error START_TRIPLE_LIST = "" START_TRIPLE = "" @@ -17,5 +18,4 @@ class SpecialToken(str, Enum): CONTINUE_RDF = "" MASK = "" - #BPE Training: - \ No newline at end of file + # BPE Training: From 7ab9b0358e6ba9686d5c46d79e6e8767b2e26098 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 30 Sep 2025 23:59:09 +0200 Subject: [PATCH 35/75] Added script to run BPE --- Scripts/Training/bpe_trainer.py | 100 ++++++++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 Scripts/Training/bpe_trainer.py diff --git a/Scripts/Training/bpe_trainer.py b/Scripts/Training/bpe_trainer.py new file mode 100644 index 0000000..759f397 --- /dev/null +++ b/Scripts/Training/bpe_trainer.py @@ -0,0 +1,100 @@ +import argparse +import json +from pathlib import Path +import sys +# TODO: make relative imports +import Project_Model.Libs.BPE as BPE +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +DEFAULT_CHUNK_SIZE = int(18e4) +DEFAULT_DEBUG_AFTER_ITER = 1 +DEFAULT_MAX_VOCABULARY = int(32E3) +DEFAULT_MERGE_TRESHOLD = 1 +DEFAULT_MAX_ITERATIONS = 0 +TOKEN_LIST = [token.value for token in SpecialToken] + + +class ProgramArgs: + + def __init__( + self, + input_file: str, + cache_dir: str, + output_file: str, + max_vocabulary: int, + max_iterations: int, + merge_treshold: int, + chunk_size: int, + debug_after: int, + ) -> None: + self.input_file = input_file + self.cache_dir = cache_dir + self.output_file = output_file + self.max_vocabulary = max_vocabulary + self.max_iterations = max_iterations + self.merge_treshold = merge_treshold + self.chunk_size = chunk_size + self.debug_after = debug_after + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) + PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str) + PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) + PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) + PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) + PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int) + PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int) + + parsed_args, _ = PARSER.parse_known_args(args) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.cache_dir, + parsed_args.output_file, + parsed_args.max_vocabulary, + parsed_args.max_iterations, + parsed_args.merge_treshold, + parsed_args.chunk_size, + 
parsed_args.debug_after, + ) # type: ignore + + def train(args: ProgramArgs): + + TRAINER = BPE.NanoSocraTrainer( + args.max_vocabulary, + TOKEN_LIST, + args.chunk_size, + args.merge_treshold, + args.max_iterations, + args.debug_after + ) + + DATASET_PATH = Path(args.input_file) + CACHE_DIR = Path(args.cache_dir) + VOCABULARY_PATH = Path(args.output_file) + + print(f"Training BPE") + + BPE_ENCODER = TRAINER.trainBPE( + DATASET_PATH, + CACHE_DIR + ) + + VOCABULARY = BPE_ENCODER.vocabulary + VOCABULARY_JSON = json.dumps(VOCABULARY) + + print(f"Saving Vocabulary in {VOCABULARY_PATH}") + + FILE = open(VOCABULARY_PATH, "w") + FILE.write(VOCABULARY_JSON) + FILE.close() + + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + train(ARGS) From 9a8e726d745ca5e7af26ad43fef5ade81f31dc91 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:22:22 +0200 Subject: [PATCH 36/75] Added debug configuration --- .vscode/launch.json | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 .vscode/launch.json diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..e0a93b9 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,16 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": "${command:pickArgs}" + } + ] +} \ No newline at end of file From 97bac464f3819ba8bfac7a91d6bec2370a4cb1d7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 00:32:43 +0200 Subject: [PATCH 37/75] Fixed JSON incompatibility --- Scripts/Training/bpe_trainer.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Scripts/Training/bpe_trainer.py b/Scripts/Training/bpe_trainer.py index 759f397..904bfbf 100644 --- a/Scripts/Training/bpe_trainer.py +++ b/Scripts/Training/bpe_trainer.py @@ -86,7 +86,14 @@ def train(args: ProgramArgs): ) VOCABULARY = BPE_ENCODER.vocabulary - VOCABULARY_JSON = json.dumps(VOCABULARY) + + JSON_VOCABULARY: dict[str, int]= {} + + for key, item in VOCABULARY.items(): + TUPLE_STR = f"{key}" + JSON_VOCABULARY[TUPLE_STR] = item + + VOCABULARY_JSON = json.dumps(JSON_VOCABULARY) print(f"Saving Vocabulary in {VOCABULARY_PATH}") From dbf1d99408186b8b1705a6a124f9ecbf6334ebe7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:20:59 +0200 Subject: [PATCH 38/75] Added json utils to save and load json files --- Project_Model/Libs/BPE/Utils/json_utils.py | 18 ++++++++ Project_Model/Libs/BPE/Utils/vocabulary.py | 49 ++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 Project_Model/Libs/BPE/Utils/json_utils.py create mode 100644 Project_Model/Libs/BPE/Utils/vocabulary.py diff --git a/Project_Model/Libs/BPE/Utils/json_utils.py b/Project_Model/Libs/BPE/Utils/json_utils.py new file mode 100644 index 0000000..716e93a --- /dev/null +++ b/Project_Model/Libs/BPE/Utils/json_utils.py @@ -0,0 +1,18 @@ +import json +from pathlib import Path + + +def save_json(vocabulary: dict, path: Path): + + json_string = json.dumps(vocabulary) + FILE = open(path, "w") + FILE.write(json_string) + FILE.close() + +
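# ---------------------------------------------------------------------
# (Illustration only, not part of this patch.) json.dumps cannot
# serialize tuple keys, so a BPE vocabulary keyed by (int, int) pairs
# cannot be dumped as-is; the vocabulary helpers that follow stringify
# each tuple first. A minimal sketch of that round-trip, assuming a
# vocabulary like {(97, 98): 256}:
#
#     import json
#     voc = {(97, 98): 256}
#     dumped = json.dumps({f"{key}": item for key, item in voc.items()})
#     # dumped == '{"(97, 98)": 256}'
#     restored = {
#         tuple(map(int, key[1:-1].split(","))): item
#         for key, item in json.loads(dumped).items()
#     }
#     assert restored == voc
# ---------------------------------------------------------------------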
+def load_json(path: Path) -> dict: + FILE = open(path, "r") + json_string = FILE.read() + FILE.close() + + return json.loads(json_string) diff --git a/Project_Model/Libs/BPE/Utils/vocabulary.py b/Project_Model/Libs/BPE/Utils/vocabulary.py new file mode 100644 index 0000000..fa245d5 --- /dev/null +++ b/Project_Model/Libs/BPE/Utils/vocabulary.py @@ -0,0 +1,49 @@ +import json +from pathlib import Path +from ..Errors import OutOfDictionaryException + + +def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str: + + JSON: dict[str, int] = {} + + for key, item in vocabulary.items(): + TUPLE_STR = f"{key}" + JSON[TUPLE_STR] = item + + return json.dumps(JSON) + + +def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]: + + JSON: dict[str, int] = json.loads(json_string) + VOCABULARY: dict[tuple[int, int], int] = {} + + for key, item in JSON.items(): + REDUCED_KEY = len(key) - 1 + KEY_STR = key[1:REDUCED_KEY] + VOC_KEY = tuple(map(int, KEY_STR.split(","))) + + if len(VOC_KEY) != 2: + raise OutOfDictionaryException() + + # Checked for weird things above + VOCABULARY[VOC_KEY] = item # type: ignore + + return VOCABULARY + + +def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path): + + json_string = nanos_vocabulary2json_str(vocabulary) + FILE = open(path, "w") + FILE.write(json_string) + FILE.close() + + +def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]: + FILE = open(path, "r") + json_string = FILE.read() + FILE.close() + + return nanos_json_str2vocabulary(json_string) From 66bcf6e55fe0e94f53fb63e187fa35f225beb299 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:21:42 +0200 Subject: [PATCH 39/75] Added a way to recover iteration work --- .../Libs/BPE/Classes/NanoSocraTrainer.py | 135 +++++++++++++----- 1 file changed, 100 insertions(+), 35 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py index 0e043c7..9dfe776 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py @@ -1,9 +1,22 @@ from collections import deque +import datetime from pathlib import Path import re -from ..Classes import NanoSocratesBPE, NanoSocratesChunker, NanoSocratesSplitter, NanoSocratesBatchMemoryBPE +from ..Classes import ( + NanoSocratesBPE, + NanoSocratesChunker, + NanoSocratesSplitter, + NanoSocratesBatchMemoryBPE, +) from ..Enums import TokenType -from ..Utils import special_regex_maker, iterator_with_checks +from ..Utils import ( + special_regex_maker, + iterator_with_checks, + save_nanos_vocabulary, + load_nanos_vocabulary, + save_json, + load_json, +) class NanoSocraTrainer: @@ -15,7 +28,7 @@ class NanoSocraTrainer: chunk_size: int, merge_treshold: int = 0, max_iterations: int = 0, - print_after_iterations: int = 1 + print_after_iterations: int = 1, ) -> None: # Bytes BYTE_RESERVED_TOKENS = 256 @@ -30,7 +43,11 @@ class NanoSocraTrainer: self.__print_after_iterations = print_after_iterations def trainBPE( - self, path: Path, cache_dir: Path, bpe: NanoSocratesBPE | None = None + self, + path: Path, + cache_dir: Path, + bpe: NanoSocratesBPE | None = None, + resume_from_iter: int = 0, ) -> NanoSocratesBPE: if not path.is_file(): @@ -49,45 +66,76 @@ class NanoSocraTrainer: exit = False cached = False current_iteration = 0 + input_path = path - PATH_GEN = self.__switch_paths(path, cache_dir) + NEXT_ITERATION =
resume_from_iter + 1 if resume_from_iter != 0 else 0 - input_path = next(PATH_GEN) + PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION) + MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter) + + if resume_from_iter != 0: + cached = True + current_iteration = resume_from_iter + input_path = next(PATH_GEN) + # UGLY: fixes a bug immediately, unfortunately + _, _ = next(MEMORY_PATH_GEN) + _, voc_cache_path = next(MEMORY_PATH_GEN) + vocabulary = load_nanos_vocabulary(voc_cache_path) + BPE = NanoSocratesBPE(vocabulary) while not exit: - out_path = next(PATH_GEN) + internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN) + current_iteration = self.__increment_counter(current_iteration) LAST_VOC_SIZE = BPE.vocabulary_size FILE = open(out_path, "w") last_memory = None + for _, memory, output in self.__round_train(input_path, BPE, cached): last_memory = memory FILE.write(output) FILE.close() + internal_cache = { + "finished_iter": current_iteration, + "read_from": f"{input_path}", + "wrote_to": f"{out_path}", + "at": datetime.datetime.now(datetime.timezone.utc).strftime( + "%Y-%m-%d %H:%M:%S.%f" + )[:-3], + } + + VOCABULARY = BPE.vocabulary + + save_json(internal_cache, internal_cache_path) + save_nanos_vocabulary(VOCABULARY, vocabulary_cache) + cached = True input_path = out_path NEW_VOC_SIZE = BPE.vocabulary_size if current_iteration % self.__print_after_iterations == 0: + DELIMITER = "===============" - DEBUG = "\n".join([ - DELIMITER, - f"ITERATION: {current_iteration}", - DELIMITER, - f"\tVocabulary size: {BPE.vocabulary_size}\n", - f"\tFrequencies:\n{last_memory.frequencies}\n", - f"\tvocabulary:\n{BPE.vocabulary}", - DELIMITER, - "" - ]) + DEBUG = "\n".join( + [ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None) + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "", + ] + ) print(DEBUG) if LAST_VOC_SIZE == NEW_VOC_SIZE: @@ -104,12 +152,7 @@ class NanoSocraTrainer: return BPE - def __round_train( - self, - path: Path, - bpe: NanoSocratesBPE, - cached: bool - ): + def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool): CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex) SPLITTER = NanoSocratesSplitter(self.__special_token_regex) @@ -121,9 +164,7 @@ class NanoSocraTrainer: for chunk, last_chunk in CHUNKER_GENERATOR: - PIECE_GENERATOR = iterator_with_checks( - SPLITTER.split_text(chunk) - ) + PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk)) for piece, last_piece in PIECE_GENERATOR: @@ -158,26 +199,50 @@ class NanoSocraTrainer: if not cached: return list(corpus.encode("utf-8")) - REDUCED_CORPUS_LEN = len(corpus) -1 + REDUCED_CORPUS_LEN = len(corpus) - 1 # Skip these chars: "[" "]" INTS = corpus[1:REDUCED_CORPUS_LEN] - INT_LIST = list(map(int,INTS.split(","))) + INT_LIST = list(map(int, INTS.split(","))) return INT_LIST - def __switch_paths(self, path: Path, cache_path: Path): + def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int): - yield path - - TMP_1 = cache_path / "tmp1.txt" - TMP_2 = cache_path / "tmp2.txt" + CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt" + CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt" switch = True + if initial_iteration % 2 == 1: + switch = False + + del initial_iteration + while True: if switch: - yield TMP_1 + yield CORPUS_TMP_1 else: - yield TMP_2 + yield CORPUS_TMP_2 switch = not switch + def 
__switch_memory(self, cache_path: Path, initial_iteration: int): + + INTERNAL_TMP_1 = cache_path / "internal-tmp1.json" + INTERNAL_TMP_2 = cache_path / "internal-tmp2.json" + + VOCAB_TMP_1 = cache_path / "voc-tmp1.json" + VOCAB_TMP_2 = cache_path / "voc-tmp2.json" + + switch = False + + if initial_iteration % 2 == 1: + switch = True + + del initial_iteration + + while True: + if switch: + yield (INTERNAL_TMP_1, VOCAB_TMP_1) + else: + yield (INTERNAL_TMP_2, VOCAB_TMP_2) + switch = not switch From b3d444979fdf8d514ca9ca8fcf9892117240dd8a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 12:22:09 +0200 Subject: [PATCH 40/75] Added flag to resume work correctly --- Scripts/Training/bpe_trainer.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/Scripts/Training/bpe_trainer.py b/Scripts/Training/bpe_trainer.py index 904bfbf..bc8916e 100644 --- a/Scripts/Training/bpe_trainer.py +++ b/Scripts/Training/bpe_trainer.py @@ -21,6 +21,7 @@ class ProgramArgs: input_file: str, cache_dir: str, output_file: str, + resume_at: int, max_vocabulary: int, max_iterations: int, merge_treshold: int, @@ -30,6 +31,7 @@ class ProgramArgs: self.input_file = input_file self.cache_dir = cache_dir self.output_file = output_file + self.resume_at = resume_at self.max_vocabulary = max_vocabulary self.max_iterations = max_iterations self.merge_treshold = merge_treshold @@ -43,6 +45,7 @@ def get_args(args: list[str]) -> ProgramArgs: PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str) PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int) PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) @@ -55,6 +58,7 @@ def get_args(args: list[str]) -> ProgramArgs: parsed_args.input_file, parsed_args.cache_dir, parsed_args.output_file, + parsed_args.resume_at, parsed_args.max_vocabulary, parsed_args.max_iterations, parsed_args.merge_treshold, @@ -82,25 +86,15 @@ def train(args: ProgramArgs): BPE_ENCODER = TRAINER.trainBPE( DATASET_PATH, - CACHE_DIR + CACHE_DIR, + resume_from_iter=args.resume_at ) VOCABULARY = BPE_ENCODER.vocabulary - JSON_VOCABULARY: dict[str, int]= {} - - for key, item in VOCABULARY.items(): - TUPLE_STR = f"{key}" - JSON_VOCABULARY[TUPLE_STR] = item - - VOCABULARY_JSON = json.dumps(JSON_VOCABULARY) - print(f"Saving Vocabulary in {VOCABULARY_PATH}") - FILE = open(VOCABULARY_PATH, "w") - FILE.write(VOCABULARY_JSON) - FILE.close() - + BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH) if __name__ == "__main__": ARGS = get_args(sys.argv) From fbbe6226bb09fb1b6b408f16763fb59c8b620b1d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 18:56:53 +0200 Subject: [PATCH 41/75] Finished uploading stubs for TokeNano --- .../Libs/BPE/Classes/NanoSocraTraineRam.py | 153 ++++++++++++++++++ Project_Model/Libs/BPE/Classes/TokeNano.py | 0 .../Libs/BPE/Classes/TokeNanoCore.py | 0 Project_Model/Libs/BPE/Classes/__init__.py | 4 +- Project_Model/Libs/BPE/Utils/__init__.py | 7 +- Scripts/Training/bpe_trainer_ram.py | 84 ++++++++++ 6 files changed, 246 
insertions(+), 2 deletions(-) create mode 100644 Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py create mode 100644 Project_Model/Libs/BPE/Classes/TokeNano.py create mode 100644 Project_Model/Libs/BPE/Classes/TokeNanoCore.py create mode 100644 Scripts/Training/bpe_trainer_ram.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py new file mode 100644 index 0000000..9c4f444 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py @@ -0,0 +1,153 @@ +from collections import deque +import datetime +from pathlib import Path +import re +from ..Classes import ( + NanoSocratesBPE, + NanoSocratesChunker, + NanoSocratesSplitter, + NanoSocratesBatchMemoryBPE, +) +from ..Enums import TokenType +from ..Utils import ( + special_regex_maker, + iterator_with_checks, + save_nanos_vocabulary, + load_nanos_vocabulary, + save_json, + load_json, +) + + +class NanoSocraTraineRam: + + def __init__( + self, + max_vocabulary: int, + special_vocabulary: list[str], + merge_treshold: int = 0, + max_iterations: int = 0, + print_after_iterations: int = 1, + ) -> None: + # Bytes + BYTE_RESERVED_TOKENS = 256 + SPECIAL_RESERVED_TOKENS = len(special_vocabulary) + RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS + + self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS + self.__max_iterations = max_iterations + self.__merge_treshold = merge_treshold + self.__special_token_regex = special_regex_maker(special_vocabulary) + self.__print_after_iterations = print_after_iterations + + def trainBPE( + self, + path: Path, + bpe: NanoSocratesBPE | None = None, + ) -> NanoSocratesBPE: + + if not path.is_file(): + raise FileNotFoundError() + + if bpe is None: + bpe = NanoSocratesBPE() + BPE = bpe + + if BPE.vocabulary_size > self.__max_vocabulary: + return BPE + + exit = False + current_iteration = 0 + data = self.__gather_data_from_file(path) + + while not exit: + + current_iteration = self.__increment_counter(current_iteration) + + LAST_VOC_SIZE = BPE.vocabulary_size + + last_memory = None + + _, data, last_memory = self.__round_train(BPE, data) + + NEW_VOC_SIZE = BPE.vocabulary_size + + if current_iteration % self.__print_after_iterations == 0: + + DELIMITER = "===============" + + DEBUG = "\n".join( + [ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None) + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "", + ] + ) + print(DEBUG) + + if LAST_VOC_SIZE == NEW_VOC_SIZE: + exit = True + continue + + if current_iteration == self.__max_iterations: + exit = True + continue + + if BPE.vocabulary_size == self.__max_vocabulary: + exit = True + continue + + return BPE + + def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + DATA_LEN = len(data) + + memory = NanoSocratesBatchMemoryBPE({}, 0) + for piece, index in zip(data, range(0, DATA_LEN)): + + last_batch = index == DATA_LEN - 1 + + bpe, memory, output = bpe.fit(piece, memory, last_batch) + + data[index] = output + + return (bpe, data, memory) + + def __gather_data_from_file(self, path: Path) -> list[list[int]]: + + SPLITTER = NanoSocratesSplitter(self.__special_token_regex) + + DATA: list[list[int]] = [] + + FILE = open(path, "r", encoding="utf-8") + file_string = FILE.read() + FILE.close() + + for piece, type in SPLITTER.split_text(file_string): + + if type != TokenType.BPE: + continue + + int_list = 
self.__make_list_ids(piece) + DATA.append(int_list) + + return DATA + + def __increment_counter(self, counter: int): + + # What if overflows??? + try: + counter += 1 + except: + print("Integer overflow") + counter = 1 + + return counter + + def __make_list_ids(self, corpus: str): + return list(corpus.encode("utf-8")) diff --git a/Project_Model/Libs/BPE/Classes/TokeNano.py b/Project_Model/Libs/BPE/Classes/TokeNano.py new file mode 100644 index 0000000..e69de29 diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py new file mode 100644 index 0000000..e69de29 diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index a52b024..32e958a 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -2,10 +2,12 @@ from .NanoSocratesChunker import NanoSocratesChunker from .NanoSocratesSplitter import NanoSocratesSplitter from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE from .NanoSocraTrainer import NanoSocraTrainer +from .NanoSocraTraineRam import NanoSocraTraineRam __all__ = [ "NanoSocratesChunker", "NanoSocratesSplitter", "NanoSocratesBPE", - "NanoSocraTrainer" + "NanoSocraTrainer", + "NanoSocraTraineRam" ] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Utils/__init__.py b/Project_Model/Libs/BPE/Utils/__init__.py index f2320fa..3eb9eb3 100644 --- a/Project_Model/Libs/BPE/Utils/__init__.py +++ b/Project_Model/Libs/BPE/Utils/__init__.py @@ -1,7 +1,12 @@ from .special_regex_maker import special_regex_maker from .lag_checker_iterator import iterator_with_checks +from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary +from .json_utils import save_json, load_json __all__ = [ "special_regex_maker", - "iterator_with_checks" + "iterator_with_checks", + "save_nanos_vocabulary", + "load_nanos_vocabulary", + "save_json", "load_json" ] \ No newline at end of file diff --git a/Scripts/Training/bpe_trainer_ram.py b/Scripts/Training/bpe_trainer_ram.py new file mode 100644 index 0000000..14ce0bb --- /dev/null +++ b/Scripts/Training/bpe_trainer_ram.py @@ -0,0 +1,84 @@ +import argparse +import json +from pathlib import Path +import sys +# TODO: make relative imports +import Project_Model.Libs.BPE as BPE +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +DEFAULT_DEBUG_AFTER_ITER = 1 +DEFAULT_MAX_VOCABULARY = int(32E3) +DEFAULT_MERGE_TRESHOLD = 1 +DEFAULT_MAX_ITERATIONS = 0 +TOKEN_LIST = [token.value for token in SpecialToken] + + +class ProgramArgs: + + def __init__( + self, + input_file: str, + output_file: str, + max_vocabulary: int, + max_iterations: int, + merge_treshold: int, + debug_after: int, + ) -> None: + self.input_file = input_file + self.output_file = output_file + self.max_vocabulary = max_vocabulary + self.max_iterations = max_iterations + self.merge_treshold = merge_treshold + self.debug_after = debug_after + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) + PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) + PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) + PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) + PARSER.add_argument("--debug-after", 
default=DEFAULT_DEBUG_AFTER_ITER, type=int) + + parsed_args, _ = PARSER.parse_known_args(args) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.output_file, + parsed_args.max_vocabulary, + parsed_args.max_iterations, + parsed_args.merge_treshold, + parsed_args.debug_after, + ) # type: ignore + + +def train(args: ProgramArgs): + + TRAINER = BPE.NanoSocraTraineRam( + args.max_vocabulary, + TOKEN_LIST, + args.merge_treshold, + args.max_iterations, + args.debug_after + ) + + DATASET_PATH = Path(args.input_file) + VOCABULARY_PATH = Path(args.output_file) + + print(f"Training BPE") + + BPE_ENCODER = TRAINER.trainBPE( + DATASET_PATH + ) + + VOCABULARY = BPE_ENCODER.vocabulary + + print(f"Saving Vocabulary in {VOCABULARY_PATH}") + + BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH) + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + train(ARGS) From 7cfaf601b411ea4e6ad5c929793f5aad7b8b127a Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Wed, 1 Oct 2025 19:42:22 +0200 Subject: [PATCH 42/75] Refactored to remove tokens that can't be compressed anymore --- .../Libs/BPE/Classes/NanoSocraTraineRam.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py index 9c4f444..aca820e 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py @@ -105,18 +105,29 @@ class NanoSocraTraineRam: return BPE def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + DATA_LEN = len(data) + NEW_DATA = [] + counter = 0 memory = NanoSocratesBatchMemoryBPE({}, 0) - for piece, index in zip(data, range(0, DATA_LEN)): + while len(data) > 0: + counter += 1 + last_batch = len(data) == 1 - last_batch = index == DATA_LEN - 1 + piece = data.pop() bpe, memory, output = bpe.fit(piece, memory, last_batch) - data[index] = output + if counter % int(1E6) == 0: + print(f"Fitted: {counter}/{DATA_LEN}") - return (bpe, data, memory) + if len(output) < 2: + continue + + NEW_DATA.append(output) + + return (bpe, NEW_DATA, memory) def __gather_data_from_file(self, path: Path) -> list[list[int]]: From b80b4e4112226e32a5c8eeb2d5d3b91f5ded35a5 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:29:57 +0200 Subject: [PATCH 43/75] Fixed return type hints --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 3238522..4245936 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -105,7 +105,7 @@ class NanoSocratesBPE(Encoder): return piece PIECE_LENGTH = len(piece) - 1 - NEW_PIECE = [] + NEW_PIECE : list[int]= [] index = 0 while index < PIECE_LENGTH: From 63baf29805cd62... wait
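The patch below fans a single fitting round out over a multiprocessing.Pool and folds the per-worker results back together. A minimal sketch of that fan-out/fan-in pattern, with a toy pair-counting job standing in for NanoSocratesBPE.fit (all names here are illustrative, not the patch's API):

from multiprocessing import Pool

def count_pairs(chunk: list[list[int]]) -> dict[tuple[int, int], int]:
    # Toy stand-in for one worker's fit pass: count adjacent ID pairs.
    freqs: dict[tuple[int, int], int] = {}
    for piece in chunk:
        for pair in zip(piece, piece[1:]):
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

def parallel_round(data: list[list[int]], workers: int) -> dict[tuple[int, int], int]:
    # Fan out: one contiguous slice of pieces per worker, like the patch's split().
    k, m = divmod(len(data), workers)
    chunks = [data[i * k + min(i, m):(i + 1) * k + min(i + 1, m)] for i in range(workers)]
    merged: dict[tuple[int, int], int] = {}
    with Pool(workers) as pool:
        # Fan in: sum the per-worker counts into one frequency table.
        for partial in pool.map(count_pairs, chunks):
            for key, value in partial.items():
                merged[key] = merged.get(key, 0) + value
    return merged

if __name__ == "__main__":
    print(parallel_round([[97, 98, 97], [97, 98]], 2))  # {(97, 98): 2, (98, 97): 1}

Because each chunk holds whole pieces, no pair is lost at a worker boundary. Note that summing the partial frequencies on the way back in matters: overwriting them instead is exactly the bug that patch 46 later fixes.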
diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py new file mode 100644 index 0000000..167b433 --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -0,0 +1,219 @@ +from collections import deque +import datetime +import itertools +from multiprocessing import Pool +import os +from pathlib import Path +import re +from ..Classes import ( + NanoSocratesBPE, + NanoSocratesChunker, + NanoSocratesSplitter, + NanoSocratesBatchMemoryBPE, +) +from ..Enums import TokenType +from ..Utils import ( + special_regex_maker, + iterator_with_checks, + save_nanos_vocabulary, + load_nanos_vocabulary, + save_json, + load_json, +) + +def split(a, n): + k, m = divmod(len(a), n) + return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + +def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): + + bpe, data = object + + NEW_DATA: list[list[int]]= [] + + memory = NanoSocratesBatchMemoryBPE({}, 0) + + while len(data) > 0: + + piece = data.pop() + + bpe, memory, output = bpe.fit(piece, memory, False) + + if len(output) < 2: + continue + + # We are sure of its type + NEW_DATA.append(output) # type: ignore + + return (bpe, NEW_DATA, memory) + + +class NanoSocraTrainerPool: + + def __init__( + self, + max_vocabulary: int, + special_vocabulary: list[str], + merge_treshold: int = 0, + max_iterations: int = 0, + print_after_iterations: int = 1, + ) -> None: + # Bytes + BYTE_RESERVED_TOKENS = 256 + SPECIAL_RESERVED_TOKENS = len(special_vocabulary) + RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS + + self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS + self.__max_iterations = max_iterations + self.__merge_treshold = merge_treshold + self.__special_token_regex = special_regex_maker(special_vocabulary) + self.__print_after_iterations = print_after_iterations + + # TODO: add a resume function + def trainBPE( + self, + path: Path, + cache_file: Path, + bpe: NanoSocratesBPE | None = None, + ) -> NanoSocratesBPE: + + if not path.is_file(): + raise FileNotFoundError() + + if not cache_file.is_file(): + file = cache_file.open("w") + file.close() + + if bpe is None: + bpe = NanoSocratesBPE() + BPE = bpe + + if BPE.vocabulary_size > self.__max_vocabulary: + return BPE + + exit = False + current_iteration = 0 + data = self.__gather_data_from_file(path) + + while not exit: + + current_iteration = self.__increment_counter(current_iteration) + + LAST_VOC_SIZE = BPE.vocabulary_size + + last_memory = None + + _, data, last_memory = self.__round_train(BPE, data) + + NEW_VOC_SIZE = BPE.vocabulary_size + + VOCABULARY = BPE.vocabulary + + save_nanos_vocabulary(VOCABULARY, cache_file) + + if current_iteration % self.__print_after_iterations == 0: + + DELIMITER = "===============" + + DEBUG = "\n".join( + [ + DELIMITER, + f"ITERATION: {current_iteration}", + DELIMITER, + f"\tVocabulary size: {BPE.vocabulary_size}\n", + f"\tvocabulary:\n{BPE.vocabulary}", + DELIMITER, + "", + ] + ) + print(DEBUG) + + if LAST_VOC_SIZE == NEW_VOC_SIZE: + exit = True + continue + + if current_iteration == self.__max_iterations: + exit = True + continue + + if BPE.vocabulary_size == self.__max_vocabulary: + exit = True + continue + + return BPE + + def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): + + NEW_DATA : list[list[int]] = [] + + MEMORY = NanoSocratesBatchMemoryBPE({}, 0) + + fit_funct = split_fit + CPU_COUNT = os.process_cpu_count() + + if CPU_COUNT is None: + raise Exception() + + VOCABULARY = 
bpe.vocabulary + + data_chunks = split(data, CPU_COUNT) + JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] + + JOB_RESULTS: list[tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]] + + with Pool() as pool: + JOB_RESULTS = pool.map(fit_funct, JOBS) + + for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS): + _, job_output, job_memory = res + NEW_DATA.extend(job_output) + + for key, value in job_memory.frequencies.items(): + MEMORY.frequencies[key] = value + + del job_output + del job_memory + + print(f"Joined {i + 1} out of {CPU_COUNT}") + + + # Get new token + bpe.fit([], MEMORY, True) + + print(f"Sentences from {len(data)} to {len(NEW_DATA)}") + + return (bpe, NEW_DATA, MEMORY) + + def __gather_data_from_file(self, path: Path) -> list[list[int]]: + + SPLITTER = NanoSocratesSplitter(self.__special_token_regex) + + DATA: list[list[int]] = [] + + FILE = open(path, "r", encoding="utf-8") + file_string = FILE.read() + FILE.close() + + for piece, type in SPLITTER.split_text(file_string): + + if type != TokenType.BPE: + continue + + int_list = self.__make_list_ids(piece) + DATA.append(int_list) + + return DATA + + def __increment_counter(self, counter: int): + + # What if overflows??? + try: + counter += 1 + except: + print("Integer overflow") + counter = 1 + + return counter + + def __make_list_ids(self, corpus: str): + return list(corpus.encode("utf-8")) diff --git a/Scripts/Training/bpe_trainer_pool.py b/Scripts/Training/bpe_trainer_pool.py new file mode 100644 index 0000000..5c7ab6e --- /dev/null +++ b/Scripts/Training/bpe_trainer_pool.py @@ -0,0 +1,90 @@ +import argparse +import json +from pathlib import Path +import sys +# TODO: make relative imports +import Project_Model.Libs.BPE as BPE +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +DEFAULT_DEBUG_AFTER_ITER = 1 +DEFAULT_MAX_VOCABULARY = int(32E3) +DEFAULT_MERGE_TRESHOLD = 1 +DEFAULT_MAX_ITERATIONS = 0 +TOKEN_LIST = [token.value for token in SpecialToken] + + +class ProgramArgs: + + def __init__( + self, + input_file: str, + output_file: str, + cache_file: str, + max_vocabulary: int, + max_iterations: int, + merge_treshold: int, + debug_after: int, + ) -> None: + self.input_file = input_file + self.output_file = output_file + self.cache_file = cache_file + self.max_vocabulary = max_vocabulary + self.max_iterations = max_iterations + self.merge_treshold = merge_treshold + self.debug_after = debug_after + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str) + PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str) + PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str) + PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int) + PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int) + PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int) + PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int) + + parsed_args, _ = PARSER.parse_known_args(args) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.output_file, + parsed_args.cache_file, + parsed_args.max_vocabulary, + parsed_args.max_iterations, + parsed_args.merge_treshold, + parsed_args.debug_after, + ) # type: ignore + + +def train(args: ProgramArgs): + + TRAINER = BPE.NanoSocraTrainerPool( + args.max_vocabulary, 
TOKEN_LIST, + args.merge_treshold, + args.max_iterations, + args.debug_after + ) + + DATASET_PATH = Path(args.input_file) + VOCABULARY_PATH = Path(args.output_file) + CACHE_PATH = Path(args.cache_file) + + print(f"Training BPE") + + BPE_ENCODER = TRAINER.trainBPE( + DATASET_PATH, + CACHE_PATH + ) + + VOCABULARY = BPE_ENCODER.vocabulary + + print(f"Saving Vocabulary in {VOCABULARY_PATH}") + + BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH) + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + train(ARGS) From d19426fa625023ee56e9f077b409f0cbad3e0ef8 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:31:05 +0200 Subject: [PATCH 45/75] added multithreaded training to package --- Project_Model/Libs/BPE/Classes/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index 32e958a..d3b93b6 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -3,11 +3,13 @@ from .NanoSocratesSplitter import NanoSocratesSplitter from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE from .NanoSocraTrainer import NanoSocraTrainer from .NanoSocraTraineRam import NanoSocraTraineRam +from .NanoSocraTrainerPool import NanoSocraTrainerPool __all__ = [ "NanoSocratesChunker", "NanoSocratesSplitter", "NanoSocratesBPE", "NanoSocraTrainer", - "NanoSocraTraineRam" + "NanoSocraTraineRam", + "NanoSocraTrainerPool" ] \ No newline at end of file From 3fe4e45ceb32c842d782e2347cbc0b03f09362ef Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 01:50:37 +0200 Subject: [PATCH 46/75] Fixed a bug while joining frequencies --- .../Libs/BPE/Classes/NanoSocraTrainerPool.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 167b433..74a596f 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -21,15 +21,17 @@ from ..Utils import ( load_json, ) + def split(a, n): k, m = divmod(len(a), n) - return (a[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(n)) + return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n)) + def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): bpe, data = object - NEW_DATA: list[list[int]]= [] + NEW_DATA: list[list[int]] = [] memory = NanoSocratesBatchMemoryBPE({}, 0) @@ -144,7 +146,7 @@ class NanoSocraTrainerPool: def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]): - NEW_DATA : list[list[int]] = [] + NEW_DATA: list[list[int]] = [] MEMORY = NanoSocratesBatchMemoryBPE({}, 0) @@ -159,7 +161,9 @@ class NanoSocraTrainerPool: data_chunks = split(data, CPU_COUNT) JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] - JOB_RESULTS: list[tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]] + JOB_RESULTS: list[ + tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE] + ] with Pool() as pool: JOB_RESULTS = pool.map(fit_funct, JOBS) @@ -169,14 +173,20 @@ class NanoSocraTrainerPool: NEW_DATA.extend(job_output) for key, value in job_memory.frequencies.items(): - MEMORY.frequencies[key] = value + frequency = MEMORY.frequencies.get(key) + + if frequency is None: + frequency = 0 + 
MEMORY.frequencies[key] = 0 + + frequency += value + MEMORY.frequencies[key] = frequency del job_output del job_memory print(f"Joined {i + 1} out of {CPU_COUNT}") - + # Get new token bpe.fit([], MEMORY, True) From 0975c19e69f78cd225cc832ee8815ad3141c3063 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:13 +0200 Subject: [PATCH 47/75] Added new method to encode from a list of tokens --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 4245936..4d44884 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -52,7 +52,7 @@ class NanoSocratesBPE(Encoder): last_batch: bool ): - ENCODED_CHUNK = self.__round_encode(chunk_data) + ENCODED_CHUNK = self.encode_intermediate(chunk_data) DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1 for i in range(0, DATA_LEN_BEFORE_LAST): @@ -99,6 +99,17 @@ class NanoSocratesBPE(Encoder): return current_piece + def encode_intermediate(self, piece: list[int]): + current_piece = piece + new_piece = self.__round_encode(current_piece) + + while len(current_piece) != len(new_piece): + current_piece = new_piece + new_piece = self.__round_encode(current_piece) + + return current_piece + + def __round_encode(self, piece: list[int]): if len(piece) == 1: From 17d82f0a4ece9560600a1f65d46cb70c7c4ed72c Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:28 +0200 Subject: [PATCH 48/75] Added support to resume workload --- Scripts/Training/bpe_trainer_pool.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/Scripts/Training/bpe_trainer_pool.py b/Scripts/Training/bpe_trainer_pool.py index 5c7ab6e..966816d 100644 --- a/Scripts/Training/bpe_trainer_pool.py +++ b/Scripts/Training/bpe_trainer_pool.py @@ -72,11 +72,17 @@ def train(args: ProgramArgs): VOCABULARY_PATH = Path(args.output_file) CACHE_PATH = Path(args.cache_file) + start_bpe = BPE.NanoSocratesBPE() + if CACHE_PATH.is_file(): + voc = BPE.load_nanos_vocabulary(CACHE_PATH) + start_bpe = BPE.NanoSocratesBPE(voc) + print(f"Training BPE") BPE_ENCODER = TRAINER.trainBPE( DATASET_PATH, - CACHE_PATH + CACHE_PATH, + start_bpe ) VOCABULARY = BPE_ENCODER.vocabulary From aa765b4555c7a447b06d688b515b93a71330b9a7 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 08:48:45 +0200 Subject: [PATCH 49/75] Added time checking --- .../Libs/BPE/Classes/NanoSocraTrainerPool.py | 57 ++++++++++++++++++- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 74a596f..cdd7a95 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -5,6 +5,7 @@ from multiprocessing import Pool import os from pathlib import Path import re +import time from ..Classes import ( NanoSocratesBPE, NanoSocratesChunker, NanoSocratesSplitter, NanoSocratesBatchMemoryBPE, ) from ..Enums import TokenType from ..Utils import ( special_regex_maker, iterator_with_checks, save_nanos_vocabulary, load_nanos_vocabulary, save_json, load_json, ) @@ -49,6 +50,22 @@ def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): return (bpe, NEW_DATA, memory) +def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]): + + bpe, data = object + + NEW_DATA: list[list[int]] = [] + + for piece in data: + output = bpe.encode_intermediate(piece)
+ + if len(output) < 2: + continue + + # We are sure of its type + NEW_DATA.append(output) # type: ignore + + return NEW_DATA class NanoSocraTrainerPool: @@ -96,6 +113,8 @@ class NanoSocraTrainerPool: exit = False current_iteration = 0 data = self.__gather_data_from_file(path) + data = self.__encode_from_cache(BPE, data) + while not exit: @@ -105,8 +124,9 @@ class NanoSocraTrainerPool: last_memory = None + start = time.time_ns() _, data, last_memory = self.__round_train(BPE, data) - + end = time.time_ns() NEW_VOC_SIZE = BPE.vocabulary_size VOCABULARY = BPE.vocabulary @@ -122,8 +142,8 @@ class NanoSocraTrainerPool: DELIMITER, f"ITERATION: {current_iteration}", DELIMITER, - f"\tVocabulary size: {BPE.vocabulary_size}\n", - f"\tvocabulary:\n{BPE.vocabulary}", + f"\tVocabulary size: {BPE.vocabulary_size - 256}\n", + f"\tTime elapsed: {(end - start)/1E9}s", DELIMITER, "", ] @@ -214,6 +234,37 @@ class NanoSocraTrainerPool: return DATA + def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]): + + NEW_DATA : list[list[int]]= [] + + CPU_COUNT = os.process_cpu_count() + + if CPU_COUNT is None: + raise Exception() + + VOCABULARY = bpe.vocabulary + + data_chunks = split(data, CPU_COUNT) + JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks] + + JOB_RESULTS: list[list[list[int]]] + + with Pool() as pool: + JOB_RESULTS = pool.map(split_encode, JOBS) + + for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS): + job_output = res + NEW_DATA.extend(job_output) + + del job_output + + print(f"Joined {i + 1} out of {CPU_COUNT}") + + print(f"Sentences from {len(data)} to {len(NEW_DATA)}") + + return NEW_DATA + def __increment_counter(self, counter: int): # What if overflows??? From eadba1fb82d996947a830193a4ca05cedf374c76 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:33:47 +0200 Subject: [PATCH 50/75] Corrected test to reflect changes in NanoSocratesBPE --- Project_Model/Tests/bpe_test.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py index 7332f65..e6c8f31 100644 --- a/Project_Model/Tests/bpe_test.py +++ b/Project_Model/Tests/bpe_test.py @@ -29,7 +29,7 @@ class TestBPE: def test_bpe_decoding_simple(self): - INPUT = 258 + INPUT = [258] # ab = 256 # 256, 256 = 257 @@ -47,6 +47,27 @@ class TestBPE: for encoded, expected in zip(DECODED, EXPECTED): assert encoded == expected + def test_bpe_decoding_edge_1(self): + + + INPUT = [258, ord("c")] + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} + EXPECTED = "ababababc" + + BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY) + + DECODED = BPE_ENCODER.decode(INPUT) + + assert len(DECODED) == len(EXPECTED) + + for encoded, expected in zip(DECODED, EXPECTED): + assert encoded == expected + # Useful to debug weird cases if __name__ == "__main__": TestBPE().test_bpe_decoding_simple() From 1eae8582b2157375f8961db142a0617496412c09 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:33:58 +0200 Subject: [PATCH 51/75] Fixed decoding phase --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 4d44884..6428cb7 100644 --- 
a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -1,3 +1,4 @@ +from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException @@ -140,31 +141,30 @@ class NanoSocratesBPE(Encoder): return NEW_PIECE # TODO: Remake decode to take a list of token IDs - def decode(self, token_id: int) -> str: + def decode(self, token_ids: list[int]) -> str: - token_stack: list[int] = [token_id] - DECODED_STRING_ARR: list[str] = [] + token_stack: deque[int] = deque(token_ids) + UTF_8_STRING_ARR: bytearray = bytearray() while len(token_stack) > 0: - TOKEN_ID = token_stack.pop() + TOKEN_ID = token_stack.popleft() if TOKEN_ID < 256: - DECODED_CHAR = chr(TOKEN_ID) - DECODED_STRING_ARR.append( - DECODED_CHAR + UTF_8_STRING_ARR.append( + TOKEN_ID ) continue left_token, right_token = self.__token_decode(TOKEN_ID) - token_stack.append( + token_stack.appendleft( right_token ) - token_stack.append( + token_stack.appendleft( left_token ) - return "".join(DECODED_STRING_ARR) + return UTF_8_STRING_ARR.decode("utf-8") def __token_decode(self, token_id: int) -> tuple[int, int]: From 2194cc7b4fb70b3397d220e09b01f53822003fd6 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:56:05 +0200 Subject: [PATCH 52/75] Changed test to use pool trainer --- Project_Model/Tests/bpe_trainer_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project_Model/Tests/bpe_trainer_test.py b/Project_Model/Tests/bpe_trainer_test.py index 2e1fa08..69ac4bb 100644 --- a/Project_Model/Tests/bpe_trainer_test.py +++ b/Project_Model/Tests/bpe_trainer_test.py @@ -4,13 +4,13 @@ import Project_Model.Libs.BPE as BPE import re -CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache") +CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json") class TestTrainBPE: def test_bpe_train_encoding_simple(self): - TRAINER = BPE.NanoSocraTrainer( + TRAINER = BPE.NanoSocraTrainerPool( int(32E3), ["", ""], 40 From 2e595a3a23eaa0a7381e7dbf7ed9ae1ff1694254 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 09:56:44 +0200 Subject: [PATCH 53/75] Changed training phase to take the data directly instead of its encoding --- Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index cdd7a95..4dd4f4f 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -46,7 +46,7 @@ def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]): continue # We are sure of its type - NEW_DATA.append(output) # type: ignore + NEW_DATA.append(piece) # type: ignore return (bpe, NEW_DATA, memory) @@ -56,14 +56,14 @@ def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]): NEW_DATA: list[list[int]] = [] - for piece in data: + for index, piece in zip(range(0, len(data)), data): output = bpe.encode_intermediate(piece) if len(output) < 2: continue # We are sure of its type - NEW_DATA.append(output) # type: ignore + NEW_DATA.append(data[index]) # type: ignore return NEW_DATA From 856bd8909c27599444c8c31558e171f1a65de9cd Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 11:02:03 
+0200 Subject: [PATCH 54/75] Added threshold --- Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py index 4dd4f4f..4e88802 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py @@ -168,7 +168,7 @@ class NanoSocraTrainerPool: NEW_DATA: list[list[int]] = [] - MEMORY = NanoSocratesBatchMemoryBPE({}, 0) + MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold) fit_funct = split_fit CPU_COUNT = os.process_cpu_count() From 0eef2148a9e15619491993a41af586ad7b06fd25 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Thu, 2 Oct 2025 12:12:44 +0200 Subject: [PATCH 55/75] in NanoSocratesBPE: encode() method rewritten and tested --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 45 ++++++++++++++----- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 6428cb7..132217e 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -4,12 +4,15 @@ from ..Errors import OutOfDictionaryException, DuplicateWordException class NanoSocratesBatchMemoryBPE: + """ Memory for batched training. Keeps token-pair frequencies and the merge_treshold. + """ def __init__( self, frequencies: dict[tuple[int, int], int], merge_treshold: int ) -> None: + self.frequencies = frequencies self.merge_treshold = merge_treshold @@ -42,7 +45,12 @@ class NanoSocratesBPE(Encoder): return self.__vocabulary @property - def __next_id(self): + def __next_id(self) -> int: + """ + Gets the next free token ID + Returns: + int: + """ return self.vocabulary_size + 1 # TODO: implement fit @@ -90,20 +98,26 @@ class NanoSocratesBPE(Encoder): def encode(self, piece: str) -> list[int]: + """Encode a string into token IDs: it first converts the string into UTF-8 bytes, then passes the list of integers to encode_intermediate() + Args: + piece (str): + Returns: + list[int]: + """ + converted_piece = list(piece.encode("utf-8")) + return self.encode_intermediate(converted_piece) - current_piece = list(piece.encode("utf-8")) - new_piece = self.__round_encode(current_piece) - - while len(current_piece) != len(new_piece): - current_piece = new_piece - new_piece = self.__round_encode(current_piece) - - return current_piece - - def encode_intermediate(self, piece: list[int]): + def encode_intermediate(self, piece: list[int]) -> list[int]: + """ Encode a piece (as a list of integers) until no further merges apply + Args: + piece (list[int]): piece to encode + Returns: + list[int]: the encoded piece + """ current_piece = piece new_piece = self.__round_encode(current_piece) + # keep encoding while the piece is still shrinking while len(current_piece) != len(new_piece): current_piece = new_piece new_piece = self.__round_encode(current_piece) @@ -112,6 +126,14 @@ class NanoSocratesBPE(Encoder): def __round_encode(self, piece: list[int]): + """_summary_ + + Args: + piece (list[int]): _description_ + + Returns: + _type_: _description_ + """ if len(piece) == 1: return piece @@ -143,6 +165,7 @@ class NanoSocratesBPE(Encoder): # TODO: Remake decode to take a list of token IDs def decode(self, token_ids: list[int]) -> str: + # deque: double ended queue token_stack: deque[int] = deque(token_ids) UTF_8_STRING_ARR: bytearray = bytearray() From a1d143187dbfad8ff90c4338f3260ad88a75030c Mon 
Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 2 Oct 2025 20:11:43 +0200 Subject: [PATCH 56/75] corrected test to reflect changes in BPE trainer --- Project_Model/Tests/bpe_trainer_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Project_Model/Tests/bpe_trainer_test.py b/Project_Model/Tests/bpe_trainer_test.py index 69ac4bb..1f9f7fb 100644 --- a/Project_Model/Tests/bpe_trainer_test.py +++ b/Project_Model/Tests/bpe_trainer_test.py @@ -12,8 +12,7 @@ class TestTrainBPE: TRAINER = BPE.NanoSocraTrainerPool( int(32E3), - ["", ""], - 40 + ["", ""] ) TEXT = "abababab" From 7c935d27008106b056a33a9828102f82a82f7236 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 3 Oct 2025 00:57:19 +0200 Subject: [PATCH 57/75] Update NanoSocratesBPE: corrected a minor bug about dictionary length, added some comments to make the code clearer --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 132217e..baa5efd 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -2,7 +2,10 @@ from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException - +# ABOUT THE DICTIONARY: +# the string is converted into UTF-8 bytes, that is: each char is represented by 1 to 4 bytes. +# each byte gets cast to an integer, such that an integer with a value lower than 256 +# represents a raw UTF-8 byte, while any other value is a token ID. class NanoSocratesBatchMemoryBPE: """ Memory for batched training. Keeps token-pair frequencies and the merge_treshold. """ @@ -31,6 +34,7 @@ class NanoSocratesBPE(Encoder): for key, value in vocabulary.items(): if value < 256: raise OutOfDictionaryException() + # values under 256 are reserved for single raw bytes # TODO: check if they are in order self.__vocabulary[key] = value self.__reverse_vocabulary[value] = key @@ -38,7 +42,7 @@ class NanoSocratesBPE(Encoder): @property def vocabulary_size(self): - return len(self.__vocabulary) + 255 + return len(self.__vocabulary) + 256 @property def vocabulary(self): @@ -51,7 +55,7 @@ class NanoSocratesBPE(Encoder): Returns: int: """ - return self.vocabulary_size + 1 + return self.vocabulary_size # TODO: implement fit def fit( @@ -64,6 +68,7 @@ class NanoSocratesBPE(Encoder): ENCODED_CHUNK = self.encode_intermediate(chunk_data) DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1 + # update the frequency of each pair of elements for i in range(0, DATA_LEN_BEFORE_LAST): CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1]) @@ -77,6 +82,7 @@ class NanoSocratesBPE(Encoder): frequency += 1 memory.frequencies[CANDIDATE_COUPLE] = frequency + if not last_batch: return (self, memory, ENCODED_CHUNK) @@ -126,13 +132,14 @@ class NanoSocratesBPE(Encoder): def __round_encode(self, piece: list[int]): - """_summary_ - + """ A single round of encode that traverses the whole piece. 
From a5b8692a77cec0f61b7f9cc7f5fd92f914ba33bf Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 00:59:15 +0200
Subject: [PATCH 58/75] Updated NanoSocratesSpecial to work with TokeNano

---
 .../Libs/BPE/Classes/NanoSocratesSpecial.py   | 45 ++++++++++++-------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
index e551d6c..8fe81bb 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -5,34 +5,43 @@ class NanoSocratesSpecial(Encoder):
 
     def __init__(
         self,
-        initial_vocabulary: list[str] | None = None
-    ) -> None:
+        vocabulary_index: int ,
+        vocabulary: dict[str, int] | None = None
+    ) -> None:
+
         super().__init__()
 
-        self.__vocabulary: dict[str, int] = {}
+        if vocabulary is None:
+            self.__vocabulary: dict[str, int] = {}
+        else:
+            self.__vocabulary: dict[str, int] = vocabulary
+
         self.__reverse_vocabulary: dict[int, str] = {}
-        self.__current_index = 0
 
-        if initial_vocabulary is None:
-            return
+        if vocabulary_index is None:
+            self.__vocabulary_index = 0
+        else:
+            self.__vocabulary_index = vocabulary_index
 
-        for word in initial_vocabulary:
+        # self.__build_reverse_vocabulary()
 
-            CURRENT_INDEX = self.__current_index
-            self.__vocabulary[word] = CURRENT_INDEX
-            self.__reverse_vocabulary[CURRENT_INDEX] = word
-            self.__current_index += 1
 
-    @property
-    def vocabulary_size(self):
-        return self.__current_index
+    def build_reverse_vocabulary(self):
+        self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()}
 
-    def add_special_word(self, word:str):
-        CURRENT_INDEX = self.__current_index
+    # @property
+    # def vocabulary_size(self):
+    #     return self.__current_index
+
+    def set_vocabulary_index(self, vocabulary_index: int):
+        self.__vocabulary_index = vocabulary_index
+
+    def add_special_word_to_vocabulary(self, word:str):
+        self.__vocabulary_index = self.__vocabulary_index + 1
+        CURRENT_INDEX = self.__vocabulary_index
         self.__vocabulary[word] = CURRENT_INDEX
         self.__reverse_vocabulary[CURRENT_INDEX] = word
-        self.__current_index += 1
 
     def encode(self, word: str) -> list[int]:
         ID = self.__vocabulary.get(word)
@@ -52,3 +61,5 @@
 
         return WORD
 
+    def get_reverse_vocabulary(self)-> dict[int, str]:
+        return self.__reverse_vocabulary
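Numerically, this patch arranges for special tokens to receive IDs strictly above the whole BPE range, so the two vocabularies can never collide. A toy restatement of the indexing; the function name and token strings are illustrative placeholders, not the class's API:

```python
def build_special_vocabulary(bpe_vocabulary_size: int, tokens: list[str]) -> dict[str, int]:
    # the first special ID starts right after the last BPE ID
    return {token: bpe_vocabulary_size + i + 1 for i, token in enumerate(tokens)}


specials = build_special_vocabulary(259, ["<ABS>", "<EOT>"])
assert specials == {"<ABS>": 260, "<EOT>": 261}

# the reverse map used for decoding is just the inversion
reverse = {token_id: token for token, token_id in specials.items()}
assert reverse[260] == "<ABS>"
```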
From 8121c75a09c95687c33d01aa6539b733e031737f Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:00:36 +0200
Subject: [PATCH 59/75] Updated NanoSocratesSplitter to also split tokens in
 the decode phase

---
 .../Libs/BPE/Classes/NanoSocratesSplitter.py  | 60 ++++++++++++++++---
 1 file changed, 51 insertions(+), 9 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
index ccca300..399fa77 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
@@ -1,40 +1,82 @@
 import re
+from collections import deque
 from typing import Generator
 from ..Enums import TokenType
 
+
 class NanoSocratesSplitter:
 
     def __init__(
         self,
-        special_token_regex: re.Pattern
+        special_token_regex: re.Pattern,
+        max_bpe_token_id: int = 255
     ) -> None:
+        # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
+        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
 
     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
+        """ Split a text using a regex given
+        Args:
+            corpus (str): all the corpus string to split
+        Yields:
+            Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
+            TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
+        """
 
         bpe_start = 0
-        bpe_end = len(corpus)
+        bpe_end = len(corpus) # this can be deleted!
 
-        for bound_start, bound_end in self.__find_boundaries(corpus):
+        for special_token_start, special_token_end in self.__find_boundaries(corpus):
 
-            bpe_end = bound_start
+            # FIND BPE
+            bpe_end = special_token_start
 
             BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
-
             if BPE_TOKEN_TEXT != "":
                 yield (BPE_TOKEN_TEXT, TokenType.BPE)
 
-            bpe_start = bound_end
-            SPECIAL_TOKEN_TEXT = corpus[bound_start:bound_end]
-
+            # FIND SPECIAL TOKEN
+            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
             if SPECIAL_TOKEN_TEXT != "":
                 yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
 
-    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+            # now save the new bpe start point
+            # it will used in the next interaction
+            bpe_start = special_token_end
+
+    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
+        """
+        Find each time the start and end (not included) of the special token
+        Args:
+            corpus (str): the string where the special token will be searched
+        Yields:
+            Generator[tuple[int, int]]: Note the end is not included
+        """
 
         for match in self.__special_token_regex.finditer(corpus):
             start = match.start()
             end = match.end()
 
             yield (start, end)
+
+        # make the last boundary be the end of corpus
+        # eof = len(corpus)
+        # yield(eof,eof)
+
+
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
+
+        not_special_token_list : list[int]= []
+        for token in corpus:
+            if token > self.__max_bpe_token_id:
+
+                if len(not_special_token_list) > 0:
+                    yield (not_special_token_list, TokenType.BPE)
+                    not_special_token_list = []
+
+                yield (token, TokenType.SPECIAL)
+                continue
+
+            not_special_token_list.append(token)
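The new split_tokens() is the decode-side counterpart of split_text(): it partitions a token stream by comparing each ID against max_bpe_token_id. A condensed sketch with plain strings standing in for the TokenType enum. Two details are worth noting: the special ID is yielded as a one-element list, matching the fix a later patch in this series applies, and a trailing BPE run after the last special token is dropped, exactly as the "malformed" splitter test below expects.

```python
from typing import Generator


def split_tokens(
    corpus: list[int], max_bpe_token_id: int
) -> Generator[tuple[list[int], str], None, None]:
    buffer: list[int] = []
    for token in corpus:
        if token > max_bpe_token_id:  # special token: flush the BPE run first
            if buffer:
                yield (buffer, "BPE")
                buffer = []
            yield ([token], "SPECIAL")
            continue
        buffer.append(token)
    # no flush here: a trailing BPE run is treated as malformed and dropped


assert list(split_tokens([100, 101, 1477, 100], 1473)) == [
    ([100, 101], "BPE"),
    ([1477], "SPECIAL"),
]
```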
From 070dc1b744b462b53ee89d2dc59f7fd98e201eba Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:04:06 +0200
Subject: [PATCH 60/75] implemented token nano for the BPE encoding/decoding

---
 Project_Model/Libs/BPE/Classes/TokeNano.py    |  8 ++
 .../Libs/BPE/Classes/TokeNanoCore.py          | 79 +++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/Project_Model/Libs/BPE/Classes/TokeNano.py b/Project_Model/Libs/BPE/Classes/TokeNano.py
index e69de29..1088f7d 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNano.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNano.py
@@ -0,0 +1,8 @@
+
+from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
+
+class TokeNano:
+
+    def __init__(self):
+
+        pass
\ No newline at end of file
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index e69de29..c719219 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+
+from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
+from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
+from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
+
+from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
+from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
+from Project_Model.Libs.BPE.Enums import TokenType
+from Project_Model.Libs.BPE.Utils.json_utils import load_json
+class TokeNanoCore:
+    def __init__(self,
+                 bpe_vocabulary: dict[tuple[int, int], int]
+                 # special_vocabulary: dict[str, int]
+                 ):
+        self._bpe = NanoSocratesBPE(bpe_vocabulary)
+
+        # special_vocabulary = [token.value for token in SpecialToken]
+        special_token_list = [token.value for token in SpecialToken]
+        self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list),self._bpe.vocabulary_size)
+
+        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None) # technically its not a bpe but more something like an "autoencoder"
+        self.prepare_special_token_vocabulary()
+
+
+    def encode(self, corpus : str) -> list[int]:
+        output : list[int] = []
+        for piece, token_type in self._splitter.split_text(corpus):
+
+            if token_type == TokenType.SPECIAL:
+                output.extend(self._special_bpe.encode(piece))
+
+            # slow but clear
+            if token_type == TokenType.BPE:
+                output.extend(self._bpe.encode(piece))
+
+        return output
+
+
+
+    def decode(self, corpus : list[int])-> str:
+        output_str = ''
+        for token, token_type in self._splitter.split_tokens(corpus):
+            # token is an integer if special, a list of integer otherwise
+            if token_type == TokenType.SPECIAL:
+                output_str += self._special_bpe.decode(token) # it accept an integer
+
+            # slow but clear
+            if token_type == TokenType.BPE:
+                output_str += self._bpe.decode(token) # it accept a list of integer
+        return output_str
+
+
+
+    def prepare_special_token_vocabulary(self):
+        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
+
+        for special_token in [token.value for token in SpecialToken]:
+            self._special_bpe.add_special_word_to_vocabulary(special_token)
+
+        self._special_bpe.build_reverse_vocabulary()
+
+
+if __name__ == "__main__":
+    dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json"
+    dictionary = load_json(Path(dictionary_path))
+
+    tokenano = TokeNanoCore(dictionary)
+
+    corpus = "dbp-dbr:How_It_Should_Have_Ended"
+    print(corpus)
+
+    encoded_list = tokenano.encode(corpus)
+    print(encoded_list)
+
+    decoded_string = tokenano.decode(encoded_list)
+    print(decoded_string)
+
+# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
\ No newline at end of file
From 09f7b39512a72de432afd245b5efc9d87ccd6207 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 01:04:47 +0200
Subject: [PATCH 61/75] test files updated

---
 Project_Model/Tests/bpe_test.py      |  3 +-
 Project_Model/Tests/splitter_test.py | 49 ++++++++++++++++++++++++++++
 Project_Model/Tests/tokenano_test.py | 21 ++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 Project_Model/Tests/tokenano_test.py

diff --git a/Project_Model/Tests/bpe_test.py b/Project_Model/Tests/bpe_test.py
index e6c8f31..0acae46 100644
--- a/Project_Model/Tests/bpe_test.py
+++ b/Project_Model/Tests/bpe_test.py
@@ -70,4 +70,5 @@ class TestBPE:
 
 # Useful to debug weird cases
 if __name__ == "__main__":
-    TestBPE().test_bpe_decoding_simple()
+    # TestBPE().test_bpe_decoding_simple()
+    TestBPE().test_bpe_encoding_simple()
diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py
index eda95b6..154e24e 100644
--- a/Project_Model/Tests/splitter_test.py
+++ b/Project_Model/Tests/splitter_test.py
@@ -45,6 +45,7 @@ class TestSplitter:
             ("", TokenType.SPECIAL),
             ("m d", TokenType.BPE),
             ("", TokenType.SPECIAL),
+            #("olor", TokenType.BPE)
         ]
 
         CHUNKS = list(SPLITTER.split_text(TEXT))
@@ -129,3 +130,51 @@
 
             assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
             assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
+
+    def test_split_token_decode_simple(self):
+        # to test the token split into special and bpe
+        SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
+        token_list = [100,101,1477]
+
+        CHUNKS = 
list(SPLITTER.split_tokens(token_list)) + EXPECTED_CHUNKS = [ + ([100,101], TokenType.BPE), + (1477, TokenType.SPECIAL), + ] + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + def test_split_token_decode_simple_malformed(self): + # to test the token split into special and bpe + SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473) + token_list = [100,101,1477,100] + + CHUNKS = list(SPLITTER.split_tokens(token_list)) + EXPECTED_CHUNKS = [ + ([100,101], TokenType.BPE), + (1477, TokenType.SPECIAL), + ] + + assert len(CHUNKS) == len(EXPECTED_CHUNKS) + + for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS): + print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}") + RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk + EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk + + assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING + assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE + + + +# Useful to debug weird cases +if __name__ == "__main__": + TestSplitter().test_split_trailing_text() \ No newline at end of file diff --git a/Project_Model/Tests/tokenano_test.py b/Project_Model/Tests/tokenano_test.py new file mode 100644 index 0000000..2dc7779 --- /dev/null +++ b/Project_Model/Tests/tokenano_test.py @@ -0,0 +1,21 @@ + +from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore + +class TestTokeNano: + + def test_decode_encode_simple(self): + TEXT = "abababab" + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} + # EXPECTED = [258] + + TOKE_NANO = TokeNanoCore(VOCABULARY) + + ENCODED = TOKE_NANO.encode(TEXT) + DECODED = TOKE_NANO.decode(ENCODED) + + assert TEXT == DECODED From 845d6453488a0f65c30af664725dd623a40c9c09 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 3 Oct 2025 10:38:35 +0200 Subject: [PATCH 62/75] added some stubs on special_regex_maker --- Project_Model/Libs/BPE/Utils/special_regex_maker.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Project_Model/Libs/BPE/Utils/special_regex_maker.py b/Project_Model/Libs/BPE/Utils/special_regex_maker.py index 414eabf..fd4ac28 100644 --- a/Project_Model/Libs/BPE/Utils/special_regex_maker.py +++ b/Project_Model/Libs/BPE/Utils/special_regex_maker.py @@ -2,6 +2,13 @@ import re def special_regex_maker(special_tokens: list[str]) -> re.Pattern: + """ compile a regex for the special token + Args: + special_tokens (list[str]): the list of special token + + Returns: + re.Pattern: + """ REGEX_STR = "|".join(special_tokens) From e8894504c60d648698cf29d2cb72cd7a1a1edebd Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 11:44:44 +0200 Subject: [PATCH 63/75] Fixed a bug where a token (int) was yielded instead of a list of int --- .../Libs/BPE/Classes/NanoSocratesSplitter.py | 28 ++++++++----------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py index 399fa77..6e0abc2 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py +++ 
b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py @@ -1,23 +1,20 @@ import re -from collections import deque +from collections import deque from typing import Generator from ..Enums import TokenType - class NanoSocratesSplitter: def __init__( - self, - special_token_regex: re.Pattern, - max_bpe_token_id: int = 255 + self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255 ) -> None: # attention the regex got already compiled self.__special_token_regex = special_token_regex - self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding + self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]: - """ Split a text using a regex given + """Split a text using a regex given Args: corpus (str): all the corpus string to split Yields: @@ -26,7 +23,7 @@ class NanoSocratesSplitter: """ bpe_start = 0 - bpe_end = len(corpus) # this can be deleted! + bpe_end = len(corpus) # this can be deleted! for special_token_start, special_token_end in self.__find_boundaries(corpus): @@ -45,7 +42,6 @@ class NanoSocratesSplitter: # it will used in the next interaction bpe_start = special_token_end - def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]: """ Find each time the start and end (not included) of the special token @@ -53,21 +49,20 @@ class NanoSocratesSplitter: corpus (str): the string where the special token will be searched Yields: Generator[tuple[int, int]]: Note the end is not included - """ + """ for match in self.__special_token_regex.finditer(corpus): start = match.start() end = match.end() yield (start, end) - + # make the last boundary be the end of corpus # eof = len(corpus) # yield(eof,eof) + def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]: - def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] : - - not_special_token_list : list[int]= [] + not_special_token_list: list[int] = [] for token in corpus: if token > self.__max_bpe_token_id: @@ -75,8 +70,7 @@ class NanoSocratesSplitter: yield (not_special_token_list, TokenType.BPE) not_special_token_list = [] - yield (token, TokenType.SPECIAL) + yield ([token], TokenType.SPECIAL) continue - - not_special_token_list.append(token) + not_special_token_list.append(token) From 6b9cb7cd352e2a297a80816734ffe3ff21cd674c Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:26:42 +0200 Subject: [PATCH 64/75] Modified imports --- Project_Model/Libs/BPE/Classes/__init__.py | 1 + Project_Model/Libs/BPE/Utils/__init__.py | 1 + 2 files changed, 2 insertions(+) diff --git a/Project_Model/Libs/BPE/Classes/__init__.py b/Project_Model/Libs/BPE/Classes/__init__.py index d3b93b6..bab5bd8 100644 --- a/Project_Model/Libs/BPE/Classes/__init__.py +++ b/Project_Model/Libs/BPE/Classes/__init__.py @@ -4,6 +4,7 @@ from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE from .NanoSocraTrainer import NanoSocraTrainer from .NanoSocraTraineRam import NanoSocraTraineRam from .NanoSocraTrainerPool import NanoSocraTrainerPool +from .NanoSocratesSpecial import NanoSocratesSpecial __all__ = [ "NanoSocratesChunker", diff --git a/Project_Model/Libs/BPE/Utils/__init__.py b/Project_Model/Libs/BPE/Utils/__init__.py index 3eb9eb3..f9213c6 100644 --- a/Project_Model/Libs/BPE/Utils/__init__.py +++ b/Project_Model/Libs/BPE/Utils/__init__.py @@ -2,6 +2,7 @@ from .special_regex_maker import special_regex_maker from 
.lag_checker_iterator import iterator_with_checks from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary from .json_utils import save_json, load_json +from .special_regex_maker import special_regex_maker __all__ = [ "special_regex_maker", From c5c0c61f797773a96f1a3fe582e8998c5d5254cd Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:26:58 +0200 Subject: [PATCH 65/75] Fix of bugs and semantics --- .../Libs/BPE/Classes/NanoSocratesBPE.py | 56 ++++------- .../Libs/BPE/Classes/NanoSocratesSpecial.py | 65 ++++++------- .../Libs/BPE/Classes/NanoSocratesSplitter.py | 24 ++++- .../Libs/BPE/Classes/TokeNanoCore.py | 97 +++++++------------ Project_Model/Libs/BPE/Enums/SpecialToken.py | 21 ++++ 5 files changed, 134 insertions(+), 129 deletions(-) create mode 100644 Project_Model/Libs/BPE/Enums/SpecialToken.py diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index baa5efd..d517f04 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -2,20 +2,18 @@ from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException + # ABOUT THE DICTIONARY: # the string is converted into utf-char bytes, that is: each char is rappresented with a set of bytes from 1 to 4. # each bytes get casted into an integer; such that, if an integer has its value lower then 256, # then it is rappresenting an utf-char-byte, otherwise it is a token-ID. class NanoSocratesBatchMemoryBPE: - """ Memory to batch training. Keeps token couple frequencies, and merge_treshold - """ + """Memory to batch training. Keeps token couple frequencies, and merge_treshold""" def __init__( - self, - frequencies: dict[tuple[int, int], int], - merge_treshold: int + self, frequencies: dict[tuple[int, int], int], merge_treshold: int ) -> None: - + self.frequencies = frequencies self.merge_treshold = merge_treshold @@ -39,7 +37,6 @@ class NanoSocratesBPE(Encoder): self.__vocabulary[key] = value self.__reverse_vocabulary[value] = key - @property def vocabulary_size(self): return len(self.__vocabulary) + 256 @@ -62,7 +59,7 @@ class NanoSocratesBPE(Encoder): self, chunk_data: list[int], memory: NanoSocratesBatchMemoryBPE, - last_batch: bool + last_batch: bool, ): ENCODED_CHUNK = self.encode_intermediate(chunk_data) @@ -70,7 +67,7 @@ class NanoSocratesBPE(Encoder): # update frequency of each couple of element for i in range(0, DATA_LEN_BEFORE_LAST): - CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1]) + CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1]) frequency = memory.frequencies.get(CANDIDATE_COUPLE) @@ -82,7 +79,6 @@ class NanoSocratesBPE(Encoder): frequency += 1 memory.frequencies[CANDIDATE_COUPLE] = frequency - if not last_batch: return (self, memory, ENCODED_CHUNK) @@ -100,9 +96,6 @@ class NanoSocratesBPE(Encoder): return (self, memory, ENCODED_CHUNK) - - - def encode(self, piece: str) -> list[int]: """Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate() Args: @@ -114,12 +107,12 @@ class NanoSocratesBPE(Encoder): return self.encode_intermediate(converted_piece) def encode_intermediate(self, piece: list[int]) -> list[int]: - """ Encode a piece (as list of integer) till its maximum + """Encode a piece (as list of integer) till its maximum Args: piece (list[int]): piece to encode Returns: - list[int]: piece 
encoded - """ + list[int]: piece encoded + """ current_piece = piece new_piece = self.__round_encode(current_piece) @@ -130,9 +123,8 @@ class NanoSocratesBPE(Encoder): return current_piece - def __round_encode(self, piece: list[int]): - """ A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n + """A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n 1) "ABAB" -> "XX" 2) "XX" -> "Y" Args: @@ -146,22 +138,25 @@ class NanoSocratesBPE(Encoder): return piece PIECE_LENGTH = len(piece) - 1 - NEW_PIECE : list[int]= [] + NEW_PIECE: list[int] = [] index = 0 while index < PIECE_LENGTH: - CANDIDATE_WORD = (piece[index], piece[index + 1]) # take a tuple of consecutive element [int] + CANDIDATE_WORD = ( + piece[index], + piece[index + 1], + ) # take a tuple of consecutive element [int] CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD) # if no token to substitute the tuple, append the first element if CANDIDATE_TOKEN is None: - NEW_PIECE.append(piece[index]) + NEW_PIECE.append(piece[index]) index += 1 # if the latter element of the tuple is the last element of the piece, append it if index == PIECE_LENGTH: - NEW_PIECE.append(piece[index]) + NEW_PIECE.append(piece[index]) continue @@ -169,13 +164,10 @@ class NanoSocratesBPE(Encoder): NEW_PIECE.append(CANDIDATE_TOKEN) index += 2 - return NEW_PIECE - # TODO: Remake decode to take a list of token IDs def decode(self, token_ids: list[int]) -> str: - # deque: double ended queue token_stack: deque[int] = deque(token_ids) @@ -185,19 +177,13 @@ class NanoSocratesBPE(Encoder): TOKEN_ID = token_stack.popleft() if TOKEN_ID < 256: - UTF_8_STRING_ARR.append( - TOKEN_ID - ) + UTF_8_STRING_ARR.append(TOKEN_ID) continue left_token, right_token = self.__token_decode(TOKEN_ID) - token_stack.appendleft( - right_token - ) - token_stack.appendleft( - left_token - ) + token_stack.appendleft(right_token) + token_stack.appendleft(left_token) return UTF_8_STRING_ARR.decode("utf-8") @@ -211,7 +197,7 @@ class NanoSocratesBPE(Encoder): return CANDIDATE_DECODED def __learn_word(self, words: tuple[int, int]): - """ learn a new couple of object in the vocabulary + """learn a new couple of object in the vocabulary Args: words (tuple[int, int]): the Pair of element to substitute with a new tokenID diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py index 8fe81bb..61d4741 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py @@ -1,47 +1,46 @@ from .Encoder import Encoder from ..Errors import OutOfDictionaryException + class NanoSocratesSpecial(Encoder): def __init__( - self, - vocabulary_index: int , - vocabulary: dict[str, int] | None = None - ) -> None: - + self, bpe_vocabulary_size: int, special_tokens: list[str] = [] + ) -> None: + super().__init__() - if vocabulary is None: - self.__vocabulary: dict[str, int] = {} - else: - self.__vocabulary: dict[str, int] = vocabulary - + self.__bpe_offset = bpe_vocabulary_size + self.__vocabulary: dict[str, int] = {} self.__reverse_vocabulary: dict[int, str] = {} - if vocabulary_index is None: - self.__vocabulary_index = 0 - else: - self.__vocabulary_index = vocabulary_index + if len(special_tokens) == 0: + return - # self.__build_reverse_vocabulary() + for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens): + CANDIDATE_ID = self.__bpe_offset + index + 1 + self.__vocabulary[TOKEN] = 
CANDIDATE_ID + self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN + @property + def __next_id(self): + BPE_OFFSET = self.__bpe_offset + VOC_LENGTH = len(self.__vocabulary) + return BPE_OFFSET + VOC_LENGTH + 1 - def build_reverse_vocabulary(self): - self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()} + @property + def vocabulary(self) -> dict[str, int]: + return self.__vocabulary - # @property - # def vocabulary_size(self): - # return self.__current_index + @property + def reverse_vocabulary(self) -> dict[int, str]: + return self.__reverse_vocabulary - def set_vocabulary_index(self, vocabulary_index: int): - self.__vocabulary_index = vocabulary_index - - def add_special_word_to_vocabulary(self, word:str): - self.__vocabulary_index = self.__vocabulary_index + 1 - CURRENT_INDEX = self.__vocabulary_index - self.__vocabulary[word] = CURRENT_INDEX - self.__reverse_vocabulary[CURRENT_INDEX] = word + def add_special_word_to_vocabulary(self, word: str): + CANDIDATE_INDEX = self.__next_id + self.__vocabulary[word] = CANDIDATE_INDEX + self.__reverse_vocabulary[CANDIDATE_INDEX] = word def encode(self, word: str) -> list[int]: ID = self.__vocabulary.get(word) @@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder): return [ID] - def decode(self, token_id: int) -> str: + def decode(self, token_id: list[int]) -> str: - ID = token_id + if len(token_id) != 1: + raise OutOfDictionaryException() + + ID = token_id[0] WORD = self.__reverse_vocabulary.get(ID) if WORD is None: raise OutOfDictionaryException() return WORD - - def get_reverse_vocabulary(self)-> dict[int, str]: - return self.__reverse_vocabulary diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py index 6e0abc2..02a8ccf 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py @@ -31,7 +31,8 @@ class NanoSocratesSplitter: bpe_end = special_token_start BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end] if BPE_TOKEN_TEXT != "": - yield (BPE_TOKEN_TEXT, TokenType.BPE) + for WORD in self.__split_words(BPE_TOKEN_TEXT): + yield (WORD, TokenType.BPE) # FIND SPECIAL TOKEN SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end] @@ -60,6 +61,27 @@ class NanoSocratesSplitter: # eof = len(corpus) # yield(eof,eof) + def __split_words(self, bpe_piece: str) -> Generator[str]: + + END_OF_STRING = len(bpe_piece) + bound_start = 0 + bound_end = END_OF_STRING + 1 + for i in range(0, END_OF_STRING): + + CANDIDATE_CHAR = bpe_piece[i] + + if CANDIDATE_CHAR != " ": + continue + + bound_end = i + + yield bpe_piece[bound_start:bound_end] + + bound_start = bound_end + bound_end = END_OF_STRING + 1 + + yield bpe_piece[bound_start:bound_end] + def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]: not_special_token_list: list[int] = [] diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py index c719219..f726a95 100644 --- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py +++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py @@ -1,79 +1,56 @@ from pathlib import Path -from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter -from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE -from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial +from ..Classes import NanoSocratesSplitter +from ..Classes import NanoSocratesBPE +from ..Classes import NanoSocratesSpecial + 
+from ..Utils import special_regex_maker +from ..Enums import TokenType + -from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker -from Scripts.Libs.CleaningPipeline.special_token import SpecialToken -from Project_Model.Libs.BPE.Enums import TokenType -from Project_Model.Libs.BPE.Utils.json_utils import load_json class TokeNanoCore: - def __init__(self, - bpe_vocabulary: dict[tuple[int, int], int] - # special_vocabulary: dict[str, int] - ): - self._bpe = NanoSocratesBPE(bpe_vocabulary) - - # special_vocabulary = [token.value for token in SpecialToken] - special_token_list = [token.value for token in SpecialToken] - self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list),self._bpe.vocabulary_size) + def __init__( + self, + bpe_vocabulary: dict[tuple[int, int], int], + special_token_list: list[str], + # special_vocabulary: dict[str, int] + ): - self._special_bpe = NanoSocratesSpecial(vocabulary_index=None) # technically its not a bpe but more something like an "autoencoder" - self.prepare_special_token_vocabulary() - - - def encode(self, corpus : str) -> list[int]: - output : list[int] = [] - for piece, token_type in self._splitter.split_text(corpus): + self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary) + + SPECIAL_REGEX = special_regex_maker(special_token_list) + BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size + + self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE) + self.__special_encoder = NanoSocratesSpecial( + BPE_VOCABULARY_SIZE, special_token_list + ) + + def encode(self, corpus: str) -> list[int]: + output: list[int] = [] + for piece, token_type in self.__splitter.split_text(corpus): if token_type == TokenType.SPECIAL: - output.extend(self._special_bpe.encode(piece)) + output.extend(self.__special_encoder.encode(piece)) # slow but clear if token_type == TokenType.BPE: - output.extend(self._bpe.encode(piece)) + output.extend(self.__bpe_encoder.encode(piece)) return output - - - def decode(self, corpus : list[int])-> str: - output_str = '' - for token, token_type in self._splitter.split_tokens(corpus): + def decode(self, corpus: list[int]) -> str: + output_str = "" + for token, token_type in self.__splitter.split_tokens(corpus): # token is an integer if special, a list of integer otherwise if token_type == TokenType.SPECIAL: - output_str += self._special_bpe.decode(token) # it accept an integer + output_str += self.__special_encoder.decode( + token + ) # it accept an integer # slow but clear if token_type == TokenType.BPE: - output_str += self._bpe.decode(token) # it accept a list of integer + output_str += self.__bpe_encoder.decode( + token + ) # it accept a list of integer return output_str - - - - def prepare_special_token_vocabulary(self): - self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size) - - for special_token in [token.value for token in SpecialToken]: - self._special_bpe.add_special_word_to_vocabulary(special_token) - - self._special_bpe.build_reverse_vocabulary() - - -if __name__ == "__main__": - dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json" - dictionary = load_json(Path(dictionary_path)) - - tokenano = TokeNanoCore(dictionary) - - corpus = "dbp-dbr:How_It_Should_Have_Ended" - print(corpus) - - encoded_list = tokenano.encode(corpus) - print(encoded_list) - - decoded_string = tokenano.decode(encoded_list) - print(decoded_string) - -# [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 
95, 69, 110, 100, 101, 100, 1478] \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Enums/SpecialToken.py b/Project_Model/Libs/BPE/Enums/SpecialToken.py new file mode 100644 index 0000000..3f25a2d --- /dev/null +++ b/Project_Model/Libs/BPE/Enums/SpecialToken.py @@ -0,0 +1,21 @@ +from enum import Enum + + +class SpecialToken(Enum): + # (Enum, str) -> throws an error + START_TRIPLE_LIST = "" + START_TRIPLE = "" + END_TRIPLE = "" + SUBJECT = "" + RELATIONSHIP = "" + OBJECT = "" + ABSTRACT = "" + CORPUS_END = "" + + ## Tasks' Token + RDF_TO_TEXT = "" + TEXT_TO_RDF = "" + CONTINUE_RDF = "" + MASK = "" + + # BPE Training: From 51f491d0334c0f70972eee4fb986e706db53877d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:27:17 +0200 Subject: [PATCH 66/75] fixed typos --- Project_Model/Libs/BPE/Utils/special_regex_maker.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Utils/special_regex_maker.py b/Project_Model/Libs/BPE/Utils/special_regex_maker.py index fd4ac28..c2d3add 100644 --- a/Project_Model/Libs/BPE/Utils/special_regex_maker.py +++ b/Project_Model/Libs/BPE/Utils/special_regex_maker.py @@ -2,15 +2,14 @@ import re def special_regex_maker(special_tokens: list[str]) -> re.Pattern: - """ compile a regex for the special token + """compile a regex for the special token Args: special_tokens (list[str]): the list of special token Returns: re.Pattern: - """ + """ REGEX_STR = "|".join(special_tokens) return re.compile(REGEX_STR) - From c74689d01d0b8c3c5217cc15a806200c58d6eef0 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 13:27:38 +0200 Subject: [PATCH 67/75] Fixed tests to reflect new version of tokenizer --- Project_Model/Tests/splitter_test.py | 26 ++++++++++++++------------ Project_Model/Tests/tokenano_test.py | 2 +- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/Project_Model/Tests/splitter_test.py b/Project_Model/Tests/splitter_test.py index 154e24e..2bf9a0f 100644 --- a/Project_Model/Tests/splitter_test.py +++ b/Project_Model/Tests/splitter_test.py @@ -18,7 +18,8 @@ class TestSplitter: EXPECTED_CHUNKS = [ ("", TokenType.SPECIAL), - ("Lorem ", TokenType.BPE), + ("Lorem", TokenType.BPE), + (" ", TokenType.BPE), ("", TokenType.SPECIAL), ] @@ -43,9 +44,10 @@ class TestSplitter: EXPECTED_CHUNKS = [ ("ipsu", TokenType.BPE), ("", TokenType.SPECIAL), - ("m d", TokenType.BPE), + ("m", TokenType.BPE), + (" d", TokenType.BPE), ("", TokenType.SPECIAL), - #("olor", TokenType.BPE) + # ("olor", TokenType.BPE) ] CHUNKS = list(SPLITTER.split_text(TEXT)) @@ -69,7 +71,8 @@ class TestSplitter: EXPECTED_CHUNKS = [ ("ipsu", TokenType.BPE), ("", TokenType.SPECIAL), - ("m d", TokenType.BPE), + ("m", TokenType.BPE), + (" d", TokenType.BPE), ("", TokenType.SPECIAL), ("", TokenType.SPECIAL), ("", TokenType.SPECIAL), @@ -134,12 +137,12 @@ class TestSplitter: def test_split_token_decode_simple(self): # to test the token split into special and bpe SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473) - token_list = [100,101,1477] + token_list = [100, 101, 1477] CHUNKS = list(SPLITTER.split_tokens(token_list)) EXPECTED_CHUNKS = [ - ([100,101], TokenType.BPE), - (1477, TokenType.SPECIAL), + ([100, 101], TokenType.BPE), + ([1477], TokenType.SPECIAL), ] assert len(CHUNKS) == len(EXPECTED_CHUNKS) @@ -155,12 +158,12 @@ class TestSplitter: def test_split_token_decode_simple_malformed(self): # to test the 
token split into special and bpe SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473) - token_list = [100,101,1477,100] + token_list = [100, 101, 1477, 100] CHUNKS = list(SPLITTER.split_tokens(token_list)) EXPECTED_CHUNKS = [ - ([100,101], TokenType.BPE), - (1477, TokenType.SPECIAL), + ([100, 101], TokenType.BPE), + ([1477], TokenType.SPECIAL), ] assert len(CHUNKS) == len(EXPECTED_CHUNKS) @@ -174,7 +177,6 @@ class TestSplitter: assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE - # Useful to debug weird cases if __name__ == "__main__": - TestSplitter().test_split_trailing_text() \ No newline at end of file + TestSplitter().test_split_trailing_text() diff --git a/Project_Model/Tests/tokenano_test.py b/Project_Model/Tests/tokenano_test.py index 2dc7779..c8f0d88 100644 --- a/Project_Model/Tests/tokenano_test.py +++ b/Project_Model/Tests/tokenano_test.py @@ -13,7 +13,7 @@ class TestTokeNano: VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258} # EXPECTED = [258] - TOKE_NANO = TokeNanoCore(VOCABULARY) + TOKE_NANO = TokeNanoCore(VOCABULARY, ["", ""]) ENCODED = TOKE_NANO.encode(TEXT) DECODED = TOKE_NANO.decode(ENCODED) From 9c5f42153f836abc6e7331a04c47b7fbe4c6d9aa Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 15:17:44 +0200 Subject: [PATCH 68/75] fixed typos --- Project_Model/Libs/BPE/Utils/json_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Project_Model/Libs/BPE/Utils/json_utils.py b/Project_Model/Libs/BPE/Utils/json_utils.py index 716e93a..b98ac2f 100644 --- a/Project_Model/Libs/BPE/Utils/json_utils.py +++ b/Project_Model/Libs/BPE/Utils/json_utils.py @@ -2,15 +2,15 @@ import json from pathlib import Path -def save_json(vocabulary: dict, path: Path): +def save_json(dictionary: dict, path: Path): - json_string = json.dumps(vocabulary) + json_string = json.dumps(dictionary) FILE = open(path, "w") FILE.write(json_string) FILE.close() -def load_json(path: Path) -> dict[tuple[int, int], int]: +def load_json(path: Path) -> dict: FILE = open(path, "r") json_string = FILE.read() FILE.close() From 55e0d2ac23412680d03933d5968a9a12b09fbf6d Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:08:11 +0200 Subject: [PATCH 69/75] Fixed a encoding bug --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index d517f04..a74412d 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -162,7 +162,11 @@ class NanoSocratesBPE(Encoder): # in this case there was a candidate token to substitute the couple of element NEW_PIECE.append(CANDIDATE_TOKEN) - index += 2 + + index += 1 + if index == PIECE_LENGTH: + NEW_PIECE.append(piece[index]) + index += 1 return NEW_PIECE From 0ee6e480044ce6e4fc5acebaf3756dc587416f97 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Fri, 3 Oct 2025 16:09:53 +0200 Subject: [PATCH 70/75] Fixed the same bug as before, but this time is correct --- Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index a74412d..a5dab9b 100644 --- 
a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -163,10 +163,10 @@ class NanoSocratesBPE(Encoder):
 
             # in this case there was a candidate token to substitute the couple of element
             NEW_PIECE.append(CANDIDATE_TOKEN)
-            index += 1
+            index += 2
+
             if index == PIECE_LENGTH:
                 NEW_PIECE.append(piece[index])
-                index += 1
 
         return NEW_PIECE
 

From 0f95aeb1224051bf462e3dc3d4c7662a1e72665d Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Fri, 3 Oct 2025 16:26:01 +0200
Subject: [PATCH 71/75] toy dictionary for bpe implemented

---
 Assets/Model/toy_10/README.md           | 3 +++
 Assets/Model/toy_10/toy_dictionary.json | 3 +++
 2 files changed, 6 insertions(+)
 create mode 100644 Assets/Model/toy_10/README.md
 create mode 100644 Assets/Model/toy_10/toy_dictionary.json

diff --git a/Assets/Model/toy_10/README.md b/Assets/Model/toy_10/README.md
new file mode 100644
index 0000000..b97981a
--- /dev/null
+++ b/Assets/Model/toy_10/README.md
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:806baf1def1f5b785775ae8e4bcf028d897206da2edd76b6702b1838f5880923
+size 283
diff --git a/Assets/Model/toy_10/toy_dictionary.json b/Assets/Model/toy_10/toy_dictionary.json
new file mode 100644
index 0000000..5f47d51
--- /dev/null
+++ b/Assets/Model/toy_10/toy_dictionary.json
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0b091b3b15bfc49b470bb9da158bc167aa797897f1ed11c012268eb4d520654b
+size 183342

From d2a3dfe90fced99aaf2d5a1802b5cb3e35b6eab7 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 17:59:46 +0200
Subject: [PATCH 72/75] Fixed bug

---
 Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
index a5dab9b..bcb0c0f 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
@@ -164,9 +164,9 @@ class NanoSocratesBPE(Encoder):
             NEW_PIECE.append(CANDIDATE_TOKEN)
 
             index += 2
-
+
             if index == PIECE_LENGTH:
-                NEW_PIECE.append(piece[index]) 
+                NEW_PIECE.append(piece[index])
 
         return NEW_PIECE
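Patches 69, 70 and 72 above all circle the same off-by-one: after a merge, the loop can stop one element short and silently drop the trailing element. A condensed sketch of the loop in its final, fixed state, with a toy merge table (illustrative, not the class's exact code):

```python
TOY_MERGES: dict[tuple[int, int], int] = {(97, 98): 256}  # "ab" -> 256


def round_encode(piece: list[int]) -> list[int]:
    if len(piece) == 1:
        return piece

    last = len(piece) - 1
    out: list[int] = []
    index = 0
    while index < last:
        token = TOY_MERGES.get((piece[index], piece[index + 1]))

        if token is None:
            out.append(piece[index])
            index += 1
            if index == last:  # unmerged trailing element
                out.append(piece[index])
            continue

        out.append(token)
        index += 2
        if index == last:  # the fix: a merge can also leave a trailer
            out.append(piece[index])
    return out


# "aba": the merge consumes positions 0-1 and index lands on `last`,
# so the trailing "a" is appended instead of being lost.
assert round_encode([97, 98, 97]) == [256, 97]
```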
From 8a21cb1b73d46c51c07e1c9f0ce0f565649ae2ef Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:00:52 +0200
Subject: [PATCH 73/75] added python analysis

---
 .vscode/settings.json | 59 +++++++++++++++++++++----------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/.vscode/settings.json b/.vscode/settings.json
index cae6d86..7f479da 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,35 +1,34 @@
 {
-  // Always treat the project root as the working dir for Jupyter
-  "jupyter.notebookFileRoot": "${workspaceFolder}",
-
-  // When you click "Run Python File in Terminal", DON'T cd into the file's folder
-  "python.terminal.executeInFileDir": false,
-
-  // Start new integrated terminals at the project root
-  "terminal.integrated.cwd": "${workspaceFolder}",
-
-  // Make pytest run from the root without needing a pytest.ini
-  "python.testing.pytestEnabled": true,
-  "python.testing.cwd": "${workspaceFolder}",
-  "python.testing.pytestArgs": ["src/test"],
-
-  // Help Pylance resolve imports like `from src...` without red squiggles
-  "python.analysis.extraPaths": ["${workspaceFolder}"],
-
-  // For linux
-  "terminal.integrated.env.linux": {
-    "PYTHONPATH": "${workspaceFolder}"
-  },
-  // For OSX
-  "terminal.integrated.env.osx": {
-    "PYTHONPATH": "${workspaceFolder}"
-  },
-  // For Windows
-  "terminal.integrated.env.windows": {
-    "PYTHONPATH": "${workspaceFolder}"
-  }
+    // Always treat the project root as the working dir for Jupyter
+    "jupyter.notebookFileRoot": "${workspaceFolder}",
+    // When you click "Run Python File in Terminal", DON'T cd into the file's folder
+    "python.terminal.executeInFileDir": false,
+    // Start new integrated terminals at the project root
+    "terminal.integrated.cwd": "${workspaceFolder}",
+    // Make pytest run from the root without needing a pytest.ini
+    "python.testing.pytestEnabled": true,
+    "python.testing.cwd": "${workspaceFolder}",
+    "python.testing.pytestArgs": [
+        "src/test"
+    ],
+    // Help Pylance resolve imports like `from src...` without red squiggles
+    "python.analysis.extraPaths": [
+        "${workspaceFolder}"
+    ],
+    // For linux
+    "terminal.integrated.env.linux": {
+        "PYTHONPATH": "${workspaceFolder}"
+    },
+    // For OSX
+    "terminal.integrated.env.osx": {
+        "PYTHONPATH": "${workspaceFolder}"
+    },
+    // For Windows
+    "terminal.integrated.env.windows": {
+        "PYTHONPATH": "${workspaceFolder}"
+    },
+    "python.analysis.typeCheckingMode": "standard"
 }
-
 // {
 //     // Always treat the project root as the working dir for Jupyter
 //     "jupyter.notebookFileRoot": "${workspaceFolder}",

From 149deb407db18fd053b8916879e290a0855da0bf Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:01:05 +0200
Subject: [PATCH 74/75] added cache directories

---
 .gitignore                                       | 1 +
 Project_Model/Tests/trainer_files/cache/.gitkeep | 0
 2 files changed, 1 insertion(+)
 create mode 100644 Project_Model/Tests/trainer_files/cache/.gitkeep

diff --git a/.gitignore b/.gitignore
index 0797ef4..314d94c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -254,4 +254,5 @@ $RECYCLE.BIN/
 
 # ---> Custom
 **/Tmp/**
+**/cache/**
 !**/.gitkeep
diff --git a/Project_Model/Tests/trainer_files/cache/.gitkeep b/Project_Model/Tests/trainer_files/cache/.gitkeep
new file mode 100644
index 0000000..e69de29

From 8e095ebb7a637de17d06c5fce4a1162e2c671b0a Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Fri, 3 Oct 2025 18:02:27 +0200
Subject: [PATCH 75/75] Added papers stub

---
 docs/PAPERS.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 docs/PAPERS.md

diff --git a/docs/PAPERS.md b/docs/PAPERS.md
new file mode 100644
index 0000000..957d00a
--- /dev/null
+++ b/docs/PAPERS.md
@@ -0,0 +1,56 @@
+# Research Material
+
+## BPE
+
+- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
+- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
+- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
+- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
+- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
+- [Implementing a byte pair encoding(BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
+- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
+- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
+- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
+- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
+- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
+- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
+- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
+- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
+- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
+- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
+
+## Embedder
+
+- [ROFORMER: ENHANCED TRANSFORMER WITH ROTARY POSITION EMBEDDING](https://arxiv.org/pdf/2104.09864)
+- [You could have designed state of the art positional encoding](https://huggingface.co/blog/designing-positional-encoding)
+- [Rotary Embeddings: A Relative Revolution](https://blog.eleuther.ai/rotary-embeddings/)
+- [Round and Round We Go! What makes Rotary Positional Encodings useful?](https://arxiv.org/html/2410.06205v1)
+- [Inside RoPE: Rotary Magic into Position Embeddings](https://learnopencv.com/rope-position-embeddings/)
+- [What Rotary Position Embedding Can Tell Us: Identifying Query and Key Weights Corresponding to Basic Syntactic or High-level Semantic Information](https://openreview.net/pdf?id=e5Mv7iWfVW)
+- [A gentle introduction to Rotary Position Embedding](https://krasserm.github.io/2022/12/13/rotary-position-embedding/)
+- [Context-aware Rotary Position Embedding](https://arxiv.org/pdf/2507.23083)
+- [LIERE: GENERALIZING ROTARY POSITION ENCODINGS TO HIGHER DIMENSIONAL INPUTS](https://openreview.net/pdf?id=xHMMt7r3GW)
+- [Rotary Positional Embeddings (RoPE)](https://nn.labml.ai/transformers/rope/index.html)
+- [Decoding Llama3: An explainer for tinkerers](https://hasgeek.com/simrathanspal/the-llama3-guide/sub/decoding-llama3-part-4-rotary-positional-embedding-3K8ZHpdLi6E56N8ejnaWzm)
+
+## Attention
+
+- [Standard Self-Attention (Attention is all you need)](https://arxiv.org/pdf/1706.03762)
+- [TransMLA: Multi-Head Latent Attention Is All You Need](https://arxiv.org/pdf/2502.07864)
+- [A Gentle Introduction to Multi-Head Latent Attention (MLA)](https://machinelearningmastery.com/a-gentle-introduction-to-multi-head-latent-attention-mla/)
+- [Understanding Multi-Head Latent Attention](https://planetbanatt.net/articles/mla.html)
+- [DeepSeek's Multi-Head Latent Attention](https://liorsinai.github.io/machine-learning/2025/02/22/mla.html)
+- [MatchFormer: Interleaving Attention in Transformers for Feature Matching](https://arxiv.org/pdf/2203.09645)
+- [FIT: Far-reaching Interleaved Transformers](https://arxiv.org/pdf/2305.12689)
+- [Gemma explained: What’s new in Gemma 3](https://developers.googleblog.com/en/gemma-explained-whats-new-in-gemma-3/)
+- [The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)
+- [Attention was never enough: Tracing the rise of hybrid LLMs](https://www.ai21.com/blog/rise-of-hybrid-llms/)
+
+## Spanned Masking
+
+- [Salient Span Masking for Temporal Understanding](https://arxiv.org/pdf/2303.12860)
+- [PMI-MASKING: PRINCIPLED MASKING OF CORRELATED SPANS](https://arxiv.org/pdf/2010.01825)
+
+## Models
+
+- [What Language Model Architecture and Pretraining Objective Work Best for Zero-Shot Generalization?](https://arxiv.org/pdf/2204.05832)