From 0a698e9837367de4e42d5b7506ed2a84b4e8f440 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 25 Sep 2025 19:09:52 +0200 Subject: [PATCH 1/9] Added schema to extract from DB for BPE --- .../bpe-pipeline.excalidraw.json | 897 ++++++++++++++++++ 1 file changed, 897 insertions(+) create mode 100644 Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json diff --git a/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json new file mode 100644 index 0000000..0edf3cf --- /dev/null +++ b/Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json @@ -0,0 +1,897 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "3zbCui3XtIGozHXTVAGRp", + "type": "rectangle", + "x": 316.5, + "y": 123, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a0", + "roundness": { + "type": 3 + }, + "seed": 1698427950, + "version": 35, + "versionNonce": 601575602, + "isDeleted": false, + "boundElements": [ + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + } + ], + "updated": 1758818588814, + "link": null, + "locked": false + }, + { + "id": "wD66RDbG05HfvRhAtMb0J", + "type": "text", + "x": 480.98004150390625, + "y": 183.25, + "width": 107.5399169921875, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a1", + "roundness": null, + "seed": 910769774, + "version": 31, + "versionNonce": 1120989938, + "isDeleted": false, + "boundElements": null, + "updated": 1758818416720, + "link": null, + "locked": false, + "text": "dataset.db", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "3zbCui3XtIGozHXTVAGRp", + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "87-MeaiZGT1wln0nggYPZ", + "type": "rectangle", + "x": 339.5, + "y": 309.5, + "width": 392, + "height": 156, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 655550318, + "version": 77, + "versionNonce": 1103939826, + "isDeleted": false, + "boundElements": null, + "updated": 1758818339000, + "link": null, + "locked": false + }, + { + "id": "EjUxEhZqEBzwvlw0VE9eJ", + "type": "rectangle", + "x": 355.5, + "y": 327, + "width": 162, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": { + "type": 3 + }, + "seed": 1739846638, + "version": 64, + "versionNonce": 1594290034, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "ogRkV0neHrhEKTE6zlggl" + } + ], + "updated": 1758818391415, + "link": null, + "locked": false + }, + { + "id": "ogRkV0neHrhEKTE6zlggl", + "type": "text", + "x": 378.7100524902344, + "y": 377.25, + "width": 115.57989501953125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3V", + "roundness": null, + "seed": 2037675630, + "version": 12, + "versionNonce": 1286472046, + "isDeleted": false, + "boundElements": null, + "updated": 1758818399222, + "link": null, + "locked": false, + "text": "RDF_String", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "EjUxEhZqEBzwvlw0VE9eJ", + "originalText": "RDF_String", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hoIRMNiMJZl4YDo-hovWy", + "type": "rectangle", + "x": 542.5, + "y": 327, + "width": 173, + "height": 125.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": { + "type": 3 + }, + "seed": 1189796530, + "version": 99, + "versionNonce": 1071057006, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "rsapATFAT5YSBCXzLupgZ" + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "rsapATFAT5YSBCXzLupgZ", + "type": "text", + "x": 585.6800384521484, + "y": 377.25, + "width": 86.63992309570312, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a5", + "roundness": null, + "seed": 829619694, + "version": 12, + "versionNonce": 713902318, + "isDeleted": false, + "boundElements": null, + "updated": 1758818405150, + "link": null, + "locked": false, + "text": "Abstract", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "hoIRMNiMJZl4YDo-hovWy", + "originalText": "Abstract", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "jSx8ApfhtRs_nk37VvDMb", + "type": "rectangle", + "x": 316.5, + "y": 511, + "width": 436.5, + "height": 145.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a6", + "roundness": { + "type": 3 + }, + "seed": 492582894, + "version": 132, + "versionNonce": 893797614, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "6E23g-rgowNqHsBxX-LuM" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow" + } + ], + "updated": 1758818593647, + "link": null, + "locked": false + }, + { + "id": "6E23g-rgowNqHsBxX-LuM", + "type": "text", + "x": 499.9100341796875, + "y": 571.25, + "width": 69.679931640625, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a7", + "roundness": null, + "seed": 267696178, + "version": 132, + "versionNonce": 1668243186, + "isDeleted": false, + "boundElements": null, + "updated": 1758818543211, + "link": null, + "locked": false, + "text": "Pandas", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "jSx8ApfhtRs_nk37VvDMb", + "originalText": "Pandas", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "ohj18N4AOTDz5lJNcV9gi", + "type": "rectangle", + "x": 261, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a8", + "roundness": { + "type": 3 + }, + "seed": 1446207150, + "version": 279, + "versionNonce": 317375026, + "isDeleted": false, + "boundElements": [ + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text" + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "Ea1_ke2wA0D8ZjVOUtvfY", + "type": "text", + "x": 297.0800323486328, + "y": 796.5, + "width": 84.83993530273438, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a9", + "roundness": null, + "seed": 435116270, + "version": 199, + "versionNonce": 1282911218, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "train.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "ohj18N4AOTDz5lJNcV9gi", + "originalText": "train.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "A4Y54Y26fe257U_QU9lxX", + "type": "rectangle", + "x": 464, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aA", + "roundness": { + "type": 3 + }, + "seed": 186148850, + "version": 232, + "versionNonce": 997119858, + "isDeleted": false, + "boundElements": [ + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text" + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "v4TvUlDEjH7EvPDmtbOn2", + "type": "text", + "x": 476.3500442504883, + "y": 796.5, + "width": 132.29991149902344, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aB", + "roundness": null, + "seed": 1131059634, + "version": 171, + "versionNonce": 239540530, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "validation.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "A4Y54Y26fe257U_QU9lxX", + "originalText": "validation.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "mPaYpJ9Xn7tlJPmKPqJKJ", + "type": "rectangle", + "x": 674.5, + "y": 765.5, + "width": 157, + "height": 87, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aC", + "roundness": { + "type": 3 + }, + "seed": 1049323314, + "version": 235, + "versionNonce": 330560690, + "isDeleted": false, + "boundElements": [ + { + "type": "text", + "id": "kg9nm2rpud6cax5aNPSnu" + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow" + } + ], + "updated": 1758818570993, + "link": null, + "locked": false + }, + { + "id": "kg9nm2rpud6cax5aNPSnu", + "type": "text", + "x": 711.4300231933594, + "y": 796.5, + "width": 83.13995361328125, + "height": 25, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aD", + "roundness": null, + "seed": 522572142, + "version": 193, + "versionNonce": 1920372338, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "text": "test.txt", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "center", + "verticalAlign": "middle", + "containerId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "originalText": "test.txt", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "hyFKqXwet_F79QM71atgI", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 195.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aG", + "roundness": null, + "seed": 873266098, + "version": 71, + "versionNonce": 541154738, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + -195.25, + 49.5 + ], + [ + -195.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "ohj18N4AOTDz5lJNcV9gi", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "x_DP1FcQ7jraGz0gBuDi3", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 218.25, + "height": 99, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aH", + "roundness": null, + "seed": 1210817582, + "version": 77, + "versionNonce": 1483392370, + "isDeleted": false, + "boundElements": null, + "updated": 1758818580594, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 49.5 + ], + [ + 218.25, + 49.5 + ], + [ + 218.25, + 99 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "mPaYpJ9Xn7tlJPmKPqJKJ", + "fixedPoint": [ + 0.4993630573248406, + -0.05747126436781609 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "1IGbCps2EHnzKgJUWM5nq", + "type": "arrow", + "x": 534.65, + "y": 661.5, + "width": 0.5719232650604908, + "height": 99.07394122590165, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aK", + "roundness": null, + "seed": 1205316658, + "version": 96, + "versionNonce": 1748050674, + "isDeleted": false, + "boundElements": null, + "updated": 1758818570993, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + -0.5719232650604908, + 99.07394122590165 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "fixedPoint": [ + 0.49977090492554405, + 1.034364261168385 + ], + "focus": 0, + "gap": 0 + }, + "endBinding": { + "elementId": "A4Y54Y26fe257U_QU9lxX", + "fixedPoint": [ + 0.44635717665566554, + -0.056621365219521276 + ], + "focus": 0, + "gap": 0 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": true, + "fixedSegments": null, + "startIsSpecial": null, + "endIsSpecial": null + }, + { + "id": "gus_rxauKJ6T2L_F59PfN", + "type": "arrow", + "x": 539, + "y": 271.5, + "width": 0, + "height": 33.5, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aL", + "roundness": null, + "seed": 763990258, + "version": 17, + "versionNonce": 1028811378, + "isDeleted": false, + "boundElements": null, + "updated": 1758818588814, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 33.5 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "3zbCui3XtIGozHXTVAGRp", + "focus": -0.019473081328751418, + "gap": 3 + }, + "endBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": -1.0404624277456647, + "gap": 30.7545797799829 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + }, + { + "id": "Wk1bJbbtC31FqObEL5xWt", + "type": "arrow", + "x": 536.5, + "y": 468.5, + "width": 0, + "height": 39, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aM", + "roundness": null, + "seed": 1489771054, + "version": 33, + "versionNonce": 1828178606, + "isDeleted": false, + "boundElements": null, + "updated": 1758818593647, + "link": null, + "locked": false, + "points": [ + [ + 0, + 0 + ], + [ + 0, + 39 + ] + ], + "lastCommittedPoint": null, + "startBinding": { + "elementId": "hoIRMNiMJZl4YDo-hovWy", + "focus": 1.0693641618497107, + "gap": 27.157190169432425 + }, + "endBinding": { + "elementId": "jSx8ApfhtRs_nk37VvDMb", + "focus": 0.008018327605956525, + "gap": 3.5 + }, + "startArrowhead": null, + "endArrowhead": "triangle", + "elbowed": false + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From e521b0704e1941ede504f58a615d8a20fa77461b Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Thu, 25 Sep 2025 19:19:11 +0200 Subject: [PATCH 2/9] deleted TODO in path_splitter_tree, as it was already resolved --- Scripts/DataCleaning/path_splitter_tree.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py index e7f6f9e..9c0914a 100644 --- a/Scripts/DataCleaning/path_splitter_tree.py +++ b/Scripts/DataCleaning/path_splitter_tree.py @@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str): FILE = open(file, "r", encoding="utf-8") - # TODO: Change here so it takes single URI from a CSV file # It is needed the header-name for row in csv.DictReader(FILE): From 650b37c586fe07d9bb83d4471a727c12cd717dfb Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Fri, 26 Sep 2025 11:24:34 +0200 Subject: [PATCH 3/9] Added vscode setting to execute jupyternotebook from root dir --- .vscode/settings.json | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..17ae78b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "jupyter.notebookFileRoot": "${workspaceFolder}" +} \ No newline at end of file From 6ddb7de9da1af4fad8d8bae265f0622f56ba6bec Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 15:19:19 +0200 Subject: [PATCH 4/9] Added sqlAlchemy to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index e87882c..70a3169 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,3 +15,4 @@ tzdata==2025.2 urllib3==2.5.0 wheel==0.45.1 Wikipedia-API==0.8.1 +SQLAlchemy From bd72ad3571bf2710cd154c5cf08b448dc194f13d Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 15:21:26 +0200 Subject: [PATCH 5/9] Added file to execute the complete cleaning pipeline --- .../data_output_models/bpe_corpus.py | 21 ++ .../rdf_completation_task.py | 26 +++ .../data_output_models/rdf_mask_task.py | 58 ++++++ .../data_output_models/rdf_text_tasks.py | 26 +++ Scripts/DataCleaning/filter.py | 184 ++++++++++++++++++ Scripts/DataCleaning/pipeline.py | 107 ++++++++++ .../Libs/CleaningPipeline/special_token.py | 21 ++ Scripts/Libs/CleaningPipeline/sql_endpoint.py | 144 ++++++++++++++ Scripts/Libs/Utils/dataframe_interaction.py | 9 + 9 files changed, 596 insertions(+) create mode 100644 Scripts/DataCleaning/data_output_models/bpe_corpus.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_completation_task.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_mask_task.py create mode 100644 Scripts/DataCleaning/data_output_models/rdf_text_tasks.py create mode 100644 Scripts/DataCleaning/filter.py create mode 100644 Scripts/DataCleaning/pipeline.py create mode 100644 Scripts/Libs/CleaningPipeline/special_token.py create mode 100644 Scripts/Libs/CleaningPipeline/sql_endpoint.py create mode 100644 Scripts/Libs/Utils/dataframe_interaction.py diff --git a/Scripts/DataCleaning/data_output_models/bpe_corpus.py b/Scripts/DataCleaning/data_output_models/bpe_corpus.py new file mode 100644 index 0000000..a0348b6 --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/bpe_corpus.py @@ -0,0 +1,21 @@ +from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken +import pandas as pd + +class BPE_corpus(): + + def __init__(self, output_path :str): + self.output_handler = open(output_path, "w") + + def close(self): + # add corpus end before closing + self.output_handler.write(SpecialToken.CORPUS_END.value) + self.output_handler.close() + + def write_from_str(self, output: str): + if output == '': + return + self.output_handler.write(output) + + def write_from_df(self, df: pd.DataFrame): + self.write_from_str(get_raw_from_dataframe(df)) \ No newline at end of file diff --git a/Scripts/DataCleaning/data_output_models/rdf_completation_task.py b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py new file mode 100644 index 0000000..111b2b9 --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/rdf_completation_task.py @@ -0,0 +1,26 @@ +import pandas as pd + +class RDF_completation_task_dataset(): + """ + Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context". + Each RDF is saved as str + CSV Composition: ["MovieID","RDF"] + """ + def __init__(self, output_path:str): + + + self.output = open(output_path, "w") + # then the first row as header + header = ["MovieID","RDF"] + self.output.write(",".join(header) + "\n") + + def close(self): + self.output.close() + + def write(self, RDF: pd.DataFrame): + """ + Args: + RDF (pd.DataFrame): ["MovieID","RDF"] + """ + + RDF.to_csv(self.output, index=False, header=False) \ No newline at end of file diff --git a/Scripts/DataCleaning/data_output_models/rdf_mask_task.py b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py new file mode 100644 index 0000000..01b943d --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/rdf_mask_task.py @@ -0,0 +1,58 @@ +import pandas as pd + +# do not worry about circular dependencies, this class will never call something else +from Scripts.DataCleaning.filter import PipelineApplier + +class RDF_mask_task_dataset(): + """ + Write the CSV for the third task, which is "Predicting a masked component within an RDF triple". + The CSV is like: for each RDF there will be 3 rows, where every time one of the componments is missing. + CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"] + """ + def __init__(self, output_path:str): + + # this methods will only be used by this class, but they belong in a lower level + self._build_triple = PipelineApplier.build_triple + self._build_incomplete_triple = PipelineApplier.build_incomplete_triple + + self.output = open(output_path, "w") + # then the first row as header + header = ["MovieID","IncompleteRDF","Missing","RDF"] + self.output.write(",".join(header) + "\n") + + def close(self): + self.output.close() + + def write(self, RDF: pd.DataFrame): + rdf_complete = self._build_triple(RDF) + + rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"])) + rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"])) + rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"])) + #### + df_subject = pd.DataFrame({ + "MovieID": RDF["MovieID"], + "IncompleteRDF": rdf_without_subject, + "Missing": RDF["SubjectURI"], + "RDF": rdf_complete, + }) + + df_relationship = pd.DataFrame({ + "MovieID": RDF["MovieID"], + "IncompleteRDF": rdf_without_relationship, + "Missing": RDF["RelationshipURI"], + "RDF": rdf_complete, + }) + + df_object = pd.DataFrame({ + "MovieID": RDF["MovieID"], + "IncompleteRDF": rdf_without_object, + "Missing": RDF["ObjectURI"], + "RDF": rdf_complete, + }) + + + output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True) + output_df.to_csv(self.output, index=False, header=False) + + diff --git a/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py new file mode 100644 index 0000000..918e600 --- /dev/null +++ b/Scripts/DataCleaning/data_output_models/rdf_text_tasks.py @@ -0,0 +1,26 @@ +import pandas as pd + +class RDF_text_task_dataset(): + """ + Write the CSV for the firsts two tasks, which are "Generating structured RDF triples from natural language text" and reverse. + In the CVS the RDFs will be saved toghether as a string. + CSV Composition: ["MovieID","RDFs","Abstract"] + """ + def __init__(self, output_path:str): + + + self.output = open(output_path, "w") + # then the first row as header + header = ["MovieID","RDFs","Abstract"] + self.output.write(",".join(header) + "\n") + + def close(self): + self.output.close() + + def write(self, RDF: pd.DataFrame): + """ + Args: + RDF (pd.DataFrame): ["MovieID","Triple","Abstract"] + """ + + RDF.to_csv(self.output, index=False, header=False) \ No newline at end of file diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py new file mode 100644 index 0000000..50d6ead --- /dev/null +++ b/Scripts/DataCleaning/filter.py @@ -0,0 +1,184 @@ +# This file deletes in the pipeline the unwanted relationship by different rules +import pandas as pd +import sqlite3 +import numpy as np + +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken +from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint + + +class PipelineApplier(): + + def __init__(self): + + self.MOVIE_FILTER = pd.DataFrame() + self.REL_FILTER = pd.DataFrame() + + + def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame: + return RDF[RDF["RelationshipURI"]!= uri] + + def generate_list_relationship_filter(self, filter_list: list[str]) -> None: + """Store RelationshipURI filters as a set """ + self.relationship_filter_list: set[str] = set(filter_list) + + def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame: + """Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter""" + return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)] + + + def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int): + """ + You MUST call this before filter the dataset by movie frequence [filter_by_frequence_movie_id()], + since this method creates such filter + Args: + MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"] + min_treshold (int): + max_treshold (int): + """ + MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold] + MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold] + self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"] + + def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int): + REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold] + REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold] + self.REL_FILTER = REL_COUNT #["RelationshipURI"] + + def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame: + RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])] + return RDF + + def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame: + RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])] + return RDF + + def rdf_add_special_token(self, RDF: pd.DataFrame): + """ + Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI. + Check Scrits/Libs/CleaningPipeline/special_token.py for the up-to-date special token. + It only adds the special token of the three element of the RDF, no other special token. + Args: + RDF (pd.DataFrame): + Returns: + pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"] + """ + # if the filter runned before sliced the RDF and created a View, here the problem is resolved + # for more context: SettingWithCopyWarning + RDF = RDF.copy() + # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token + RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"] + RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"] + RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"] + return RDF + + + def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame: + # dataset has SubjectURI RelationshipURI ObjectURI + # want to drop the '' in them + # Replace empty strings with NaN + RDF = RDF.replace('', np.nan) + # Drop rows where any of the key columns are NaN + RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"]) + return RDF + + def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame: + """_summary_ + + Args: + RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"] + + Returns: + pd.DataFrame: ["MovieID","Triple","Abstract"] + """ + # to execute this method you have to have itereted by movie_id + # because as design we want at the end one row for each movie + # MovieID and abstract can be given as input for a more generic method + # movie_id = RDF["MovieID"].iloc(0) + # abstract = RDF["Abstract"].iloc(0) + # first let's combine each row creating column triple as join of rdf + RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"] + # special token + RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value + # combine rows into one + # MovieID and Abstract are unique for each other 1 <-> 1 + RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index() + # add special token for: start of triple, end of triple and start of abstract + RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + return RDF[["MovieID","Triple","Abstract"]] + + def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame: + """ + Args: + RDF (pd.DataFrame): ["MovieID","Triple","Abstract"] + + Returns: + pd.DataFrame: ["MovieID","Triple","Abstract"] + """ + # combine rows into one + # MovieID and Abstract are unique for each other 1 <-> 1 + RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index() + # add special token for: start of triple, end of triple and start of abstract + RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + return RDF[["MovieID","Triple","Abstract"]] + + + @staticmethod + def build_triple(RDF: pd.DataFrame): + """ + Obtains joined RDF triple in one element, togheter with START and END special token + Args: + RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"] + Returns: + pd.DataFrame: RDF["Triple"] (just this column) + """ + # let's combine each row creating column triple as join of rdf + RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"] + # special token + RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value + return RDF["Triple"] + + @staticmethod + def build_incomplete_triple(RDF: pd.DataFrame): + """ + Method helper used for the third task: "Predicting a masked component within an RDF triple". + Obtains joined RDF triple in one element, togheter with START and END special token. + The MISSING element will be replaced by the special token + Args: + RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"] + Returns: + RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME) + """ + # let's create a new column "Triple" with the joined RDF + + # the following creates a column of MASK token of the lenght of the dataframe, + # it is not needed since we expect to have a dataframe of just one column, but its more robust (AND SLOW) + MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index) + + RDF["Triple"] = ( + RDF.get("SubjectURI", MISSING) + + RDF.get("RelationshipURI", MISSING) + + RDF.get("ObjectURI", MISSING)) + # special token + RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value + return RDF["Triple"] + + @staticmethod + def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame: + # currently not used + """ + Method helper used for the third task: "Predicting a masked component within an RDF triple". + Given two Dataframe, the first containing the incompleted RDF and the other only the missing componment, + this methods applies the special token + Args: + RDF (pd.DataFrame): _description_ + + Returns: + pd.DataFrame: _description_ + """ + # take an example dataframe as ["SubjectURI",""] + # as input two dataframe, one with 2 column + return None + diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py new file mode 100644 index 0000000..e07294b --- /dev/null +++ b/Scripts/DataCleaning/pipeline.py @@ -0,0 +1,107 @@ +import re +from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint +from Scripts.DataCleaning.filter import PipelineApplier +# tasks dataset builder +from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset +from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus +from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset +from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset + +import pandas as pd + +class Pipeline(): + def __init__(self, output): + self.sql_endpoint = SqlEndpoint() + # classes to manage taskes' datasets + self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv") + self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt") + self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") + self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") + + # prepare the filter + # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset + self.filter_applier = PipelineApplier() + MOVIE_COUNT = self.sql_endpoint.get_movies_id_count() + REL_COUNT = self.sql_endpoint.get_relationship_count() + self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000) + self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) + # prepare the filter ot the relationshipURI you want to delete: + relationship_uri_banned_list = [ + "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract", + "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates", + "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment", + "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"] + self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list) + + + def _end_file_handler(self): + self.task_bpe_corpus.close() + self.task_rdf_mask.close() + self.task_rdf_text.close() + self.task_rdf_completation.close() + + def _get_cleaned_movie_rows(self): + for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): + RDF = self.filter_applier.drop_na_from_dataset(RDF) + RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) + RDF = self.filter_applier.filter_by_frequency_relationship(RDF) + # other filter + # + RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + if RDF.empty: + continue + RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE + yield RDF + + def execute_task_bpe_corpus(self): + for RDF in self._get_cleaned_movie_rows(): + RDF = self.filter_applier.rebuild_by_movie(RDF) + RDF = RDF[["Triple","Abstract"]] + self.task_bpe_corpus.write_from_df(RDF) + self._end_file_handler() + + + def execute_task_rdf_mask(self): + for RDF in self._get_cleaned_movie_rows(): + self.task_rdf_mask.write(RDF) + self._end_file_handler() + + def execute_tasks_rdf_text(self): + for RDF in self._get_cleaned_movie_rows(): + RDF = self.filter_applier.rebuild_by_movie(RDF) + self.task_rdf_text.write(RDF) + self._end_file_handler() + + def execute_task_rdf_completation(self): + for RDF in self._get_cleaned_movie_rows(): + RDF["Triple"] = self.filter_applier.build_triple(RDF) + self.task_rdf_completation.write(RDF[["MovieID","Triple"]]) + self._end_file_handler() + + + def execute_all_task(self): + for RDF in self._get_cleaned_movie_rows(): + self.task_rdf_mask.write(RDF) + + RDF["Triple"] = self.filter_applier.build_triple(RDF) + self.task_rdf_completation.write(RDF[["MovieID","Triple"]]) + + RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]]) + + self.task_rdf_text.write(RDF) + self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]]) + + self._end_file_handler() + + + + + + + +pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt") +# pipeline.execute_task_bpe_corpus() +# pipeline.execute_task_rdf_mask() +# pipeline.execute_tasks_rdf_text() +# pipeline.execute_task_rdf_completation() +pipeline.execute_all_task() \ No newline at end of file diff --git a/Scripts/Libs/CleaningPipeline/special_token.py b/Scripts/Libs/CleaningPipeline/special_token.py new file mode 100644 index 0000000..644ad71 --- /dev/null +++ b/Scripts/Libs/CleaningPipeline/special_token.py @@ -0,0 +1,21 @@ +from enum import Enum + +class SpecialToken(str, Enum): + # (Enum, str) -> throws an error + START_TRIPLE_LIST = "" + START_TRIPLE = "" + END_TRIPLE = "" + SUBJECT = "" + RELATIONSHIP = "" + OBJECT = "" + ABSTRACT = "" + CORPUS_END = "" + + ## Tasks' Token + RDF_TO_TEXT = "" + TEXT_TO_RDF = "" + CONTINUE_RDF = "" + MASK = "" + + #BPE Training: + \ No newline at end of file diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py new file mode 100644 index 0000000..4e43528 --- /dev/null +++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py @@ -0,0 +1,144 @@ +####################################################### +# This file stand as endpoint to interact with DB # +####################################################### + +# import sqlite3 +import pandas as pd +from sqlalchemy import create_engine +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + + +class SqlEndpoint(): + + def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500): + # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED + self.sql_engine = create_engine(f"sqlite:///{DB_PATH}") + # /// 3 slash -> relative path + # //// 4 slash -> absolute + # self.conn = self.sql_engine.connect().execution_options(stream_results=True) + # it seems that sqlite doenst support streamer cursor + # PRAGMA exeutes better in writing not reading + self.chunk_size_row = chunk_size_row + pass + + def get_RDF(self) -> pd.DataFrame : + + QUERY = """ + SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI + FROM RDFs + INNER JOIN Subjects USING (SubjectID) + INNER JOIN Relationships USING (RelationshipID) + INNER JOIN Objects USING (ObjectID); + """ + + return pd.read_sql_query(QUERY, self.CONN) + + def get_chunked_abbreviated_dataset(self) -> pd.DataFrame : + """ + Returns: + pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract + """ + + QUERY = """ + SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract + FROM RDFs + INNER JOIN ParsedSubjects USING (SubjectID) + INNER JOIN ParsedRelationships USING (RelationshipID) + INNER JOIN ParsedObjects USING (ObjectID) + INNER JOIN WikipediaAbstracts USING (MovieID); + """ + + # return pd.read_sql_query(QUERY, self.CONN, chunksize=500) + # sqlite3 + return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row) + + + def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame: + # DEPRECATED ! + start_token = SpecialToken() + QUERY = """ + SELECT + MovieID, + ? || SubjectURI AS SubjectURI, + ? || RelationshipURI AS RelationshipURI, + ? || ObjectURI AS ObjectURI, + Abstract + FROM RDFs + INNER JOIN ParsedSubjects USING (SubjectID) + INNER JOIN ParsedRelationships USING (RelationshipID) + INNER JOIN ParsedObjects USING (ObjectID) + INNER JOIN WikipediaAbstracts USING (MovieID); + """ + return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row) + + def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]: + """ + Gets each time a DataFrame per movie ( with all its rows in the dataset). + The retrieved RDFs are already abbrevieted by the sql parser + Yields: + Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract] + """ + # chunk by movieId, abstract is the same and some intersting logic are appliable + movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] + # CHOOSEN MOVIE: + # The Dark Knight : 117248 + # Inception : 147074 + # The Avengers : 113621 + # Cast Away : 1123 + # The Departed : 117586 + # American Psycho : 90177 + # Avatar : 71587 + # Django Unchained : 138952 + # Spirited Away : 144137 + # Knives Out : 148025 + movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + movie_ids = movie_list + + QUERY = """ + SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract + FROM RDFs + INNER JOIN ParsedSubjects USING (SubjectID) + INNER JOIN ParsedRelationships USING (RelationshipID) + INNER JOIN ParsedObjects USING (ObjectID) + INNER JOIN WikipediaAbstracts USING (MovieID) + WHERE MovieID = (?); + """ + + for movie_id in movie_ids: + yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,)) + + def get_movies_id_count(self) -> pd.DataFrame: + """ + Gets the count of each Movie in the Dataset + Returns: + Pandas.DataFrame: [MovieID, Count] + """ + QUERY = """ + SELECT MovieID, COUNT(*) AS Count + FROM RDFs + GROUP BY MovieID; + """ + return pd.read_sql_query(QUERY, self.sql_engine) + + def get_relationship_count(self) -> pd.DataFrame: + """ + Gets the count of each Relationship in the Dataset + Returns: + Pandas.DataFrame: [RelationshipURI, Count] + """ + QUERY = """ + SELECT RelationshipURI, COUNT(*) AS Count + FROM RDFs + INNER JOIN ParsedRelationships USING (RelationshipID) + GROUP BY RelationshipURI; + """ + return pd.read_sql_query(QUERY, self.sql_engine) + + + +if __name__ == "__main__" : + sql_endpoint = SqlEndpoint() + for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id(): + print(pandas_row) + # sql_endpoint.get_RDF() + print("done") \ No newline at end of file diff --git a/Scripts/Libs/Utils/dataframe_interaction.py b/Scripts/Libs/Utils/dataframe_interaction.py new file mode 100644 index 0000000..c4df33a --- /dev/null +++ b/Scripts/Libs/Utils/dataframe_interaction.py @@ -0,0 +1,9 @@ +import pandas as pd + + + +def get_raw_from_dataframe(DF: pd.DataFrame) -> str: + output = '' + for row in DF.itertuples(index=False, name=None): + output += "".join(map(str, row)) + return output From 8167c9d435b15a4f189d57b6644a376fed2f2e2c Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 16:03:49 +0200 Subject: [PATCH 6/9] Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class --- Scripts/DataCleaning/pipeline.py | 78 ++++++++++++------- Scripts/Libs/CleaningPipeline/sql_endpoint.py | 12 +-- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py index e07294b..eb5b2f7 100644 --- a/Scripts/DataCleaning/pipeline.py +++ b/Scripts/DataCleaning/pipeline.py @@ -10,22 +10,22 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co import pandas as pd class Pipeline(): - def __init__(self, output): + def __init__(self): self.sql_endpoint = SqlEndpoint() # classes to manage taskes' datasets - self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/debug.csv") - self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/output.txt") + self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv") + self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt") self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") # prepare the filter - # the filter applier needs to know thefrequence of Movies and Relationship among all the Dataset + # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset self.filter_applier = PipelineApplier() MOVIE_COUNT = self.sql_endpoint.get_movies_id_count() REL_COUNT = self.sql_endpoint.get_relationship_count() self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000) self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) - # prepare the filter ot the relationshipURI you want to delete: + # prepare the filter on the relationshipURI you want to delete: relationship_uri_banned_list = [ "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract", "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates", @@ -34,25 +34,6 @@ class Pipeline(): self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list) - def _end_file_handler(self): - self.task_bpe_corpus.close() - self.task_rdf_mask.close() - self.task_rdf_text.close() - self.task_rdf_completation.close() - - def _get_cleaned_movie_rows(self): - for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): - RDF = self.filter_applier.drop_na_from_dataset(RDF) - RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) - RDF = self.filter_applier.filter_by_frequency_relationship(RDF) - # other filter - # - RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) - if RDF.empty: - continue - RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE - yield RDF - def execute_task_bpe_corpus(self): for RDF in self._get_cleaned_movie_rows(): RDF = self.filter_applier.rebuild_by_movie(RDF) @@ -66,12 +47,14 @@ class Pipeline(): self.task_rdf_mask.write(RDF) self._end_file_handler() + def execute_tasks_rdf_text(self): for RDF in self._get_cleaned_movie_rows(): RDF = self.filter_applier.rebuild_by_movie(RDF) self.task_rdf_text.write(RDF) self._end_file_handler() + def execute_task_rdf_completation(self): for RDF in self._get_cleaned_movie_rows(): RDF["Triple"] = self.filter_applier.build_triple(RDF) @@ -92,14 +75,55 @@ class Pipeline(): self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]]) self._end_file_handler() - + + + def _end_file_handler(self): + self.task_bpe_corpus.close() + self.task_rdf_mask.close() + self.task_rdf_text.close() + self.task_rdf_completation.close() + + + def _get_cleaned_movie_rows(self): + for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id(): + RDF = self.filter_applier.drop_na_from_dataset(RDF) + RDF = self.filter_applier.filter_by_frequency_movie_id(RDF) + RDF = self.filter_applier.filter_by_frequency_relationship(RDF) + # other filter + # + RDF = self.filter_applier.delete_relationship_by_list_filter(RDF) + if RDF.empty: + continue + RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE + yield RDF + + + def use_toy_dataset(self): + # CHOOSEN MOVIE: + # The Dark Knight : 117248 + # Inception : 147074 + # The Avengers : 113621 + # Cast Away : 1123 + # The Departed : 117586 + # American Psycho : 90177 + # Avatar : 71587 + # Django Unchained : 138952 + # Spirited Away : 144137 + # Knives Out : 148025 + movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + self.sql_endpoint.movie_ids = movie_list +# there are a lot of settings to manage +# you only need to change settings: +# in the init for file paths, frequency filter limit, banned reletionshipURI +# in the use_toy_dataset , to change the toy dataset +# in _get_cleaned_movie_rows: to change how the pipeline behave +pipeline = Pipeline() - -pipeline = Pipeline("./Assets/Dataset/Tmp/output.txt") +# pipeline.use_toy_dataset() # pipeline.execute_task_bpe_corpus() # pipeline.execute_task_rdf_mask() # pipeline.execute_tasks_rdf_text() diff --git a/Scripts/Libs/CleaningPipeline/sql_endpoint.py b/Scripts/Libs/CleaningPipeline/sql_endpoint.py index 4e43528..66ba1ea 100644 --- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py +++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py @@ -18,8 +18,8 @@ class SqlEndpoint(): # self.conn = self.sql_engine.connect().execution_options(stream_results=True) # it seems that sqlite doenst support streamer cursor # PRAGMA exeutes better in writing not reading - self.chunk_size_row = chunk_size_row - pass + self.chunk_size_row = chunk_size_row # not used now, since each chunk is a movie + self.movie_ids = movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] def get_RDF(self) -> pd.DataFrame : @@ -79,7 +79,7 @@ class SqlEndpoint(): Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract] """ # chunk by movieId, abstract is the same and some intersting logic are appliable - movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] + # movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"] # CHOOSEN MOVIE: # The Dark Knight : 117248 # Inception : 147074 @@ -91,8 +91,8 @@ class SqlEndpoint(): # Django Unchained : 138952 # Spirited Away : 144137 # Knives Out : 148025 - movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] - movie_ids = movie_list + # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] + # movie_ids = movie_list QUERY = """ SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract @@ -104,7 +104,7 @@ class SqlEndpoint(): WHERE MovieID = (?); """ - for movie_id in movie_ids: + for movie_id in self.movie_ids: yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,)) def get_movies_id_count(self) -> pd.DataFrame: From 255d8a072d8e95920bbb723c4536f454a741ab02 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 16:59:52 +0200 Subject: [PATCH 7/9] First implementation of the cleaning pipeline UML --- .../cleaning-pipeline.excalidraw.json | 634 ++++++++++++++++++ 1 file changed, 634 insertions(+) create mode 100644 Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json new file mode 100644 index 0000000..1249185 --- /dev/null +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -0,0 +1,634 @@ +{ + "type": "excalidraw", + "version": 2, + "source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor", + "elements": [ + { + "id": "JNB9z-PeqZ4s8KDfWaoXe", + "type": "rectangle", + "x": 106, + "y": 27, + "width": 653, + "height": 263, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a2", + "roundness": { + "type": 3 + }, + "seed": 710740889, + "version": 326, + "versionNonce": 1107631703, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false + }, + { + "id": "e13wNTgUpn2flMpmMttqx", + "type": "text", + "x": 200.5943407656526, + "y": 44.07937975075269, + "width": 307.2781467269385, + "height": 23.3097531902191, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a3", + "roundness": null, + "seed": 1012740663, + "version": 444, + "versionNonce": 589551257, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false, + "text": "Libs/CleaningPipeline/sql_endpoint", + "fontSize": 18.64780255217528, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Libs/CleaningPipeline/sql_endpoint", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "id": "CgxCElJkKBtIHv-5WQrbo", + "type": "text", + "x": 195, + "y": 80.44259472749451, + "width": 403.64997665852184, + "height": 186.4780255217528, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "a4", + "roundness": null, + "seed": 1261951799, + "version": 507, + "versionNonce": 1922906999, + "isDeleted": false, + "boundElements": null, + "updated": 1759156408059, + "link": null, + "locked": false, + "text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n", + "fontSize": 18.64780255217528, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n", + "autoResize": true, + "lineHeight": 1.25 + }, + { + "type": "line", + "version": 4978, + "versionNonce": 2079525497, + "isDeleted": false, + "id": "sYReMTdYblr-oJtYYJALU", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -68.05426555317842, + "y": 87.19293561900287, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.09201683999922, + "height": 99.49948667804088, + "seed": 1263944119, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 0.2542098813493443, + 75.20117273657175 + ], + [ + 0.011896425679918422, + 83.76249969444815 + ], + [ + 3.970409367559332, + 87.46174320643391 + ], + [ + 17.75573317066317, + 90.59250103325854 + ], + [ + 41.05683533152865, + 91.56737225214069 + ], + [ + 63.319497586673116, + 90.01084754868091 + ], + [ + 75.14781395923075, + 86.28844687220405 + ], + [ + 76.81603792670788, + 83.15042405259751 + ], + [ + 77.05033394391478, + 76.25776215104557 + ], + [ + 76.86643881413028, + 6.3089586511537865 + ], + [ + 76.45188016352971, + -0.2999144698665015 + ], + [ + 71.50179495549581, + -3.9936571317850627 + ], + [ + 61.077971898861186, + -6.132877429442784 + ], + [ + 37.32348754161154, + -7.932114425900202 + ], + [ + 18.278415656797975, + -6.859225353587373 + ], + [ + 3.2995959613238286, + -3.2201165291205287 + ], + [ + -0.04168289608444441, + -0.045185660461322996 + ], + [ + 0, + 0 + ] + ], + "index": "a6", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 2683, + "versionNonce": 33379161, + "isDeleted": false, + "id": "0S6dEWQVqKUVkP6Z5IX1l", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -67.53033611490343, + "y": 144.31921927673278, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.17198221193564, + "height": 8.562348957853036, + "seed": 817033943, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 2.033150371639873, + 3.413095389435587 + ], + [ + 10.801287372573954, + 6.276651055277943 + ], + [ + 22.468666942209353, + 8.010803051612635 + ], + [ + 40.747074201802775, + 8.168828515515864 + ], + [ + 62.077348233027564, + 7.0647721921469495 + ], + [ + 74.53446931782398, + 3.04824021069218 + ], + [ + 77.17198221193564, + -0.3935204423371723 + ] + ], + "index": "a7", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "line", + "version": 2769, + "versionNonce": 1703641145, + "isDeleted": false, + "id": "szGLND7J0nVOvRkNXX9AS", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -68.56219343740725, + "y": 115.35516394150972, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 77.17198221193564, + "height": 8.562348957853036, + "seed": 1704755191, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "round", + "boundElementIds": [], + "startBinding": null, + "endBinding": null, + "lastCommittedPoint": null, + "startArrowhead": null, + "endArrowhead": null, + "points": [ + [ + 0, + 0 + ], + [ + 2.033150371639873, + 3.413095389435587 + ], + [ + 10.801287372573954, + 6.276651055277943 + ], + [ + 22.468666942209353, + 8.010803051612635 + ], + [ + 40.747074201802775, + 8.168828515515864 + ], + [ + 62.077348233027564, + 7.0647721921469495 + ], + [ + 74.53446931782398, + 3.04824021069218 + ], + [ + 77.17198221193564, + -0.3935204423371723 + ] + ], + "index": "a8", + "frameId": null, + "roundness": { + "type": 2 + }, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 5766, + "versionNonce": 344002841, + "isDeleted": false, + "id": "O3t2uGktJlDd1_OX_bpV4", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -69.6201424194893, + "y": 80.06066699332126, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 76.59753601865496, + "height": 15.49127539284798, + "seed": 471296279, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [ + "bxuMGTzXLn7H-uBCptINx" + ], + "index": "a9", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1176, + "versionNonce": 1951499769, + "isDeleted": false, + "id": "_SzKlOBOvJgBg7FX0JTTM", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -33.12815531426679, + "y": 104.53733467322485, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1368927799, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aA", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1464, + "versionNonce": 1879072473, + "isDeleted": false, + "id": "oJMl2Kxa3SPaiAY0kxo7A", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -32.77701353033319, + "y": 130.75394896028996, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1627606871, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aB", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "ellipse", + "version": 1347, + "versionNonce": 1176574905, + "isDeleted": false, + "id": "fB6pJBSMA-pRHrpgYKaLL", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 6.239590202363168, + "x": -32.12815531426679, + "y": 159.52267553159635, + "strokeColor": "#000000", + "backgroundColor": "#228be6", + "width": 11.226103154161754, + "height": 12.183758484455605, + "seed": 1420643447, + "groupIds": [ + "9YkNe1yqnfZy9Z1JX2xr4", + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "index": "aC", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false + }, + { + "type": "text", + "version": 845, + "versionNonce": 383204505, + "isDeleted": false, + "id": "9gZ3Yy1MeP9kEOTLODqLG", + "fillStyle": "solid", + "strokeWidth": 1, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "angle": 0, + "x": -77.72012292771115, + "y": 181.11281713043917, + "strokeColor": "#000000", + "backgroundColor": "#a5d8ff", + "width": 95.63072204589844, + "height": 23.595161071904883, + "seed": 2019206551, + "groupIds": [ + "BDBCTrrhjbJynRAyuf3xJ" + ], + "strokeSharpness": "sharp", + "boundElementIds": [], + "fontSize": 17.4778970902999, + "fontFamily": 1, + "text": "dataset.db", + "baseline": 16.595161071904883, + "textAlign": "center", + "verticalAlign": "top", + "index": "aD", + "frameId": null, + "roundness": null, + "boundElements": [], + "updated": 1759157176189, + "link": null, + "locked": false, + "containerId": null, + "originalText": "dataset.db", + "autoResize": true, + "lineHeight": 1.350000000000001 + }, + { + "id": "3eOw20xMhpB5jf_RMG24P", + "type": "text", + "x": 1131.3333333333335, + "y": 31.333333333333428, + "width": 508.3333333333333, + "height": 550, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aE", + "roundness": null, + "seed": 1535658041, + "version": 821, + "versionNonce": 1630266809, + "isDeleted": false, + "boundElements": null, + "updated": 1759157181677, + "link": null, + "locked": false, + "text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()", + "autoResize": false, + "lineHeight": 1.25 + }, + { + "id": "Fbl1gpb5r7QrdRauGUWm2", + "type": "text", + "x": 158.23809523809535, + "y": 502.52380952380935, + "width": 484.2857142857143, + "height": 475, + "angle": 0, + "strokeColor": "#1e1e1e", + "backgroundColor": "transparent", + "fillStyle": "solid", + "strokeWidth": 2, + "strokeStyle": "solid", + "roughness": 1, + "opacity": 100, + "groupIds": [], + "frameId": null, + "index": "aF", + "roundness": null, + "seed": 2066618807, + "version": 541, + "versionNonce": 7392153, + "isDeleted": false, + "boundElements": null, + "updated": 1759157954202, + "link": null, + "locked": false, + "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "fontSize": 20, + "fontFamily": 5, + "textAlign": "left", + "verticalAlign": "top", + "containerId": null, + "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "autoResize": false, + "lineHeight": 1.25 + } + ], + "appState": { + "gridSize": 20, + "gridStep": 5, + "gridModeEnabled": false, + "viewBackgroundColor": "#ffffff" + }, + "files": {} +} \ No newline at end of file From c319398ca01f10f5a2099219146649390cfec4a9 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 17:03:31 +0200 Subject: [PATCH 8/9] little update to UML pipeline --- .../cleaning-pipeline.excalidraw.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json index 1249185..a3b4660 100644 --- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -592,7 +592,7 @@ "x": 158.23809523809535, "y": 502.52380952380935, "width": 484.2857142857143, - "height": 475, + "height": 500, "angle": 0, "strokeColor": "#1e1e1e", "backgroundColor": "transparent", @@ -606,20 +606,20 @@ "index": "aF", "roundness": null, "seed": 2066618807, - "version": 541, - "versionNonce": 7392153, + "version": 552, + "versionNonce": 1269344823, "isDeleted": false, "boundElements": null, - "updated": 1759157954202, + "updated": 1759158199532, "link": null, "locked": false, - "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", "fontSize": 20, "fontFamily": 5, "textAlign": "left", "verticalAlign": "top", "containerId": null, - "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", + "originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()", "autoResize": false, "lineHeight": 1.25 } From 007f1e955405ba466ab68ac0c7da656c3edca905 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 29 Sep 2025 18:53:33 +0200 Subject: [PATCH 9/9] minor updates --- .vscode/settings.json | 23 ++++++- .../cleaning-pipeline.excalidraw.json | 64 +++++++++---------- 2 files changed, 54 insertions(+), 33 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 17ae78b..226939d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,3 +1,24 @@ { - "jupyter.notebookFileRoot": "${workspaceFolder}" + // Always treat the project root as the working dir for Jupyter + "jupyter.notebookFileRoot": "${workspaceFolder}", + + // When you click "Run Python File in Terminal", DON'T cd into the file's folder + "python.terminal.executeInFileDir": false, + + // Start new integrated terminals at the project root + "terminal.integrated.cwd": "${workspaceFolder}", + + // Ensure Python can import from the project root no matter which file you run + // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed. + "terminal.integrated.env.linux": { + "PYTHONPATH": "${workspaceFolder}" + }, + + // Make pytest run from the root without needing a pytest.ini + "python.testing.pytestEnabled": true, + "python.testing.cwd": "${workspaceFolder}", + "python.testing.pytestArgs": ["src/test"], + + // Help Pylance resolve imports like `from src...` without red squiggles + "python.analysis.extraPaths": ["${workspaceFolder}"] } \ No newline at end of file diff --git a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json index a3b4660..c7019f5 100644 --- a/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json +++ b/Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json @@ -109,8 +109,8 @@ }, { "type": "line", - "version": 4978, - "versionNonce": 2079525497, + "version": 4979, + "versionNonce": 1473849177, "isDeleted": false, "id": "sYReMTdYblr-oJtYYJALU", "fillStyle": "solid", @@ -119,7 +119,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -68.05426555317842, + "x": -67.14432426259049, "y": 87.19293561900287, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -221,14 +221,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "line", - "version": 2683, - "versionNonce": 33379161, + "version": 2684, + "versionNonce": 952947769, "isDeleted": false, "id": "0S6dEWQVqKUVkP6Z5IX1l", "fillStyle": "solid", @@ -237,7 +237,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -67.53033611490343, + "x": -66.6203948243155, "y": 144.31921927673278, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -295,14 +295,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "line", - "version": 2769, - "versionNonce": 1703641145, + "version": 2770, + "versionNonce": 477619481, "isDeleted": false, "id": "szGLND7J0nVOvRkNXX9AS", "fillStyle": "solid", @@ -311,7 +311,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -68.56219343740725, + "x": -67.65225214681931, "y": 115.35516394150972, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -369,14 +369,14 @@ "type": 2 }, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 5766, - "versionNonce": 344002841, + "version": 5767, + "versionNonce": 2119031289, "isDeleted": false, "id": "O3t2uGktJlDd1_OX_bpV4", "fillStyle": "solid", @@ -385,7 +385,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -69.6201424194893, + "x": -68.71020112890136, "y": 80.06066699332126, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -404,14 +404,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1176, - "versionNonce": 1951499769, + "version": 1177, + "versionNonce": 525480665, "isDeleted": false, "id": "_SzKlOBOvJgBg7FX0JTTM", "fillStyle": "solid", @@ -420,7 +420,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -33.12815531426679, + "x": -32.218214023678854, "y": 104.53733467322485, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -437,14 +437,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1464, - "versionNonce": 1879072473, + "version": 1465, + "versionNonce": 1410887609, "isDeleted": false, "id": "oJMl2Kxa3SPaiAY0kxo7A", "fillStyle": "solid", @@ -453,7 +453,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -32.77701353033319, + "x": -31.867072239745255, "y": 130.75394896028996, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -470,14 +470,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "ellipse", - "version": 1347, - "versionNonce": 1176574905, + "version": 1348, + "versionNonce": 314839193, "isDeleted": false, "id": "fB6pJBSMA-pRHrpgYKaLL", "fillStyle": "solid", @@ -486,7 +486,7 @@ "roughness": 1, "opacity": 100, "angle": 6.239590202363168, - "x": -32.12815531426679, + "x": -31.218214023678854, "y": 159.52267553159635, "strokeColor": "#000000", "backgroundColor": "#228be6", @@ -503,14 +503,14 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false }, { "type": "text", - "version": 845, - "versionNonce": 383204505, + "version": 846, + "versionNonce": 1091081593, "isDeleted": false, "id": "9gZ3Yy1MeP9kEOTLODqLG", "fillStyle": "solid", @@ -519,7 +519,7 @@ "roughness": 1, "opacity": 100, "angle": 0, - "x": -77.72012292771115, + "x": -76.81018163712321, "y": 181.11281713043917, "strokeColor": "#000000", "backgroundColor": "#a5d8ff", @@ -541,7 +541,7 @@ "frameId": null, "roundness": null, "boundElements": [], - "updated": 1759157176189, + "updated": 1759158252997, "link": null, "locked": false, "containerId": null,