NanoSocrates/Project_Model/UML/bpe.excalidraw.json
2025-09-26 18:49:29 +02:00

658 lines
19 KiB
JSON

{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "EcT-dGsjmfW571ov8Gg4F",
"type": "text",
"x": 425.5,
"y": 132,
"width": 506,
"height": 425,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"4rCC2-N1thmII8_dwNhe1"
],
"frameId": null,
"index": "a3V",
"roundness": null,
"seed": 523521109,
"version": 883,
"versionNonce": 1590682729,
"isDeleted": false,
"boundElements": [
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow"
}
],
"updated": 1758881654155,
"link": null,
"locked": false,
"text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "74i4oK-JpcM4CgAqhz_x_",
"type": "rectangle",
"x": 382.5,
"y": 104.5,
"width": 592.5,
"height": 421,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"4rCC2-N1thmII8_dwNhe1"
],
"frameId": null,
"index": "a4",
"roundness": {
"type": 3
},
"seed": 50827893,
"version": 319,
"versionNonce": 704459557,
"isDeleted": false,
"boundElements": [],
"updated": 1758878226277,
"link": null,
"locked": false
},
{
"id": "s8I1JoKulE3Vnti9a374p",
"type": "text",
"x": 1113.5,
"y": 127,
"width": 517,
"height": 325,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"M6w9efVFwOZHkJGgwkyEw"
],
"frameId": null,
"index": "a5",
"roundness": null,
"seed": 2091174261,
"version": 480,
"versionNonce": 1964948039,
"isDeleted": false,
"boundElements": [],
"updated": 1758881941367,
"link": null,
"locked": false,
"text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "BY_Why7XDNftdMzPcwjVZ",
"type": "rectangle",
"x": 1086.5,
"y": 105.5,
"width": 593.0000000000001,
"height": 325.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"M6w9efVFwOZHkJGgwkyEw"
],
"frameId": null,
"index": "a6",
"roundness": {
"type": 3
},
"seed": 153939611,
"version": 234,
"versionNonce": 2068149129,
"isDeleted": false,
"boundElements": [
{
"id": "WcDks9DR8UqeZEaxAcRf9",
"type": "arrow"
}
],
"updated": 1758881945661,
"link": null,
"locked": false
},
{
"id": "JCPDhuTKRx4MN950Q3jL-",
"type": "text",
"x": 1116.411067193676,
"y": 477.3809288774704,
"width": 416.74578857421875,
"height": 99.70355731225297,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"DbtlKVF_9SjH2-9iMq9zy"
],
"frameId": null,
"index": "a7",
"roundness": null,
"seed": 1326854235,
"version": 479,
"versionNonce": 595084597,
"isDeleted": false,
"boundElements": [],
"updated": 1758902358518,
"link": null,
"locked": false,
"text": "class NanoSocratesBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
"fontSize": 19.940711462450594,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "l-O0rMS3SruV22_MPX9Jz",
"type": "rectangle",
"x": 1086.5,
"y": 451.4580039762846,
"width": 593,
"height": 208.0419960474308,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [
"DbtlKVF_9SjH2-9iMq9zy"
],
"frameId": null,
"index": "a8",
"roundness": {
"type": 3
},
"seed": 1490898171,
"version": 305,
"versionNonce": 587306139,
"isDeleted": false,
"boundElements": [
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow"
}
],
"updated": 1758902358518,
"link": null,
"locked": false
},
{
"id": "WcDks9DR8UqeZEaxAcRf9",
"type": "arrow",
"x": 773.5,
"y": 167,
"width": 297.17936724485867,
"height": 30,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": {
"type": 2
},
"seed": 1681364149,
"version": 303,
"versionNonce": 1262492265,
"isDeleted": false,
"boundElements": [],
"updated": 1758881945661,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
144.5,
-1.5
],
[
177.5,
-30
],
[
297.17936724485867,
-29.020420978562214
]
],
"lastCommittedPoint": null,
"startBinding": null,
"endBinding": {
"elementId": "BY_Why7XDNftdMzPcwjVZ",
"focus": 0.77319587628866,
"gap": 18.25
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "OA_NKjb3n3NLtUo_tKmPS",
"type": "arrow",
"x": 946.0000000000002,
"y": 274.95951048200493,
"width": 130.016707976343,
"height": 209.36808480159067,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aD",
"roundness": {
"type": 2
},
"seed": 1871768059,
"version": 1039,
"versionNonce": 213535035,
"isDeleted": false,
"boundElements": [],
"updated": 1758902358519,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
54.99999999999977,
12.54048951799507
],
[
69.49999999999977,
188.54048951799507
],
[
130.016707976343,
209.36808480159067
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "EcT-dGsjmfW571ov8Gg4F",
"focus": -0.48312180762055096,
"gap": 14.500000000000114
},
"endBinding": {
"elementId": "l-O0rMS3SruV22_MPX9Jz",
"focus": -0.16742658425737647,
"gap": 11.194126334166185
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "snZ__VDsIlri6NTp8M2Gf",
"type": "text",
"x": -245.25,
"y": 103,
"width": 330,
"height": 125,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aE",
"roundness": null,
"seed": 1758461093,
"version": 265,
"versionNonce": 1069481861,
"isDeleted": false,
"boundElements": [],
"updated": 1758879566916,
"link": null,
"locked": false,
"text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "PnbmqwEWYkP8oXElKFyTp",
"type": "text",
"x": -237.75,
"y": 544,
"width": 561,
"height": 125,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 501304683,
"version": 241,
"versionNonce": 1306401003,
"isDeleted": false,
"boundElements": [],
"updated": 1758878748210,
"link": null,
"locked": false,
"text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "xR_11IzgXX5O-m6WoRfCL",
"type": "text",
"x": -233.25,
"y": 366.5,
"width": 165,
"height": 75,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aI",
"roundness": null,
"seed": 2025585125,
"version": 395,
"versionNonce": 1799178985,
"isDeleted": false,
"boundElements": [],
"updated": 1758883940168,
"link": null,
"locked": false,
"text": "enum TokenType:\n + SPECIAL\n + BPE",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "enum TokenType:\n + SPECIAL\n + BPE",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "lgKSd9qCb94-5e8rd9I3r",
"type": "text",
"x": -219.75,
"y": 764.5,
"width": 462,
"height": 275,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aJ",
"roundness": null,
"seed": 1963214021,
"version": 422,
"versionNonce": 903841927,
"isDeleted": false,
"boundElements": [],
"updated": 1758879973600,
"link": null,
"locked": false,
"text": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class TokeNano:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "DwFJoUpVT2YAEe9qPYAXa",
"type": "text",
"x": 496.75,
"y": 666,
"width": 440,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 1317596203,
"version": 152,
"versionNonce": 1840679687,
"isDeleted": false,
"boundElements": [],
"updated": 1758880107704,
"link": null,
"locked": false,
"text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "78gC46xatoO1_cRtaN8EC",
"type": "text",
"x": 396.375,
"y": -107.75,
"width": 346.3997802734375,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1187595241,
"version": 128,
"versionNonce": 1487192455,
"isDeleted": false,
"boundElements": [],
"updated": 1758879825591,
"link": null,
"locked": false,
"text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "3j50Ds74uU7oXoJ9kMOYJ",
"type": "text",
"x": 457.375,
"y": 903.75,
"width": 949.7594604492188,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aN",
"roundness": null,
"seed": 1994335529,
"version": 198,
"versionNonce": 1492696519,
"isDeleted": false,
"boundElements": [],
"updated": 1758882694747,
"link": null,
"locked": false,
"text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "yg-TvQvz4MwJZ0y8K7Ix0",
"type": "text",
"x": 435.375,
"y": 1026.25,
"width": 352,
"height": 250,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aP",
"roundness": null,
"seed": 1877486407,
"version": 344,
"versionNonce": 25830153,
"isDeleted": false,
"boundElements": [],
"updated": 1758883468886,
"link": null,
"locked": false,
"text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
"autoResize": true,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}