Compare commits

..

No commits in common. "dev.train" and "main" have entirely different histories.

186 changed files with 2 additions and 13927 deletions

1
.gitattributes vendored
View File

@ -1,3 +1,2 @@
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
Assets/** filter=lfs diff=lfs merge=lfs -text
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text

7
.gitignore vendored
View File

@ -189,8 +189,7 @@ ipython_config.py
.LSOverride
# Icon must end with two \r
Icon
Icon
# Thumbnails
._*
@ -252,7 +251,3 @@ $RECYCLE.BIN/
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# ---> Custom
**/Tmp/**
**/cache/**
!**/.gitkeep

View File

@ -1,14 +0,0 @@
{
"recommendations": [
"bierner.github-markdown-preview",
"bierner.markdown-checkbox",
"bierner.markdown-emoji",
"bierner.markdown-footnotes",
"bierner.markdown-mermaid",
"bierner.markdown-preview-github-styles",
"bierner.markdown-yaml-preamble",
"davidanson.vscode-markdownlint",
"kejun.markdown-alert",
"yzhang.markdown-all-in-one"
]
}

16
.vscode/launch.json vendored
View File

@ -1,16 +0,0 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": "${command:pickArgs}"
}
]
}

55
.vscode/settings.json vendored
View File

@ -1,55 +0,0 @@
{
// Always treat the project root as the working dir for Jupyter
"jupyter.notebookFileRoot": "${workspaceFolder}",
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
"python.terminal.executeInFileDir": false,
// Start new integrated terminals at the project root
"terminal.integrated.cwd": "${workspaceFolder}",
// Make pytest run from the root without needing a pytest.ini
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}",
"python.testing.pytestArgs": [
"src/test"
],
// Help Pylance resolve imports like `from src...` without red squiggles
"python.analysis.extraPaths": [
"${workspaceFolder}"
],
// For linux
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
// For OSX
"terminal.integrated.env.osx": {
"PYTHONPATH": "${workspaceFolder}"
},
// For Windows
"terminal.integrated.env.windows": {
"PYTHONPATH": "${workspaceFolder}"
},
"python.analysis.typeCheckingMode": "standard"
}
// {
// // Always treat the project root as the working dir for Jupyter
// "jupyter.notebookFileRoot": "${workspaceFolder}",
//
// // When you click "Run Python File in Terminal", DON'T cd into the file's folder
// "python.terminal.executeInFileDir": false,
//
// // Start new integrated terminals at the project root
// "terminal.integrated.cwd": "${workspaceFolder}",
//
// // Ensure Python can import from the project root no matter which file you run
// // (so imports like `from src...` resolve). Windows shown here; add linux/osx if needed.
// "terminal.integrated.env.windows": {
// "PYTHONPATH": "${workspaceFolder}"
// },
//
// // Make pytest run from the root without needing a pytest.ini
// "python.testing.pytestEnabled": true,
// "python.testing.cwd": "${workspaceFolder}",
// "python.testing.pytestArgs": ["src/test"],
//
// // Help Pylance resolve imports like `from src...` without red squiggles
// "python.analysis.extraPaths": ["${workspaceFolder}"]
// }

BIN
Assets/Dataset/1-hop/curated/corpus.txt (Stored with Git LFS)

Binary file not shown.

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:6e0a193f90f2b0efc5185b0db9555178b172268b3eab289225b894ac1493493f
3 size 2471083

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:dd309865b60df86f63f76341858e382a8423297ec63eb6f525ccd28b62caf486
3 size 2494589

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:2949f2e9c6ae2b4784e04405dd7f5a3ec2eb65537b421fdc6751e9d5a19af41d
3 size 19527224

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:bc28507d806df96d6c953fbba1999f62a55e26025001de5135892069df05b9bc
3 size 22021103

BIN
Assets/Dataset/1-hop/curated/rdf_text.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:176f13b63859c4dc0ca42b94d875aa82b74ad1cd88a186c439ef5444f45ed715
3 size 24455751

BIN
Assets/Dataset/1-hop/dataset.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:331d8ef4e99c5200f1323e7149bd8aade39dc17ee5778b553bb32c593ff601cf
3 size 2443211793

BIN
Assets/Dataset/1-hop/movie-pageid.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:88e387ed1338bdfd34ded22f3f8bebb2be5127857bf36fcffc266b35c534587c
3 size 10148507

BIN
Assets/Dataset/1-hop/movies.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8d81c8801ea79bd46747769a288cd0c507b3b94b2fb4bbb9605e282776ca5efb
3 size 8808636

BIN
Assets/Dataset/1-hop/reverse.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:b4878aed66c382e73982b19fa02129d5b3c3e3e8690c28e4dd662257e1d9b119
3 size 32343972

BIN
Assets/Dataset/1-hop/small/corpus.txt (Stored with Git LFS)

Binary file not shown.

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:206f83b88b442f617575985ac88f4241071fa1b7d66b5935405178051511a369
3 size 1344466

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:6914b6b1f8f06f8cf73b96b9c27bf556f1ee93256f435b7da0be0df2af093d05
3 size 1334675

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:41e92da8af52ca1c83334ebea7312c63d37fdeacde425ba91b78f44a56e4fb88
3 size 10568092

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:80da574017b251c9f07ecbce837d9d36a9ee8183a2a3bdbe0a2e31e22226ab79
3 size 12773126

BIN
Assets/Dataset/1-hop/small/rdf_text.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:41b30ab739a01482036c40b6560adfe751c5905ae80aafef6ee0f1a716849c68
3 size 13222824

BIN
Assets/Dataset/1-hop/toy/corpus.txt (Stored with Git LFS)

Binary file not shown.

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:39012a1e59eaa740d01515aa6b9744267dbb3ae13941b28558060795a94d90e0
3 size 86122

BIN
Assets/Dataset/1-hop/toy/rdf_mask.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8f31602eba47f7daff3b13bb243abaf429ff5900a8d26ae854ba790fda47d287
3 size 517642

BIN
Assets/Dataset/1-hop/toy/rdf_text.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:1189e04d3ba9d9138a4e216200313f5842b8a49de1745bb553ba2e3abf18d818
3 size 102533

BIN
Assets/Dataset/1-hop/uri-abbreviations.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:c1fcb1ad61a69145145c45c639ab42b36ffc63caa0ef9832eb81491197883ff4
3 size 8086

BIN
Assets/Dataset/1-hop/wikipedia-movie.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:1730dc111c0290b16d094a4b6a6577d966978d97ee9ef4202e86148cc9d8e8e8
3 size 17445736

BIN
Assets/Dataset/1-hop/wikipedia-summary.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:ef7b680257f16b193a9b4ea2914564b58c676955809e6b9d58058adaab7855c1
3 size 73089553

BIN
Assets/Dataset/DatawareHouse/dataset.db (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/curated/NanoSocrates.zip (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/curated/dec_optim.zip (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/curated/enc_optim.zip (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/curated/last_epoch.txt (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/curated/log_loss.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:203b6cb364cf95cbb6cc0ebbff9e8b80e80dda73ff210ad91edeedf6024f6ab1
3 size 2876

BIN
Assets/Model/curated/nano_optim.zip (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/small/bpe-small-16.json (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/small/bpe-small.json (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/toy_10/README.md (Stored with Git LFS)

Binary file not shown.

BIN
Assets/Model/toy_10/toy_dictionary.json (Stored with Git LFS)

Binary file not shown.

View File

@ -1,193 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ddfb4457",
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE.",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 126\u001b[39m\n\u001b[32m 124\u001b[39m \u001b[38;5;66;03m# sanity guard (helps debug vocab mismatches fast)\u001b[39;00m\n\u001b[32m 125\u001b[39m max_seen = tgt[:, :Tp].max().item()\n\u001b[32m--> \u001b[39m\u001b[32m126\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m max_seen < V \u001b[38;5;129;01mor\u001b[39;00m (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n\u001b[32m 127\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtarget id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmax_seen\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m >= V (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mV\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m). Fix TOKEN_SPACE_SIZE.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 129\u001b[39m \u001b[38;5;66;03m# CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\u001b[39;00m\n\u001b[32m 130\u001b[39m loss_t = cross_entropy(\n\u001b[32m 131\u001b[39m logits_btV.reshape(-\u001b[32m1\u001b[39m, V), \u001b[38;5;66;03m# [B*(t+1), V]\u001b[39;00m\n\u001b[32m 132\u001b[39m tgt[:, :Tp].reshape(-\u001b[32m1\u001b[39m) \u001b[38;5;66;03m# [B*(t+1)]\u001b[39;00m\n\u001b[32m 133\u001b[39m )\n",
"\u001b[31mAssertionError\u001b[39m: target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE."
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"from Project_Model.Libs.Training.learning_rade_shedulers import CustomLR\n",
"from Project_Model.Libs.Training.logistic_collector import LogitsCollector # external collector\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"# Constants (TEMP size; will be corrected after dataset scan below)\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e4)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
" output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" decoder_default_tokens = Transformer.pad_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
" ) # pad with PAD up to SENTENCE_LENGTH\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# fix V to cover ALL ids (including specials) # <- important\n",
"max_enc_id = max(max(row) for row in TOY_BATCH_INPUT_LIST) if TOY_BATCH_INPUT_LIST else 0\n",
"max_tgt_id = max(max(row) for row in TOY_BATCH_TARGET_LIST) if TOY_BATCH_TARGET_LIST else 0\n",
"TOKEN_SPACE_SIZE = max(TOKEN_SPACE_SIZE, max(PAD_TOKEN, END_TOKEN, max_enc_id, max_tgt_id) + 1)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS,\n",
")\n",
"\n",
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
"\n",
"NANOSOCRATES.train()\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters(), lr=1.0) # base lr works as factor\n",
"scheduler = CustomLR(optimizer, EMBEDDED_SIZE, warmup_steps=4000, factor=1.0) # step each optimizer step\n",
"\n",
"current_epoch = 0\n",
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
" # simple fixed mini-batch from the top; later you can shuffle/slice\n",
" enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
" pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
" tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
"\n",
" # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
" dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
"\n",
" total_loss = 0.0\n",
" collector.reset() # start fresh for this epoch\n",
"\n",
" T = tgt.size(1) # sequence length\n",
" for t in range(T):\n",
" # skip all-PAD steps to avoid CE divide-by-zero late in the sequence\n",
" if (tgt[:, t] == PAD_TOKEN).all(): # all PAD at this timestep\n",
" break\n",
"\n",
" optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
"\n",
" prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
" dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
"\n",
" # now decoder returns all steps up to t -> [B, t+1, V]\n",
" logits_btV: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # full logits for learning\n",
" collector.add(logits_btV) # collector will take the last step\n",
"\n",
" Tp = logits_btV.size(1) # t+1\n",
" V = logits_btV.size(-1) # vocab size\n",
"\n",
" # sanity guard (helps debug vocab mismatches fast)\n",
" max_seen = tgt[:, :Tp].max().item()\n",
" assert max_seen < V or (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n",
" f\"target id {max_seen} >= V ({V}). Fix TOKEN_SPACE_SIZE.\"\n",
"\n",
" # CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\n",
" loss_t = cross_entropy(\n",
" logits_btV.reshape(-1, V), # [B*(t+1), V]\n",
" tgt[:, :Tp].reshape(-1) # [B*(t+1)]\n",
" )\n",
"\n",
" loss_t.backward() # backprop for this step\n",
" optimizer.step() # update params\n",
" scheduler.step() # Noam/warmup: step per optimizer step\n",
"\n",
" total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
"\n",
" # teacher forcing: reveal the correct token for next position\n",
" if t < T - 1:\n",
" dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
"\n",
" current_epoch += 1\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
" collector.print_decoded() # print decoded predictions for the batch\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

View File

@ -1,308 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "7a311d4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712]]\n",
"3\n",
"Embedder Tensor: torch.Size([3, 16, 256])\n",
"Values:\n",
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
"\n",
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
"\n",
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]]],\n",
" grad_fn=<AddBackward0>)\n",
"ENCODER Tensor: torch.Size([3, 1, 256])\n",
"Values:\n",
"tensor([[[ 8.0069e-01, 4.0532e-01, -1.8316e+00, -1.3902e+00, -1.1784e+00,\n",
" 1.3667e+00, -9.7890e-01, 6.0696e-01, -1.4899e+00, 5.5765e-01,\n",
" 4.5991e-02, 5.1214e-01, 3.1901e-01, 4.7577e-01, -2.9585e-01,\n",
" -1.0811e+00, -1.5281e+00, -6.3773e-01, -9.5954e-01, 1.8497e+00,\n",
" -1.1789e+00, -9.7387e-01, 1.1931e-01, -7.2703e-01, 5.3108e-01,\n",
" -6.4877e-01, -4.5188e-01, 1.5185e+00, -8.3408e-01, 3.2824e-01,\n",
" -1.8166e+00, 1.9548e+00, -5.2419e-01, -1.0693e+00, -1.8510e+00,\n",
" 1.5440e+00, -3.2370e-01, -1.3990e+00, -4.6940e-01, 6.5840e-02,\n",
" -9.2057e-01, 1.2513e+00, -5.9168e-01, 7.8198e-01, -1.3121e+00,\n",
" 1.1492e+00, -2.3695e-01, -1.8935e+00, 1.1639e+00, -5.8169e-01,\n",
" 2.5051e-01, -8.1654e-01, -1.0328e+00, 1.4285e+00, -8.1485e-01,\n",
" 1.0614e+00, -3.3834e-01, -4.1667e-02, -1.1920e-01, 3.1383e-01,\n",
" -5.9857e-01, 1.7327e-01, -1.6854e+00, -1.5174e+00, -2.6508e-01,\n",
" -6.0082e-01, 5.1468e-01, 2.7909e-01, -2.5296e-01, -1.4670e+00,\n",
" -1.3587e+00, -8.8864e-02, 3.2825e-01, 1.0950e+00, -1.0371e+00,\n",
" 1.1744e+00, 5.2984e-01, 4.1751e-01, -9.8803e-01, 3.5631e-01,\n",
" 4.7484e-01, 2.2435e-01, 1.4022e+00, 1.2242e+00, 1.1447e+00,\n",
" -5.4052e-01, -9.1786e-01, -1.2299e+00, 1.1656e+00, 9.1570e-01,\n",
" 1.8956e+00, 7.4344e-01, 4.2187e-01, -9.5426e-02, -3.2428e-01,\n",
" 9.6364e-01, -2.3252e-01, 2.9036e-01, -2.4432e+00, 9.8019e-01,\n",
" -4.6697e-02, 8.3910e-01, -4.3541e-01, -7.1915e-01, -7.5638e-01,\n",
" 9.0217e-01, 2.0919e+00, -7.9533e-01, -1.5413e-01, -6.9260e-01,\n",
" -1.3086e+00, 7.8925e-01, 1.8855e-01, 7.4043e-01, -3.8834e-01,\n",
" 1.0272e-02, 1.0763e+00, 4.2142e-01, 6.6520e-01, 4.5996e-01,\n",
" -8.5060e-01, -9.0101e-01, -4.2090e-01, 2.5596e-01, -1.4946e+00,\n",
" 1.0925e-01, -7.5359e-01, -3.0447e-01, 1.0679e+00, 1.9398e+00,\n",
" 8.1472e-01, 1.3498e+00, 1.1107e+00, 6.3288e-01, 3.1149e-01,\n",
" -1.9333e+00, -1.5274e+00, 2.1794e-01, -3.1895e-02, 1.0756e+00,\n",
" 1.0215e+00, 1.6938e+00, -1.0939e+00, 2.2690e+00, -7.0921e-01,\n",
" 6.4212e-01, -6.5468e-01, 1.6839e+00, 5.7296e-01, -1.4031e+00,\n",
" 3.9133e-01, -5.3541e-01, 4.3439e-01, -1.6785e+00, 5.2030e-03,\n",
" 4.5155e-01, -7.0953e-01, -1.9656e-01, -3.8671e-02, -1.0927e+00,\n",
" -3.0405e-01, -1.3818e-02, -3.7748e-01, 1.4412e+00, -1.4254e-01,\n",
" 7.9939e-01, -8.5402e-01, -1.0330e+00, 1.7661e+00, -3.6084e-01,\n",
" 1.5622e+00, 1.0240e+00, 1.9056e-01, -4.1480e-01, 6.9056e-01,\n",
" 1.7204e+00, -9.9218e-01, -1.6504e-01, -1.1807e+00, 1.0827e+00,\n",
" 1.5973e+00, 1.4849e-01, -2.2867e+00, 7.7322e-01, -6.8401e-01,\n",
" -6.0493e-01, 1.0616e+00, -1.8034e-01, -1.8828e+00, 1.1031e-01,\n",
" 2.5452e-01, -4.2489e-02, 8.1171e-01, 1.3429e+00, -6.5058e-01,\n",
" -1.3531e+00, -1.2263e+00, 1.1226e+00, 1.2407e+00, -9.7453e-01,\n",
" 9.4696e-01, 6.6186e-01, -5.0804e-01, 1.2647e-01, -1.1777e+00,\n",
" 6.8443e-02, -1.3043e-01, 2.9595e-01, -1.5330e+00, -6.5733e-01,\n",
" 1.1291e+00, 6.9629e-01, 4.4690e-01, 8.0151e-01, -1.2406e+00,\n",
" 2.6085e+00, -2.0310e-01, -1.0226e+00, -6.9182e-02, 7.6600e-01,\n",
" -9.9842e-01, 2.0896e+00, 2.6334e-01, -1.1559e-01, -6.6876e-01,\n",
" -6.6295e-01, -1.6461e-01, 2.8270e+00, 3.2727e-01, 1.3724e+00,\n",
" -1.0749e+00, 3.7782e-01, -1.5472e+00, 3.0822e-01, 5.7273e-02,\n",
" 3.9136e-01, 8.2948e-01, 2.1438e-01, -9.8623e-01, 5.6053e-01,\n",
" -1.5617e+00, -3.9595e-01, 1.0451e-02, -1.1860e+00, -1.4994e-01,\n",
" 1.6566e+00, 2.0369e+00, -4.3995e-01, -4.4262e-01, -3.1014e-01,\n",
" 5.9083e-01, -1.0765e+00, -5.2906e-01, 4.6039e-02, -1.0154e+00,\n",
" 5.9942e-01]],\n",
"\n",
" [[ 1.2683e+00, -4.3200e-01, -1.3333e+00, -3.6705e-01, -5.8895e-01,\n",
" 9.9266e-01, -4.2914e-01, 9.2765e-01, -1.0935e+00, 1.4975e+00,\n",
" -5.3739e-01, -2.8332e-01, 9.1166e-01, 1.5010e+00, -2.1787e-01,\n",
" -1.4258e+00, -2.7524e-01, -1.2602e+00, 2.0117e-01, 2.3906e+00,\n",
" -9.6397e-01, -7.5872e-01, 3.3948e-01, -7.9353e-01, 9.1668e-01,\n",
" 8.7734e-04, -3.0271e-01, 1.7087e+00, -1.0273e+00, 1.5174e+00,\n",
" -2.6405e-02, 1.4236e+00, -9.9093e-01, 5.4787e-01, -1.0904e+00,\n",
" 5.2156e-01, -6.3470e-01, -7.7688e-01, -1.2538e+00, -3.9307e-01,\n",
" -7.6707e-01, 1.3733e+00, -7.2709e-01, 1.1185e+00, -1.5860e+00,\n",
" -2.6148e-01, -3.7984e-01, -1.3604e+00, 9.2864e-02, -7.9642e-01,\n",
" 1.0956e+00, 3.1202e-01, -4.1234e-01, 3.6488e-02, -1.4639e+00,\n",
" 1.0947e+00, -7.9230e-01, 4.6913e-01, -2.3407e-01, 4.1768e-02,\n",
" -1.5921e+00, 6.9743e-01, -7.0222e-01, -5.4705e-01, -6.5663e-01,\n",
" -4.1810e-01, 2.7744e-01, 7.9178e-01, 7.5886e-01, -7.6302e-01,\n",
" -1.2204e+00, -1.1103e+00, -1.3646e-01, 1.9589e+00, -1.3637e+00,\n",
" 9.0804e-01, 2.3094e-01, -5.5953e-02, -6.7626e-01, 1.4242e+00,\n",
" 1.0167e+00, 1.0705e+00, 2.2947e+00, 9.1274e-01, 1.2281e+00,\n",
" -7.0638e-01, -1.2249e+00, -8.9208e-02, 1.1016e+00, 1.1940e+00,\n",
" 3.5834e-01, 1.2961e+00, -4.6674e-01, 3.4572e-01, -4.3458e-01,\n",
" 1.1008e+00, 3.7783e-01, -6.5841e-01, -2.3127e+00, 1.4617e+00,\n",
" -1.2826e-01, 1.3463e-01, -8.5268e-01, -8.4144e-01, -1.8594e+00,\n",
" 1.9260e-01, 1.6432e+00, -2.0640e-02, -5.0030e-01, -1.5334e-01,\n",
" -6.1072e-01, -1.3694e-01, -3.7308e-01, 1.6603e+00, 1.1246e-01,\n",
" 6.0823e-02, 7.8749e-01, -1.7002e-01, 1.2058e+00, 8.5615e-01,\n",
" 1.2525e-01, -1.0584e+00, -4.7931e-01, 1.4088e-01, -1.8149e+00,\n",
" 1.4654e+00, -1.0936e+00, 5.3182e-01, 9.5694e-01, 3.2472e+00,\n",
" 3.4877e-01, 1.8491e+00, -1.5184e-01, 1.4711e+00, -7.6064e-01,\n",
" -2.2144e+00, -1.8952e+00, -4.9502e-01, -6.6836e-01, 1.4946e+00,\n",
" 6.7616e-01, 1.1501e+00, -9.4747e-01, 1.1009e+00, -1.4211e+00,\n",
" 3.9528e-01, -9.5220e-01, 1.4886e+00, 7.1784e-01, -1.9941e+00,\n",
" 6.7901e-02, -1.3109e-01, 1.1695e+00, 1.2861e-01, -2.8123e-01,\n",
" -6.1611e-01, 1.5513e-01, -3.9289e-01, -4.5543e-02, -2.8628e-01,\n",
" 2.6118e-01, 2.2623e-01, -6.3705e-01, 7.3591e-01, -7.8799e-01,\n",
" 2.5053e-01, -1.5923e-01, -4.9584e-01, 1.9009e+00, -2.3263e-01,\n",
" 1.2213e+00, 1.0313e+00, 2.0177e-02, -6.2209e-01, -3.5161e-01,\n",
" 1.5143e+00, -7.2332e-02, 2.3909e-02, -2.1261e+00, 8.5199e-01,\n",
" 1.9084e+00, 4.6845e-02, -2.3554e+00, 1.3735e+00, -7.3909e-01,\n",
" -8.3949e-01, -3.9314e-01, -4.3324e-01, -9.6804e-01, -5.3124e-01,\n",
" -6.5091e-01, -1.1738e+00, 1.3315e+00, 6.5606e-01, -1.4131e-01,\n",
" -1.7712e+00, -1.1628e+00, 9.6813e-01, 8.7314e-01, -9.8027e-01,\n",
" 6.9376e-01, 5.3878e-01, -1.6169e+00, 2.2860e-01, -6.2179e-01,\n",
" -1.1043e-01, -3.9658e-01, 2.8712e-01, 8.2201e-02, 2.0888e-01,\n",
" -5.9884e-01, 7.3092e-01, 6.9128e-01, 5.3977e-01, -1.5728e+00,\n",
" 1.6878e+00, -8.2669e-01, -9.8076e-01, -3.4203e-01, 4.6939e-02,\n",
" -1.3158e-01, 2.1923e+00, -6.6483e-02, -4.0687e-01, -1.2715e+00,\n",
" -8.1549e-01, -1.2047e+00, 1.3547e+00, -4.2072e-01, 1.1674e+00,\n",
" -5.1421e-01, 1.3055e+00, -1.1277e+00, 1.8372e+00, -1.1215e+00,\n",
" 1.4797e+00, 2.8354e-01, -6.3974e-01, -1.2869e+00, -2.7897e-01,\n",
" -1.0397e+00, 1.8622e-01, -5.0397e-02, -4.4865e-02, -7.6067e-01,\n",
" 1.7715e+00, 1.5040e+00, -2.6854e-01, -5.2802e-01, -5.3407e-01,\n",
" 2.0313e-02, -2.6276e-01, -7.0748e-01, -8.7328e-01, -3.4108e-01,\n",
" 1.4313e+00]],\n",
"\n",
" [[ 7.7464e-01, -4.2187e-01, -2.0571e+00, -8.6709e-01, -1.5722e+00,\n",
" 4.9540e-01, -1.5270e+00, 1.0499e+00, -1.9579e+00, -2.5298e-02,\n",
" 4.3419e-01, 5.8822e-01, 1.3392e+00, 6.9604e-01, -9.7883e-01,\n",
" -9.1354e-01, -9.1852e-01, -6.0951e-01, -6.6255e-02, 1.3907e+00,\n",
" -6.2912e-01, -2.7524e-01, 1.9520e-02, -2.7154e-01, 1.5162e-01,\n",
" 1.3318e-02, -8.9196e-01, 9.0976e-01, -1.3544e+00, 2.4276e-01,\n",
" -7.4038e-01, 9.7062e-01, 3.2011e-01, 3.4486e-01, -2.3374e+00,\n",
" 1.3311e+00, -3.1871e-02, -1.4468e+00, -1.5968e+00, 3.0418e-01,\n",
" -7.7136e-01, 1.3427e+00, -1.2493e+00, 1.4114e+00, -1.2475e+00,\n",
" 7.0239e-01, -9.6120e-02, -4.4365e-01, 5.3238e-01, -1.4933e+00,\n",
" 5.4476e-01, -1.8490e-02, -5.9936e-01, 1.0878e+00, -1.8892e+00,\n",
" 1.2810e+00, -1.0747e+00, 5.3514e-01, 1.7422e-01, 1.1354e+00,\n",
" -7.4837e-01, 4.0327e-01, -1.8950e+00, -7.2336e-01, 2.4441e-01,\n",
" -1.3650e-01, -4.8344e-01, 3.3921e-02, 5.0889e-01, -1.3769e+00,\n",
" -2.5907e-01, -2.7549e-01, -1.9128e-01, 1.9751e+00, -7.1191e-01,\n",
" 5.1910e-01, 1.0902e-01, 2.9995e-01, -3.5180e-01, -6.2139e-01,\n",
" 7.2905e-01, -5.3177e-01, 4.3340e-01, 1.0071e+00, 1.7586e+00,\n",
" -3.9963e-01, -2.5139e-01, -9.4213e-01, 9.2847e-01, 1.1298e+00,\n",
" 7.8545e-01, 1.3188e+00, 3.7466e-01, 9.0773e-01, -4.0454e-02,\n",
" 1.3444e+00, 6.0301e-01, 8.9929e-02, -2.0754e+00, 4.8614e-01,\n",
" -9.7160e-01, 8.2446e-01, -1.1813e+00, -9.6185e-01, -9.2922e-02,\n",
" 6.0154e-01, 1.6640e+00, -1.0461e+00, 1.5868e-01, -5.7239e-01,\n",
" -6.2726e-01, 3.2848e-01, 5.9609e-01, 1.5563e+00, -4.0883e-01,\n",
" 4.4902e-01, 1.4004e+00, 2.2426e-01, 3.8314e-01, -2.0641e-01,\n",
" -1.6465e-01, -6.4645e-01, 1.5772e-01, 6.8907e-01, -1.2703e+00,\n",
" 1.8914e-01, -6.2678e-01, 3.0179e-01, 1.2687e+00, 1.6849e+00,\n",
" 1.5690e+00, 1.0999e+00, 1.5820e+00, -6.4808e-01, 5.1003e-01,\n",
" -1.6674e+00, -1.2224e+00, 1.9769e-01, -1.3883e-01, 1.2179e+00,\n",
" 1.2971e+00, 4.6259e-01, -5.8717e-01, 1.4532e+00, -1.0540e+00,\n",
" 2.8689e-01, -1.3895e+00, 1.4014e+00, -4.0430e-01, -2.6099e+00,\n",
" -1.0293e+00, -1.1097e+00, 8.6266e-01, -1.0535e+00, 7.1789e-01,\n",
" 6.0642e-01, -1.2493e+00, -3.7762e-01, -4.1281e-02, -7.3049e-01,\n",
" -7.2913e-04, -7.3122e-02, -2.3850e-01, 1.2546e+00, 1.8802e-01,\n",
" 1.3135e+00, -5.0367e-01, 1.2456e-01, 2.7475e+00, -1.2486e+00,\n",
" 1.4441e+00, 8.7469e-01, -5.6901e-01, -1.2145e-01, 3.1091e-01,\n",
" 1.9406e+00, -8.1891e-01, 3.1316e-02, -1.2867e+00, 8.0780e-01,\n",
" 7.0041e-01, 2.8903e-01, -1.6387e+00, 6.6553e-01, -1.3696e+00,\n",
" -7.9454e-01, 3.3899e-01, -5.5822e-01, -8.1969e-01, -1.2410e-01,\n",
" -3.7024e-01, -7.2536e-01, 7.5648e-01, 1.6899e+00, -1.7404e-01,\n",
" -1.7191e+00, -7.2603e-01, 1.5046e+00, 8.3216e-01, -1.5304e+00,\n",
" -1.8264e-01, 3.3451e-01, -5.6636e-02, 6.1099e-01, -9.8517e-01,\n",
" 4.4856e-01, -8.6275e-01, 6.9264e-02, -1.1572e+00, 2.3373e-01,\n",
" 5.9896e-01, 1.2384e-01, 1.0309e+00, 1.4273e+00, -8.4776e-01,\n",
" 2.6236e+00, -9.0133e-01, -4.0009e-01, -4.9727e-01, 3.7945e-01,\n",
" -9.0712e-01, 1.5725e+00, 1.6298e-01, 1.1544e-01, -4.3125e-01,\n",
" -8.7131e-01, -2.5880e-01, 2.9032e+00, 2.7154e-01, 1.3677e+00,\n",
" -8.8544e-01, 5.6083e-01, -1.8256e+00, 9.4832e-01, -1.0762e+00,\n",
" 7.5421e-01, 6.5008e-01, -8.6361e-01, -1.4911e+00, -7.5930e-02,\n",
" -1.6896e+00, 1.5223e-02, -1.5283e-01, -1.8741e+00, 1.1400e-01,\n",
" 1.8822e+00, 2.6615e+00, 2.1607e-01, -5.6243e-01, 3.6730e-01,\n",
" 4.0374e-01, -1.1973e+00, -5.3006e-01, -3.4750e-01, -4.4187e-01,\n",
" 7.4358e-01]]], grad_fn=<NativeLayerNormBackward0>)\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"TEXT = (\n",
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
")\n",
"OUT_TEXT = \"<START>\"\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
"\n",
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"BATCH_LEN = 3\n",
"\n",
"INPUT_TOKENIZATION = [\n",
" EN_IN\n",
"] * BATCH_LEN\n",
"OUTPUT_TOKENIZATION = [\n",
" DEC_IN\n",
"] * BATCH_LEN\n",
"\n",
"\n",
"print(INPUT_TOKENIZATION)\n",
"\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"encoder_tensor: torch.Tensor = EMBEDDER(INPUT_TOKENIZATION)\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"decoder_tensor: torch.Tensor = EMBEDDER(OUTPUT_TOKENIZATION)\n",
"DECODER = torch.nn.Sequential(\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"print(len(INPUT_TOKENIZATION))\n",
"print(f\"Embedder Tensor: {encoder_tensor.shape}\")\n",
"print(f\"Values:\\n{encoder_tensor}\")\n",
"\n",
"BATCH_SIZE, TOKENS, DIMENSIONS = encoder_tensor.shape\n",
"PAD_MASK = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
"\n",
"encoder_out, _ = ENCODER((encoder_tensor, PAD_MASK))\n",
"tensor: torch.Tensor\n",
"tensor, _, _, _ = DECODER((decoder_tensor, encoder_out, encoder_out, None))\n",
"\n",
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
"print(f\"Values:\\n{tensor}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,131 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c64b0e24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700]]\n",
"2\n",
"Embedder Tensor: torch.Size([2, 16, 256])\n",
"Values:\n",
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]],\n",
"\n",
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]]],\n",
" grad_fn=<AddBackward0>)\n",
"ENCODER Tensor: torch.Size([2, 16, 256])\n",
"Values:\n",
"tensor([[[-1.6325, 0.4094, -2.1403, ..., 0.4654, 0.5993, 0.9683],\n",
" [ 1.8236, 0.4025, -0.6972, ..., 0.2430, 0.2536, -1.0889],\n",
" [-0.0587, 0.1618, -0.2335, ..., 1.7609, 1.2664, -0.4452],\n",
" ...,\n",
" [ 2.0337, 1.3184, -1.3165, ..., -0.3303, 0.6572, 0.0884],\n",
" [ 0.5752, -2.5594, -0.2393, ..., 1.3318, -1.4236, 0.4686],\n",
" [ 1.0075, -2.4273, -0.4593, ..., 1.6660, 0.0359, 0.2927]],\n",
"\n",
" [[-1.8300, -0.3079, -1.6585, ..., 0.4859, 0.5652, 0.8072],\n",
" [ 1.5461, -0.5666, -0.0330, ..., 0.5651, 0.2974, -1.0879],\n",
" [-0.9060, 0.2700, -0.4585, ..., 2.0363, 1.2657, -0.7060],\n",
" ...,\n",
" [ 1.6688, 1.7038, -1.9549, ..., -0.2052, 0.6270, 0.4598],\n",
" [ 0.0482, -2.3951, -0.4351, ..., 1.6230, -1.3662, -0.0390],\n",
" [ 0.8146, -2.6169, -0.6188, ..., 1.4525, 0.0507, 0.5177]]],\n",
" grad_fn=<NativeLayerNormBackward0>)\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"TEXT = \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
"\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(\n",
" VOCABULARY,\n",
" SPECIAL_VOC\n",
")\n",
"\n",
"TOKENIZATION = [TOKENANO.encode(TEXT), TOKENANO.encode(TEXT)]\n",
"print(TOKENIZATION)\n",
"\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"tensor: torch.Tensor = EMBEDDER(TOKENIZATION)\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"print(len(TOKENIZATION))\n",
"print(f\"Embedder Tensor: {tensor.shape}\")\n",
"print(f\"Values:\\n{tensor}\")\n",
"\n",
"BATCH_SIZE, TOKENS, DIMENSIONS = tensor.shape\n",
"PAD_MASK = torch.tensor([[True] * TOKENS] * BATCH_SIZE, dtype=torch.bool)\n",
"tensor, _ = ENCODER((tensor, PAD_MASK))\n",
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
"print(f\"Values:\\n{tensor}\")\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,263 +0,0 @@
import random
import torch
from pathlib import Path
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
# Evaluation script: loads the trained NanoSocrates weights (when a checkpoint
# exists), runs inference on every example produced by the test batcher and
# reports per-task metrics (RDF2TXT, TXT2RDF, masking, RDF completion).

# Route every new tensor to the device picked by the project's torch shim
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# Fix the seeds so that runs are reproducible
torch.manual_seed(0)
random.seed(0)

# Paths
MODEL_DIR = "Assets/Model/curated"
# MODEL_DIR= "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
# TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
MODEL_PATH = Path(f"{MODEL_DIR}/NanoSocrates.zip")

# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Constants
# The model token space is the tokenizer vocabulary plus MASK_EXTRA_SPACE
# additional ids.
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2

SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
# NOTE(review): the training script encodes "<END>" for END_TOKEN while this
# one encodes "<EOS>" - confirm both resolve to the intended special token.
END_TOKEN = TOKENANO.encode("<EOS>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
CONTINUTE_TOKEN = TOKENANO.encode("<CONTINUERDF>")[0]

# Every special token except the triple markers is handed to the masker as
# forbidden.
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS

# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)

TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
    VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER, debug=True)

# Model
NANOSOCRATES_TRAIN = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

NANOSOCRATES = Transformer.NanoSocratesCore(
    TOKEN_SPACE_SIZE,
    SENTENCE_LENGTH,
    SOS_TOKEN,
    PAD_TOKEN,
    END_TOKEN,
    CONTINUTE_TOKEN,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

if MODEL_PATH.is_file():
    nanosocrates_dict = torch.load(MODEL_PATH, weights_only=True, map_location=DEVICE)
    NANOSOCRATES_TRAIN.load_state_dict(nanosocrates_dict)
else:
    # Without a checkpoint the metrics below describe freshly initialised
    # weights; say so instead of reporting them silently.
    print(
        f"WARNING: no checkpoint found at {MODEL_PATH.as_posix()}, "
        "evaluating a model with freshly initialised weights"
    )

# ENCODER_ONLY / DECODER_ONLY are only switched to eval mode below; the
# evaluation loop itself runs NANOSOCRATES.inference exclusively.
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
    NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)

NANOSOCRATES = TUtils.train2inference(
    NANOSOCRATES_TRAIN,
    NANOSOCRATES
)

NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
NANOSOCRATES_TRAIN.eval()


def _replace_mask_tokens(token_ids: list[int]) -> list[int]:
    """Return a copy of *token_ids* where out-of-vocabulary ids become MASK_TOKEN.

    The tokenizer can only decode ids it knows, so ids above its vocabulary
    size are collapsed to the single printable ``<MASK>`` token.
    """
    # NOTE(review): the comparison is strict (>), so an id equal to
    # TOKENANO.vocabulary_size is left untouched - confirm whether >= is needed.
    limit = TOKENANO.vocabulary_size
    return [MASK_TOKEN if token > limit else token for token in token_ids]


task_1_metrics = []
task_2_metrics = []
task_3_metrics = []
task_4_metrics = []

with torch.no_grad():
    # enumerate() numbers every example; the counter used to advance only on
    # RDF2TXT examples, so the progress line repeated the same number.
    for example_num, example in enumerate(TEST_BATCHER.batch(1)):

        print(f"DOING Example: {example_num}")

        # Only the encoder side is needed for inference; the target is kept
        # as a plain list for decoding and scoring.
        src_x, tgt_y, pad_x, _pad_y, tasktype = example

        enc_x = torch.tensor(src_x)
        enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)

        out: torch.Tensor = NANOSOCRATES.inference((enc_x, enc_x_pad), tasktype)

        # Batches hold a single example, hence index 0 everywhere below.
        tokens: list[int] = out.tolist()[0]
        tokens.append(END_TOKEN)
        tokens = _replace_mask_tokens(tokens)
        out_string = TOKENANO.decode(tokens)

        exp_tokens: list[int] = _replace_mask_tokens(tgt_y[0])
        exp_string = TOKENANO.decode(exp_tokens)

        enc_tokens: list[int] = _replace_mask_tokens(src_x[0])
        enc_string = TOKENANO.decode(enc_tokens)

        print(f"PROMPT:\n{enc_string}")
        print(f"EXPECTED:\n{exp_string}")
        print(f"ACTUAL:\n{out_string}")

        if tasktype == Batch.TaskType.RDF2TXT:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref_str = TOKENANO.decode(ref)
            pred_str = TOKENANO.decode(pred)

            bleu, rouge, meteor = TUtils.rdf2txt([ref_str], [pred_str])

            task_1_metrics.append(
                [
                    bleu["bleu"], rouge["rougeL"], meteor["meteor"]  # type: ignore
                ]
            )

        if tasktype == Batch.TaskType.TEXT2RDF:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            # The first predicted token is dropped for this task only.
            pred = TUtils.remove_padding(tokens[1:], PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
            precision, recall = TUtils.txt2rdf(ref, pred)

            task_2_metrics.append(
                [
                    precision["precision"], recall["recall"]  # type: ignore
                ]
            )

        if tasktype == Batch.TaskType.MASKING:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
            accuracy = TUtils.accuracy(ref, pred)

            task_3_metrics.append(
                accuracy["accuracy"]  # type: ignore
            )

        if tasktype == Batch.TaskType.COMPLETATION:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
            precision, recall = TUtils.txt2rdf(ref, pred)

            task_4_metrics.append(
                [
                    precision["precision"], recall["recall"]  # type: ignore
                ]
            )

# Split the per-example rows into one column per metric
bleus = [row[0] for row in task_1_metrics]
rouges = [row[1] for row in task_1_metrics]
meteors = [row[2] for row in task_1_metrics]

prec_1 = [row[0] for row in task_2_metrics]
rec_1 = [row[1] for row in task_2_metrics]

acc = task_3_metrics

prec_2 = [row[0] for row in task_4_metrics]
rec_2 = [row[1] for row in task_4_metrics]

# Aggregate
BLEU = TUtils.average(bleus)
ROUGE = TUtils.average(rouges)
METEOR = TUtils.average(meteors)

PREC_1 = TUtils.average(prec_1)
REC_1 = TUtils.average(rec_1)
F1_1 = TUtils.f1(PREC_1, REC_1)

ACC = TUtils.average(acc)

PREC_2 = TUtils.average(prec_2)
REC_2 = TUtils.average(rec_2)
F1_2 = TUtils.f1(PREC_2, REC_2)

SEPARATOR = "**************************************************************************"
OUTPUT = "".join([
    f"{SEPARATOR}\n",
    "*\tRDF2TXT:\n",
    f"*\t\tBLEU: {BLEU} - ROUGE: {ROUGE} - METEOR: {METEOR}\n",
    f"{SEPARATOR}\n",
    "*\tTXT2RDF:\n",
    f"*\t\tPRECISION: {PREC_1} - RECALL: {REC_1} - F1: {F1_1}\n",
    f"{SEPARATOR}\n",
    "*\tRDF Completion 1:\n",
    f"*\t\tACCURACY: {ACC}\n",
    f"{SEPARATOR}\n",
    "*\tRDF Completion 2:\n",
    f"*\t\tPRECISION: {PREC_2} - RECALL: {REC_2} - F1: {F1_2}\n",
    f"{SEPARATOR}\n",
    ""
])

print(OUTPUT)

# Raw per-example values, kept for manual inspection
print("\nDEBUG")
print(task_1_metrics)
print(task_2_metrics)
print(task_3_metrics)
print(task_4_metrics)

View File

@ -1,157 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f5762da9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([3, 17, 7714])\n",
"torch.Size([3, 17])\n",
"tensor([[2034, 6523, 5406, 3985, 5406, 6523, 2034, 2034, 5745, 643, 5406, 7405,\n",
" 6523, 6230, 6419, 5745, 657],\n",
" [2458, 830, 5745, 5745, 5406, 3741, 2034, 5745, 6302, 6419, 5406, 2411,\n",
" 719, 830, 5745, 3189, 2775],\n",
" [2034, 5745, 5327, 4696, 6523, 643, 6419, 1671, 6302, 4406, 5745, 643,\n",
" 643, 1901, 1914, 1914, 719]])\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"\n",
"# Model Init\n",
"ENCODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"DECODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"DECODER = torch.nn.Sequential(\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"DETOKENER = Transformer.DeToken(\n",
" EMBEDDED_SIZE,\n",
" TOKEN_SPACE_SIZE\n",
")\n",
"\n",
"\n",
"# Data\n",
"TEXT = (\n",
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
")\n",
"OUT_TEXT = \"<START>\"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
"\n",
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"\n",
"BATCH_LEN = 3\n",
"\n",
"INPUT_TOKENIZATION = [\n",
" EN_IN\n",
"] * BATCH_LEN\n",
"OUTPUT_TOKENIZATION = [\n",
" DEC_IN\n",
"] * BATCH_LEN\n",
"\n",
"encoder_tensor_input = ENCODER_EMBEDDER(INPUT_TOKENIZATION)\n",
"encoder_padding_mask = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
"\n",
"encoder_output, _ = ENCODER((encoder_tensor_input, encoder_padding_mask))\n",
"\n",
"decoder_tensor_input = DECODER_EMBEDDER(OUTPUT_TOKENIZATION)\n",
"decoder_padding_mask = torch.tensor([[False] * MAX_LEN] * BATCH_LEN)\n",
"\n",
"decoder_output, _, _, _ = DECODER((decoder_tensor_input, encoder_output, encoder_output, None))\n",
"\n",
"logits: torch.Tensor = DETOKENER(decoder_output)\n",
"\n",
"print(logits.shape)\n",
"\n",
"# print(logits)\n",
"\n",
"most_probable_tokens = torch.argmax(logits, 2)\n",
"\n",
"print(most_probable_tokens.shape)\n",
"print(most_probable_tokens)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,472 +0,0 @@
import random
import sys
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
from Project_Model.Libs.Training.loss_saver import Log
# Training script: trains the full NanoSocrates model on the text<->RDF tasks
# and, once the pre-training epochs are over, the encoder-only (masking) and
# decoder-only (completion) heads. Validates and checkpoints every
# VALIDATION_STEPS / CHECKPOINT_STEPS epochs and stops early on lost patience.

# Fix the seeds so that runs are reproducible
torch.manual_seed(0)
random.seed(0)

# Route every new tensor to the device picked by the project's torch shim
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# Paths
CHECKPOINT_DIR = "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
CHECKPOINT_PATH = Path(f"{CHECKPOINT_DIR}/NanoSocrates.zip")
NANO_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/nano_optim.zip")
ENC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/enc_optim.zip")
DEC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/dec_optim.zip")
LAST_EPOCH_PATH = Path(f"{CHECKPOINT_DIR}/last_epoch.txt")

# Loss log
loss_saver = Log(f"{CHECKPOINT_DIR}/log_loss.csv")

# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Constants
# The model token space is the tokenizer vocabulary plus MASK_EXTRA_SPACE
# additional ids.
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = 300
PRETRAIN_EPOCHS = 20
WARMUP_EPOCHS = 30
MINI_BATCH_SIZE = 20
VALIDATION_STEPS = 10
CHECKPOINT_STEPS = VALIDATION_STEPS
PATIENCE = 4
# -1 marks a fresh run; otherwise the epoch written at the last checkpoint
CURRENT_EPOCH = -1 if not LAST_EPOCH_PATH.is_file() else int(LAST_EPOCH_PATH.read_text())
VERBOSE = False
LEARNING_RATE = 0.05
LABEL_SMOOTHING = 0.01

SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]

# Every special token except the triple markers is handed to the masker as
# forbidden.
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS

# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)

TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
    VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)

# Model
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

if CHECKPOINT_PATH.is_file():
    nanosocrates_dict = torch.load(CHECKPOINT_PATH, weights_only=True)
    NANOSOCRATES.load_state_dict(nanosocrates_dict)

_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
    NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)

# Training constants
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
encoder_ce = torch.nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)

nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters(), LEARNING_RATE)
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters(), LEARNING_RATE)
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters(), LEARNING_RATE)

# Restore optimizer states when resuming
if NANO_OPTIM_PATH.is_file():
    optim_dict = torch.load(NANO_OPTIM_PATH)
    nano_optim.load_state_dict(optim_dict)

if ENC_OPTIM_PATH.is_file():
    optim_dict = torch.load(ENC_OPTIM_PATH)
    encoder_only_optim.load_state_dict(optim_dict)

if DEC_OPTIM_PATH.is_file():
    optim_dict = torch.load(DEC_OPTIM_PATH)
    decoder_only_optim.load_state_dict(optim_dict)

nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH)
encoder_only_scheduler = Transformer.WarmupLR(
    encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
)
decoder_only_scheduler = Transformer.WarmupLR(
    decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
)

# A fresh run (CURRENT_EPOCH == -1) starts counting at epoch 1.
# NOTE(review): on resume the stored epoch is advanced by 2 as well - confirm
# this offset is intended, since the stored value is already post-increment.
current_epoch = CURRENT_EPOCH + 2
patience = 0

# Best (lowest) validation loss seen so far for each task family
average_loss_validation = {
    "txt": float("inf"),
    "encoder_only": float("inf"),
    "decoder_only": float("inf"),
}

SEPARATOR = "================================================================================================================"


def _mean_or_inf(values: list[float]) -> float:
    """Return the arithmetic mean of *values*, or ``inf`` when it is empty.

    ``inf`` is the sentinel this script already uses for "no loss available",
    so an epoch without batches of a given task no longer raises
    ZeroDivisionError.
    """
    return sum(values) / len(values) if values else float("inf")


def _replace_mask_tokens(token_ids: list[int]) -> list[int]:
    """Return a copy of *token_ids* where out-of-vocabulary ids become MASK_TOKEN."""
    # NOTE(review): strict comparison (>) kept as is - confirm whether an id
    # equal to TOKENANO.vocabulary_size should be remapped too.
    limit = TOKENANO.vocabulary_size
    return [MASK_TOKEN if token > limit else token for token in token_ids]


def _to_tensors(batch):
    """Turn one batcher item into the tensors consumed by the models.

    Returns:
        ``(enc_x, enc_x_pad, tgt, dec_x, dec_x_pad, tasktype)`` where ``dec_x``
        is the target shifted right by one position (teacher forcing) and the
        ``*_pad`` tensors are boolean padding masks.
    """
    src_x, tgt_y, pad_x, _pad_y, tasktype = batch

    enc_x = torch.tensor(src_x)
    batch_size, _ = enc_x.shape
    enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
    tgt = torch.tensor(tgt_y)

    dec_x = Transformer.get_decoder_input(
        batch_size, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
    )
    # Position i of the decoder input holds target token i - 1
    dec_x[:, 1:] = tgt[:, :-1]
    dec_x_pad = dec_x.eq(PAD_TOKEN)

    return enc_x, enc_x_pad, tgt, dec_x, dec_x_pad, tasktype


while current_epoch < MAX_EPOCHS:

    NANOSOCRATES.train()
    ENCODER_ONLY.train()
    DECODER_ONLY.train()

    # Per-batch losses of this epoch, stored as plain floats
    text_batch_losses: list[float] = []
    encoder_batch_losses: list[float] = []
    decoder_batch_losses: list[float] = []

    batch_counter = 0

    if VERBOSE:
        print(f"EPOCH {current_epoch} STARTING")

    for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):

        batch_counter += 1

        enc_x, enc_x_pad, tgt, dec_x, dec_x_pad, tasktype = _to_tensors(batch)

        if VERBOSE:
            for s in TUtils.decode_batch(enc_x, TOKENANO, MASK_TOKEN):
                print("Input")
                print(s)

            for s in TUtils.decode_batch(enc_x_pad, TOKENANO, MASK_TOKEN):
                print("Encoder Padding mask")
                print(s)

            for s in TUtils.decode_batch(tgt, TOKENANO, MASK_TOKEN):
                print("Desired Output")
                print(s)

            # Work on a copy: dec_x[:, :] is a view, so writing END_TOKEN
            # through it would corrupt the real decoder input used below.
            a_dx = dec_x.clone()
            a_dx[:, -1] = END_TOKEN
            for s in TUtils.decode_batch(a_dx, TOKENANO, MASK_TOKEN):
                print("Decoder Input")
                print(s)

            print(f"\tBATCH {batch_counter} Starting")

        # Task 1 and Task 2
        if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:

            if VERBOSE:
                print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")

            nano_optim.zero_grad()

            pred_logits: torch.Tensor = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
            # CrossEntropyLoss expects the class dimension second
            pred_logits = pred_logits.permute(0, 2, 1)

            loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt)

            loss.backward()
            nano_optim.step()

            # .item() stores a float instead of a tensor tied to the graph
            text_batch_losses.append(loss.item())
            continue

        # Pretrain first
        if current_epoch < PRETRAIN_EPOCHS:
            continue

        # Task 3
        if tasktype == Batch.TaskType.MASKING:

            if VERBOSE:
                print(f"\tExecuting TASK 3 - BATCH {batch_counter}")

            encoder_only_optim.zero_grad()

            pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
            pred_logits = pred_logits.permute(0, 2, 1)
            # print(torch.max(tgt))

            loss: torch.Tensor = encoder_ce(pred_logits, tgt)

            loss.backward()
            encoder_only_optim.step()

            # Sample dump of the first example, gated like every other
            # diagnostic in this loop.
            if VERBOSE:
                exp_string = TOKENANO.decode(_replace_mask_tokens(tgt[0].tolist()))
                enc_string = TOKENANO.decode(_replace_mask_tokens(enc_x[0].tolist()))

                print(f"PROMPT:\n{enc_string}")
                print(f"EXPECTED:\n{exp_string}")

            encoder_batch_losses.append(loss.item())
            continue

        # Task 4
        if tasktype == Batch.TaskType.COMPLETATION:

            if VERBOSE:
                print(f"\tExecuting TASK 4 - BATCH {batch_counter}")

            decoder_only_optim.zero_grad()

            pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
            pred_logits = pred_logits.permute(0, 2, 1)

            loss: torch.Tensor = decoder_ce(pred_logits, tgt)

            loss.backward()
            decoder_only_optim.step()

            decoder_batch_losses.append(loss.item())
            continue

    # The schedulers advance once per epoch
    nano_scheduler.step()
    encoder_only_scheduler.step()
    decoder_only_scheduler.step()

    current_epoch += 1

    if current_epoch % VALIDATION_STEPS == 0:

        NANOSOCRATES.eval()
        ENCODER_ONLY.eval()
        DECODER_ONLY.eval()

        txt_avg_batch_losses: list[float] = []
        enc_avg_batch_losses: list[float] = []
        dec_avg_batch_losses: list[float] = []

        with torch.no_grad():

            for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):

                enc_x, enc_x_pad, tgt, dec_x, dec_x_pad, tasktype = _to_tensors(batch)

                # Task 1 and Task 2
                if (
                    tasktype == Batch.TaskType.RDF2TXT
                    or tasktype == Batch.TaskType.TEXT2RDF
                ):
                    pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)

                    loss: torch.Tensor = nano_cross_entropy(
                        pred_logits, tgt
                    )

                    txt_avg_batch_losses.append(loss.item())
                    continue

                # Pretrain first
                if current_epoch <= PRETRAIN_EPOCHS:
                    continue

                # Task 3
                if tasktype == Batch.TaskType.MASKING:
                    pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)

                    loss: torch.Tensor = encoder_ce(pred_logits, tgt)

                    enc_avg_batch_losses.append(loss.item())
                    continue

                # Task 4
                if tasktype == Batch.TaskType.COMPLETATION:
                    pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)

                    loss: torch.Tensor = decoder_ce(pred_logits, tgt)

                    dec_avg_batch_losses.append(loss.item())
                    continue

        # The encoder/decoder lists stay empty while pre-training (the loop
        # above skips those tasks), so their averages resolve to inf.
        txt_avg_loss = _mean_or_inf(txt_avg_batch_losses)
        enc_avg_loss = _mean_or_inf(enc_avg_batch_losses)
        dec_avg_loss = _mean_or_inf(dec_avg_batch_losses)

        if current_epoch < PRETRAIN_EPOCHS:

            # While pre-training only the text loss drives the patience
            if txt_avg_loss < average_loss_validation["txt"]:
                average_loss_validation["txt"] = txt_avg_loss
            else:
                patience += 1
                if VERBOSE:
                    print(f"losing a patience, current irritation: {patience}")

        else:

            # Count how many of the three losses got worse than their best
            counter = 0

            if txt_avg_loss > average_loss_validation["txt"]:
                if VERBOSE:
                    print("txt average is higher than lowest")
                counter += 1
            else:
                average_loss_validation["txt"] = txt_avg_loss

            if enc_avg_loss > average_loss_validation["encoder_only"]:
                if VERBOSE:
                    print("masking average is higher than lowest")
                counter += 1
            else:
                average_loss_validation["encoder_only"] = enc_avg_loss

            if dec_avg_loss > average_loss_validation["decoder_only"]:
                if VERBOSE:
                    print("decoding only average is higher than lowest")
                counter += 1
            else:
                average_loss_validation["decoder_only"] = dec_avg_loss

            # Two or more regressions cost one patience; none gives one back
            if counter > 1:
                patience += 1
                if VERBOSE:
                    print(f"losing a patience, current irritation: {patience}")

            if counter == 0:
                patience = max(0, patience - 1)
                if VERBOSE:
                    print(f"all good, gaining a patience, current irritation: {patience}")

        # Training averages of the epoch that just finished; inf when a task
        # produced no batch (always the case for tasks 3/4 during pre-training)
        txt_train_avg_loss = _mean_or_inf(text_batch_losses)
        enc_avg_train_loss = _mean_or_inf(encoder_batch_losses)
        dec_avg_train_loss = _mean_or_inf(decoder_batch_losses)

        # write on log
        loss_saver.write([current_epoch, txt_train_avg_loss, enc_avg_train_loss, dec_avg_train_loss, txt_avg_loss, enc_avg_loss, dec_avg_loss])

        DEBUG_TEXT = "".join(
            [
                f"{SEPARATOR}\n",
                f"EPOCH {current_epoch}\n",
                f"{SEPARATOR}\n",
                "Train Losses:\n",
                "\tAvg Losses:\n",
                f"\t\tavg_txt: {txt_train_avg_loss} - avg_enc: {enc_avg_train_loss} - avg_dec: {dec_avg_train_loss}\n",
                f"{SEPARATOR}\n",
                "Validation Losses:\n",
                f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction_loss: {dec_avg_loss}\n",
                f"{SEPARATOR}\n",
            ]
        )

        print(DEBUG_TEXT)

    should_stop = patience == PATIENCE

    # Warn about patience
    if should_stop:
        print("Model is likely overfitting, so let's stop here")

    # SAVE MODEL
    if current_epoch % CHECKPOINT_STEPS == 0 or should_stop:
        print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
        torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
        torch.save(nano_optim.state_dict(), NANO_OPTIM_PATH)
        torch.save(encoder_only_optim.state_dict(), ENC_OPTIM_PATH)
        torch.save(decoder_only_optim.state_dict(), DEC_OPTIM_PATH)
        # write_text opens, writes and closes the file in one call
        LAST_EPOCH_PATH.write_text(f"{current_epoch}", encoding="utf-8")

    if should_stop:
        sys.exit(0)

View File

@ -1,224 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "adbd9598",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# set a default device\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 8\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 4\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
"\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
"\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs)\n",
" output_tokens = TOKENANO.encode(Abstract)[1:]\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
" output_tokens = TOKENANO.encode(RDFs)\n",
" input_tokens = TOKENANO.encode(Abstract)[1:]\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS\n",
")\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)\n",
"last_loss = 0\n",
"current_epoch = 0\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
" optimizer.zero_grad()\n",
"\n",
" encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])\n",
" decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])\n",
" src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)\n",
"\n",
" # Transform target into logits\n",
" target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])\n",
"\n",
" last_loss = 0\n",
" last_prediction: torch.Tensor\n",
"\n",
" for i in range(0, SENTENCE_LENGTH):\n",
"\n",
" optimizer.zero_grad()\n",
" tgt_padding = decoder_list.eq(PAD_TOKEN)\n",
"\n",
" logits: torch.Tensor = NANOSOCRATES((encoder_list, src_padding, decoder_list, tgt_padding))\n",
" prob = torch.softmax(logits, 2)\n",
"\n",
" most_probable_tokens = torch.argmax(prob, 2)\n",
" last_prediction = most_probable_tokens\n",
"\n",
" logits = logits[:,:i,:]\n",
" logits = logits.permute(0, 2, 1)\n",
"\n",
" loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])\n",
" # loss : torch.Tensor = cross_entropy(logits, target_logits)\n",
"\n",
" last_loss = loss\n",
" loss.backward()\n",
" optimizer.step()\n",
" scheduler.step()\n",
"\n",
" if i < SENTENCE_LENGTH - 1:\n",
" decoder_list[:,i+1] = target_logits[:,i]\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" current_epoch += 1\n",
"\n",
" if current_epoch % 1 == 0:\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
"\n",
" for encoded_sentence, expected_sentence in zip(\n",
" Transformer.tensor2token(last_prediction[:,:], END_TOKEN), # type: ignore\n",
" Transformer.tensor2token(target_logits[:,:], END_TOKEN)\n",
" ):\n",
" decoded_sentence = TOKENANO.decode(encoded_sentence)\n",
" decoded_target = TOKENANO.decode(expected_sentence)\n",
" print(f\"\\tACTUAL:\\n\\t\\t{decoded_sentence}\\n\\tEXPECTED:\\n\\t\\t{decoded_target}\\n\")\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,509 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "adbef43f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n",
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\lr_scheduler.py:192: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate\n",
" warnings.warn(\n"
]
},
{
"ename": "IndexError",
"evalue": "list index out of range",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mIndexError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 383\u001b[39m\n\u001b[32m 381\u001b[39m txt_min_train_losses = text_batch_losses[:][\u001b[32m0\u001b[39m]\n\u001b[32m 382\u001b[39m txt_avg_train_losses = text_batch_losses[:][\u001b[32m1\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m383\u001b[39m txt_max_train_losses = \u001b[43mtext_batch_losses\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 385\u001b[39m txt_min_loss = \u001b[38;5;28mmin\u001b[39m(txt_min_train_losses)\n\u001b[32m 386\u001b[39m txt_avg_min_loss = \u001b[38;5;28msum\u001b[39m(txt_min_train_losses) / \u001b[38;5;28mlen\u001b[39m(txt_min_train_losses)\n",
"\u001b[31mIndexError\u001b[39m: list index out of range"
]
}
],
"source": [
"import random\n",
"import sys\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TransformerUtils as TUtils\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"import Project_Model.Libs.Batch as Batch\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"\n",
"# set a default device\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"\n",
"# Get paths\n",
"VOCABULARY_PATH = Path(\"Assets/Model/small/bpe-small-16.json\")\n",
"TRAIN_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"VALIDATION_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TEST_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"CHECKPOINT_PATH = Path(\"Assets/Dataset/Tmp/NanoSocrates.zip\")\n",
"\n",
"\n",
"# BPE Init\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"MASK_EXTRA_SPACE = 25\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 8\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 4\n",
"MAX_EPOCHS = int(1e3)\n",
"PRETRAIN_EPOCHS = int(2)\n",
"WARMUP_EPOCHS = int(4e3)\n",
"MINI_BATCH_SIZE = 10\n",
"VALIDATION_STEPS = 1\n",
"CHECKPOINT_STEPS = VALIDATION_STEPS * 4\n",
"PATIENCE = 4\n",
"CURRENT_EPOCH = 0\n",
"\n",
"SOS_TOKEN = TOKENANO.encode(\"<SOS>\")[0]\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"SUBJ_TOKEN = TOKENANO.encode(\"<SUBJ>\")[0]\n",
"REL_TOKEN = TOKENANO.encode(\"<PRED>\")[0]\n",
"OBJ_TOKEN = TOKENANO.encode(\"<OBJ>\")[0]\n",
"\n",
"SPECIAL_TOKENS: set[int] = set(TOKENANO.encode(\"\".join(BPE.default_special_tokens())))\n",
"ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])\n",
"FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS\n",
"\n",
"\n",
"# Spanned_Masker\n",
"MASKER = Transformer.SpannedMasker(\n",
" TOKEN_SPACE_SIZE,\n",
" FORBIDDEN_TOKENS\n",
")\n",
"\n",
"TRAIN_BATCHER = Batch.Batcher(\n",
" TRAIN_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"VALIDATION_BATCHER = Batch.Batcher(\n",
" VALIDATION_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"TEST_BATCHER = Batch.Batcher(\n",
" TEST_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"\n",
"\n",
"# Model\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS\n",
")\n",
"_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(\n",
" NANOSOCRATES,\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE\n",
")\n",
"\n",
"\n",
"# Training constants\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())\n",
"decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())\n",
"\n",
"nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"encoder_only_scheduler = Transformer.WarmupLR(encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"decoder_only_scheduler = Transformer.WarmupLR(decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"\n",
"current_epoch = CURRENT_EPOCH\n",
"patience = 0\n",
"\n",
"\n",
"average_loss_validation = {\n",
" \"txt\": float(\"inf\"),\n",
" \"encoder_only\": float(\"inf\"),\n",
" \"decoder_only\": float(\"inf\")\n",
"}\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
" text_batch_losses = []\n",
" encoder_batch_losses = []\n",
" decoder_batch_losses = []\n",
"\n",
" for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):\n",
"\n",
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
"\n",
" enc_x = torch.tensor(src_x)\n",
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
" tgt = torch.tensor(tgt_y)\n",
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
"\n",
" # Task 1 and Task 2\n",
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" nano_optim.zero_grad()\n",
"\n",
"\n",
"\n",
" pred_logits = NANOSOCRATES((\n",
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" loss.backward()\n",
" nano_optim.step()\n",
"\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
" continue\n",
"\n",
"\n",
" # Pretrain first\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
" continue\n",
"\n",
"\n",
" # Task 3\n",
" if tasktype == Batch.TaskType.MASKING:\n",
"\n",
" encoder_only_optim.zero_grad()\n",
"\n",
" pred_logits = ENCODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
"\n",
" loss.backward()\n",
" encoder_only_optim.step()\n",
"\n",
" encoder_batch_losses.append(\n",
" loss.item()\n",
" )\n",
"\n",
" continue\n",
"\n",
"\n",
" # Task 4\n",
" if tasktype == Batch.TaskType.COMPLETATION:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" decoder_only_optim.zero_grad()\n",
"\n",
" pred_logits = DECODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" loss.backward()\n",
" decoder_only_optim.step()\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" decoder_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
"\n",
" continue\n",
"\n",
"\n",
" nano_scheduler.step()\n",
" encoder_only_scheduler.step()\n",
" decoder_only_scheduler.step()\n",
"\n",
" current_epoch += 1\n",
"\n",
" if current_epoch % VALIDATION_STEPS == 0:\n",
"\n",
" txt_avg_batch_losses = []\n",
" enc_avg_batch_losses = []\n",
" dec_avg_batch_losses = []\n",
"\n",
" for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):\n",
"\n",
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
"\n",
" enc_x = torch.tensor(src_x)\n",
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
" tgt = torch.tensor(tgt_y)\n",
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
"\n",
" # Task 1 and Task 2\n",
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
"\n",
"\n",
" pred_logits = NANOSOCRATES((\n",
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
" txt_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
"\n",
" continue\n",
"\n",
"\n",
" # Pretrain first\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
" continue\n",
"\n",
"\n",
" # Task 3\n",
" if tasktype == Batch.TaskType.MASKING:\n",
"\n",
" pred_logits = ENCODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
"\n",
" enc_avg_batch_losses.append(\n",
" loss.item()\n",
" )\n",
"\n",
" continue\n",
"\n",
"\n",
" # Task 4\n",
" if tasktype == Batch.TaskType.COMPLETATION:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" pred_logits = DECODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" dec_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
"\n",
" continue\n",
"\n",
" txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)\n",
" enc_avg_loss = float(\"inf\")\n",
" dec_avg_loss = float(\"inf\")\n",
"\n",
" if current_epoch >= PRETRAIN_EPOCHS:\n",
" enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)\n",
" dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)\n",
"\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
"\n",
" if txt_avg_loss < average_loss_validation[\"txt\"]:\n",
" average_loss_validation[\"txt\"] = txt_avg_loss\n",
" else:\n",
" patience += 1\n",
" else:\n",
"\n",
" counter = 0\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"txt\"]:\n",
" counter += 1\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"encoder_only\"]:\n",
" counter += 1\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"decoder_only\"]:\n",
" counter += 1\n",
"\n",
" if counter > 1:\n",
" patience += 1\n",
"\n",
"        txt_min_train_losses = [row[0] for row in text_batch_losses]\n",
"        txt_avg_train_losses = [row[1] for row in text_batch_losses]\n",
"        txt_max_train_losses = [row[2] for row in text_batch_losses]\n",
"\n",
" txt_min_loss = min(txt_min_train_losses)\n",
" txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)\n",
" txt_max_loss = max(txt_max_train_losses)\n",
" txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)\n",
" txt_avg_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)\n",
"\n",
" enc_avg_train_loss = float(\"inf\")\n",
"\n",
" dec_min_loss = float(\"inf\")\n",
" dec_avg_min_loss = float(\"inf\")\n",
" dec_max_loss = float(\"inf\")\n",
" dec_avg_max_loss = float(\"inf\")\n",
" dec_avg_loss = float(\"inf\")\n",
"\n",
" if current_epoch >= PRETRAIN_EPOCHS:\n",
" enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)\n",
"\n",
"            dec_min_train_losses = [row[0] for row in decoder_batch_losses]\n",
"            dec_avg_train_losses = [row[1] for row in decoder_batch_losses]\n",
"            dec_max_train_losses = [row[2] for row in decoder_batch_losses]\n",
"\n",
" dec_min_loss = min(dec_min_train_losses)\n",
" dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)\n",
" dec_max_loss = max(dec_max_train_losses)\n",
" dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)\n",
" dec_avg_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)\n",
"\n",
"\n",
" SEPARATOR = \"===========================================================================================\"\n",
" DEBUG_TEXT = \"\".join([\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"EPOCH {current_epoch}\"\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"Train Losses:\\n\"\n",
" f\"\\tMin Losses:\\n\"\n",
" f\"\\t\\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\\n\"\n",
" f\"\\t\\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\\n\"\n",
" f\"\\tMax Losses:\\n\"\n",
" f\"\\t\\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\\n\"\n",
" f\"\\t\\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\\n\"\n",
" f\"\\tAvg Losses:\\n\"\n",
" f\"\\t\\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\\n\"\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"Validation Losses:\\n\"\n",
" f\"\\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\"\n",
" f\"{SEPARATOR}\\n\",\n",
" ])\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" # Warn about patience\n",
" if patience == PATIENCE:\n",
" print(\n",
" \"Model is likely overfitting, so let's stop here\"\n",
" )\n",
"\n",
" # SAVE MODEL\n",
" if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:\n",
" print(f\"Saving model at {CHECKPOINT_PATH.as_posix()}\")\n",
" torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,433 +0,0 @@
import random
import sys
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
# Seed both torch's and Python's RNG so repeated runs start from the same state.
torch.manual_seed(0)
random.seed(0)
# Pick the backend reported by the project shim and make it the default device,
# so tensors created below without an explicit device are allocated there.
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# Input assets (BPE vocabulary, train/validation/test CSVs) and the file the
# training loop passes to torch.save when checkpointing.
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
CHECKPOINT_PATH = Path("Assets/Dataset/Tmp/NanoSocrates.zip")
# Tokenizer: learned BPE vocabulary plus the project's default special tokens.
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Hyper-parameters and schedule constants.
# MASK_EXTRA_SPACE widens the model's token space beyond the tokenizer's own
# vocabulary; presumably these extra ids are reserved for mask tokens produced
# by the SpannedMasker -- TODO confirm.
MASK_EXTRA_SPACE = 100
# Size handed to the masker (tokenizer vocabulary only).
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
# Size handed to the model (tokenizer vocabulary + extra ids).
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
# While current_epoch < PRETRAIN_EPOCHS the loop skips MASKING and
# COMPLETATION batches and only trains the RDF2TXT / TEXT2RDF tasks.
PRETRAIN_EPOCHS = int(10)
# Passed to every WarmupLR below.
# NOTE(review): the schedulers appear to be stepped once per epoch, and
# 4000 warm-up steps exceed MAX_EPOCHS (1000) -- confirm this is intended.
WARMUP_EPOCHS = int(4e3)
MINI_BATCH_SIZE = 100
# Validation runs every VALIDATION_STEPS epochs; checkpoints every 4th validation.
VALIDATION_STEPS = 5
CHECKPOINT_STEPS = VALIDATION_STEPS * 4
# Value of the `patience` counter at which the loop reports likely overfitting.
PATIENCE = 4
CURRENT_EPOCH = 0
# Ids of the special tokens: first id of each encoded marker.
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
# Every special-token id except the subject / predicate / object markers.
# The resulting set is handed to the masker; presumably these are the ids it
# must never mask -- verify against SpannedMasker.
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Span masker shared by the three batchers (train / validation / test).
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
# Full model plus the encoder-only and decoder-only models derived from it
# (the first element returned by decompose_nano_socrates is discarded).
# NOTE(review): only NANOSOCRATES.state_dict() is checkpointed by the loop;
# presumably the derived models share its sub-modules -- confirm.
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
# One loss, optimizer and LR scheduler per model.
# ignore_index=PAD_TOKEN: target positions equal to PAD_TOKEN do not contribute
# to the loss. AdamW is used with its default hyper-parameters.
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
encoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)
encoder_only_scheduler = Transformer.WarmupLR(
encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
)
decoder_only_scheduler = Transformer.WarmupLR(
decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
)
# Mutable training state read and updated by the training loop.
current_epoch = CURRENT_EPOCH
patience = 0
# Validation-loss baselines per task, all starting at +inf.
average_loss_validation = {
"txt": float("inf"),
"encoder_only": float("inf"),
"decoder_only": float("inf"),
}
def _mean_or_inf(values):
    """Return the arithmetic mean of *values*, or ``inf`` when it is empty."""
    return sum(values) / len(values) if values else float("inf")


def _loss_summary(step_losses):
    """Return ``[min, mean, max]`` of one batch's per-token losses."""
    return [min(step_losses), _mean_or_inf(step_losses), max(step_losses)]


def _batch_tensors(batch):
    """Convert one batcher item into the tensors shared by every task.

    Returns:
        ``(enc_x, enc_x_pad, dec_x, tgt, tasktype)``.
    """
    src_x, tgt_y, pad_x, _pad_y, tasktype = batch
    enc_x = torch.tensor(src_x)
    enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
    # Size the decoder input from the data itself: the batch may hold fewer
    # rows than MINI_BATCH_SIZE.
    dec_x = Transformer.get_decoder_input(
        enc_x.shape[0], SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
    )
    tgt = torch.tensor(tgt_y)
    return enc_x, enc_x_pad, dec_x, tgt, tasktype


def _token_step_losses(forward, loss_fn, target, decoder_input, optimizer=None):
    """Run a teacher-forced pass over every token position of one batch.

    Args:
        forward: zero-argument callable returning the model logits for the
            current state of the batch (it is re-invoked at every position).
        loss_fn: loss applied to the logits of the current position.
        target: gold token ids, indexed as ``target[:, position]``.
        decoder_input: decoder input tensor, filled in place with the gold
            token of each position once that position has been scored.
        optimizer: when given, one optimisation step is taken per position;
            when ``None`` the pass is evaluation only.

    Returns:
        The list of per-position loss values (one float per position).
    """
    step_losses = []
    for token_idx in range(SENTENCE_LENGTH):
        if optimizer is not None:
            optimizer.zero_grad()
        step_logits = forward()[:, token_idx, :]
        loss: torch.Tensor = loss_fn(step_logits, target[:, token_idx])
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        step_losses.append(loss.item())
        # Teacher forcing: expose the gold token to the next position.
        if token_idx < SENTENCE_LENGTH - 1:
            decoder_input[:, token_idx + 1] = target[:, token_idx]
    return step_losses


TEXT_TASKS = (Batch.TaskType.RDF2TXT, Batch.TaskType.TEXT2RDF)

while current_epoch < MAX_EPOCHS:
    NANOSOCRATES.train()
    ENCODER_ONLY.train()
    DECODER_ONLY.train()

    text_batch_losses = []
    encoder_batch_losses = []
    decoder_batch_losses = []
    batch_counter = 0
    print(f"EPOCH {current_epoch} STARTING")

    for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
        batch_counter += 1
        enc_x, enc_x_pad, dec_x, tgt, tasktype = _batch_tensors(batch)
        print(f"\tBATCH {batch_counter} Starting")

        # Task 1 and Task 2
        if tasktype in TEXT_TASKS:
            print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
            # The decoder padding mask is rebuilt on every call because
            # dec_x is progressively filled with gold tokens.
            step_losses = _token_step_losses(
                lambda: NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x.eq(PAD_TOKEN))),
                nano_cross_entropy,
                tgt,
                dec_x,
                nano_optim,
            )
            text_batch_losses.append(_loss_summary(step_losses))
            continue

        # Pretrain first: tasks 3 and 4 only start after PRETRAIN_EPOCHS.
        if current_epoch < PRETRAIN_EPOCHS:
            continue

        # Task 3
        if tasktype == Batch.TaskType.MASKING:
            print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
            encoder_only_optim.zero_grad()
            pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
            # CrossEntropyLoss expects the class dimension in position 1.
            pred_logits = pred_logits.permute(0, 2, 1)
            loss: torch.Tensor = encoder_ce(pred_logits, tgt)
            loss.backward()
            encoder_only_optim.step()
            encoder_batch_losses.append(loss.item())
            continue

        # Task 4
        if tasktype == Batch.TaskType.COMPLETATION:
            print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
            # NOTE(review): DECODER_ONLY only receives (enc_x, enc_x_pad); the
            # teacher-forced dec_x is updated but never fed to it -- confirm
            # this matches the decoder-only model's expected input.
            step_losses = _token_step_losses(
                lambda: DECODER_ONLY((enc_x, enc_x_pad)),
                decoder_ce,
                tgt,
                dec_x,
                decoder_only_optim,
            )
            decoder_batch_losses.append(_loss_summary(step_losses))

    nano_scheduler.step()
    encoder_only_scheduler.step()
    decoder_only_scheduler.step()
    current_epoch += 1

    if current_epoch % VALIDATION_STEPS == 0:
        NANOSOCRATES.eval()
        ENCODER_ONLY.eval()
        DECODER_ONLY.eval()

        txt_val_batch_losses = []
        enc_val_batch_losses = []
        dec_val_batch_losses = []

        # No parameter is updated during validation, so skip autograd.
        with torch.no_grad():
            for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
                enc_x, enc_x_pad, dec_x, tgt, tasktype = _batch_tensors(batch)

                # Task 1 and Task 2
                if tasktype in TEXT_TASKS:
                    step_losses = _token_step_losses(
                        lambda: NANOSOCRATES(
                            (enc_x, enc_x_pad, dec_x, dec_x.eq(PAD_TOKEN))
                        ),
                        nano_cross_entropy,
                        tgt,
                        dec_x,
                    )
                    txt_val_batch_losses.append(_mean_or_inf(step_losses))
                    continue

                # Pretrain first
                if current_epoch < PRETRAIN_EPOCHS:
                    continue

                # Task 3
                if tasktype == Batch.TaskType.MASKING:
                    pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)
                    loss = encoder_ce(pred_logits, tgt)
                    enc_val_batch_losses.append(loss.item())
                    continue

                # Task 4
                if tasktype == Batch.TaskType.COMPLETATION:
                    step_losses = _token_step_losses(
                        lambda: DECODER_ONLY((enc_x, enc_x_pad)),
                        decoder_ce,
                        tgt,
                        dec_x,
                    )
                    dec_val_batch_losses.append(_mean_or_inf(step_losses))

        # Validation averages; inf when a task produced no batch (e.g. tasks
        # 3 and 4 during the pre-training phase).
        txt_val_loss = _mean_or_inf(txt_val_batch_losses)
        enc_val_loss = _mean_or_inf(enc_val_batch_losses)
        dec_val_loss = _mean_or_inf(dec_val_batch_losses)

        if current_epoch < PRETRAIN_EPOCHS:
            if txt_val_loss < average_loss_validation["txt"]:
                average_loss_validation["txt"] = txt_val_loss
            else:
                patience += 1
        else:
            # Each task is compared against (and updates) its own best value;
            # patience grows when more than one task got worse.
            # NOTE(review): patience is cumulative and never reset on
            # improvement, as in the previous revision -- confirm intent.
            worse_tasks = 0
            for task_name, task_loss in (
                ("txt", txt_val_loss),
                ("encoder_only", enc_val_loss),
                ("decoder_only", dec_val_loss),
            ):
                if task_loss < average_loss_validation[task_name]:
                    average_loss_validation[task_name] = task_loss
                elif task_loss > average_loss_validation[task_name]:
                    worse_tasks += 1
            if worse_tasks > 1:
                patience += 1

        # Train statistics of the epoch that just finished. They use their own
        # names so the validation values above are not overwritten, and fall
        # back to inf when a task had no training batch this epoch.
        txt_min_train_losses = [row[0] for row in text_batch_losses]
        txt_avg_train_losses = [row[1] for row in text_batch_losses]
        txt_max_train_losses = [row[2] for row in text_batch_losses]
        txt_min_loss = min(txt_min_train_losses, default=float("inf"))
        txt_avg_min_loss = _mean_or_inf(txt_min_train_losses)
        txt_max_loss = max(txt_max_train_losses, default=float("inf"))
        txt_avg_max_loss = _mean_or_inf(txt_max_train_losses)
        txt_avg_train_loss = _mean_or_inf(txt_avg_train_losses)

        enc_avg_train_loss = _mean_or_inf(encoder_batch_losses)

        dec_min_train_losses = [row[0] for row in decoder_batch_losses]
        dec_avg_train_losses = [row[1] for row in decoder_batch_losses]
        dec_max_train_losses = [row[2] for row in decoder_batch_losses]
        dec_min_loss = min(dec_min_train_losses, default=float("inf"))
        dec_avg_min_loss = _mean_or_inf(dec_min_train_losses)
        dec_max_loss = max(dec_max_train_losses, default=float("inf"))
        dec_avg_max_loss = _mean_or_inf(dec_max_train_losses)
        dec_avg_train_loss = _mean_or_inf(dec_avg_train_losses)

        SEPARATOR = "================================================================================================================"
        DEBUG_TEXT = "".join(
            [
                f"{SEPARATOR}\n",
                f"EPOCH {current_epoch}\n",
                f"{SEPARATOR}\n",
                f"Train Losses:\n",
                f"\tMin Losses:\n",
                f"\t\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\n",
                f"\t\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\n",
                f"\tMax Losses:\n",
                f"\t\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\n",
                f"\t\tmax_dec: {dec_max_loss} - avg_dec: {dec_avg_max_loss}\n",
                f"\tAvg Losses:\n",
                f"\t\tavg_txt: {txt_avg_train_loss} - avg_enc: {enc_avg_train_loss} - avg_dec: {dec_avg_train_loss}\n",
                f"{SEPARATOR}\n",
                f"Validation Losses:\n",
                f"\ttxt_loss: {txt_val_loss} - masking_loss: {enc_val_loss} - prediction: {dec_val_loss}\n",
                f"{SEPARATOR}\n",
            ]
        )
        print(DEBUG_TEXT)

    out_of_patience = patience >= PATIENCE

    # Warn about patience
    if out_of_patience:
        print("Model is likely overfitting, so let's stop here")

    # SAVE MODEL
    if current_epoch % CHECKPOINT_STEPS == 0 or out_of_patience:
        print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
        CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
        torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)

    # Actually stop once the final checkpoint has been written.
    if out_of_patience:
        break

View File

@ -1,112 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "4ae47336",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"B, T, D = 4, 7, 32\n",
"x = torch.randn(B, T, D)\n",
"attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
"pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
"mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
"y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e38e3fb5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],\n",
"\n",
" [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.nn.functional.one_hot(torch.tensor([\n",
" [4, 1, 9],\n",
" [2,4,5]\n",
"]))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7119ad53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cpu')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.get_default_device()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8c95691a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"xpu\n"
]
}
],
"source": [
"from Project_Model.Libs.TorchShims import get_default_device\n",
"\n",
"print(get_default_device())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,177 +0,0 @@
import random
import time
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims
# Reproducibility: seed torch's and Python's random number generators.
torch.manual_seed(0)
random.seed(0)

# Make the backend detected by the project shim the default tensor device.
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# Tokenizer: toy BPE dictionary plus the default special tokens.
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Model / training constants.
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
PAD_TOKEN, END_TOKEN = (TOKENANO.encode(tag)[0] for tag in ("<PAD>", "<END>"))

# Toy dataset: every CSV row yields two samples, one per translation direction.
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)
TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []


def _add_toy_sample(source_tokens: list[int], target_tokens: list[int]) -> None:
    """Normalise one (source, target) pair and append it to the TOY_BATCH_* lists."""
    decoder_seed = TOKENANO.encode("<SOS>")
    source_tokens, source_padding = Transformer.normalize_sequence(
        source_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    )
    target_tokens, _ = Transformer.normalize_sequence(
        target_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    )
    decoder_seed, _ = Transformer.normalize_sequence(
        decoder_seed, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
    )
    TOY_BATCH_INPUT_LIST.append(source_tokens)
    TOY_BATCH_PADDING_LIST.append(source_padding)
    TOY_BATCH_TARGET_LIST.append(target_tokens)
    TOY_BATCH_DECODER_DEFAULT.append(decoder_seed)


for _, record in TOY_DATASET.iterrows():
    rdf_text: str = record["RDFs"]
    abstract_text: str = record["Abstract"]

    # RDF -> abstract (the first token of the encoded abstract is dropped).
    rdf_ids = TOKENANO.encode(rdf_text)
    abstract_ids = TOKENANO.encode(abstract_text)[1:]
    _add_toy_sample(rdf_ids, abstract_ids)

    # Abstract -> RDF, encoded afresh so each sample owns its token lists.
    rdf_ids = TOKENANO.encode(rdf_text)
    abstract_ids = TOKENANO.encode(abstract_text)[1:]
    _add_toy_sample(abstract_ids, rdf_ids)
# Training loop
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)
# Target positions equal to PAD_TOKEN do not contribute to the loss.
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)

# Set to True to also print the decoded prediction next to each target after
# every epoch (off by default: it is verbose and costs an argmax per step).
PRINT_PREDICTIONS = False

# These tensors never change, so they are built once instead of once per epoch.
encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST)
src_padding = torch.tensor(TOY_BATCH_PADDING_LIST, dtype=torch.bool)
target_logits = torch.tensor(TOY_BATCH_TARGET_LIST)  # gold token ids

LOSS_HISTORY = []
last_loss = 0.0
current_epoch = 0

while current_epoch < MAX_EPOCHS:
    # The decoder input is filled in place during the epoch (teacher forcing),
    # so it has to start again from the <SOS>-only default every epoch.
    decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT)

    last_loss = 0.0
    last_prediction = None
    LOSS_HISTORY = []
    start = time.time_ns()

    for i in range(SENTENCE_LENGTH):
        optimizer.zero_grad()
        # Rebuilt every step because decoder_list gains one gold token per step.
        tgt_padding = decoder_list.eq(PAD_TOKEN)
        logits: torch.Tensor = NANOSOCRATES(
            (encoder_list, src_padding, decoder_list, tgt_padding)
        )

        if PRINT_PREDICTIONS:
            # argmax over logits equals argmax over softmax(logits).
            last_prediction = logits.detach().argmax(dim=2)

        # Only position i is scored at step i.
        loss: torch.Tensor = cross_entropy(logits[:, i, :], target_logits[:, i])

        # Keep a plain float so the autograd graph is not retained.
        last_loss = loss.item()
        LOSS_HISTORY.append(last_loss)

        loss.backward()
        optimizer.step()
        scheduler.step()

        # Teacher forcing: expose the gold token to the next step.
        if i < SENTENCE_LENGTH - 1:
            decoder_list[:, i + 1] = target_logits[:, i]

    current_epoch += 1
    end = time.time_ns()

    MIN_LOSS = min(LOSS_HISTORY)
    MAX_LOSS = max(LOSS_HISTORY)
    AVERAGE_LOSS = sum(LOSS_HISTORY) / len(LOSS_HISTORY)
    print(f"EPOCH {current_epoch}\n\tTime: {(end-start)/1E9}s\n\tLoss: {last_loss}")
    print(f"\tMin Loss: {MIN_LOSS}\tAvg Loss: {AVERAGE_LOSS}\tMax Loss: {MAX_LOSS}\n")

    if PRINT_PREDICTIONS and last_prediction is not None:
        for encoded_sentence, expected_sentence in zip(
            Transformer.tensor2token(last_prediction, END_TOKEN),
            Transformer.tensor2token(target_logits, END_TOKEN),
        ):
            decoded_sentence = TOKENANO.decode(encoded_sentence)
            decoded_target = TOKENANO.decode(expected_sentence)
            print(
                f"\tACTUAL:\n\t\t{decoded_sentence}\n\tEXPECTED:\n\t\t{decoded_target}\n"
            )

View File

@ -1,60 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dd23cc94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Current detected architecture is: xpu\n"
]
}
],
"source": [
"import torch\n",
"from Project_Model.Libs.TorchShims import get_default_device\n",
"\n",
"DEVICE = get_default_device()\n",
"\n",
"print(f\"Current detected architecture is: {DEVICE.type}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6584882e",
"metadata": {},
"outputs": [],
"source": [
"import Project_Model.Libs.Transformer as Transformer\n",
"DECODER = Transformer.Decoder(256, 1024, 4)\n",
"print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -1,4 +0,0 @@
from abc import ABC
class Encoder(ABC):
    """Abstract marker base class for the encoders of this package.

    It declares no methods or attributes of its own.
    """

View File

@ -1,164 +0,0 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTraineRam:
    """Train a NanoSocrates BPE vocabulary keeping the whole corpus in RAM.

    The corpus file is read once and split around the special tokens; every
    BPE piece is stored as a list of integers and re-fitted on each training
    round until one of the stop conditions of ``trainBPE`` is met.
    """

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        """Store the training limits.

        Args:
            max_vocabulary (int): total vocabulary budget, reserved tokens included.
            special_vocabulary (list[str]): special tokens used to split the corpus.
            merge_treshold (int): minimum frequency a couple needs to be learned.
            max_iterations (int): round at which training stops; the counter is
                incremented before the check, so 0 never matches (no limit).
            print_after_iterations (int): print a report every this many rounds;
                0 (or less) disables the report.
        """
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        # NOTE(review): NanoSocratesBPE.vocabulary_size appears to already count
        # the 256 byte tokens, so they may be subtracted twice here - confirm.
        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:
        """Fit ``bpe`` (or a fresh encoder) on the corpus stored at ``path``.

        Training stops when a round learns nothing new, when the iteration
        limit is hit, or when the vocabulary limit is reached.

        Args:
            path (Path): corpus text file.
            bpe (NanoSocratesBPE | None): encoder to keep training, if any.

        Returns:
            NanoSocratesBPE: the trained encoder.

        Raises:
            FileNotFoundError: ``path`` is not an existing file.
        """
        if not path.is_file():
            raise FileNotFoundError(f"Corpus file not found: {path}")

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        # ">=" so an encoder that is already at the limit is not trained past it
        if BPE.vocabulary_size >= self.__max_vocabulary:
            return BPE

        current_iteration = 0
        data = self.__gather_data_from_file(path)

        while True:
            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            _, data, last_memory = self.__round_train(BPE, data)

            NEW_VOC_SIZE = BPE.vocabulary_size

            if (
                self.__print_after_iterations > 0
                and current_iteration % self.__print_after_iterations == 0
            ):
                DELIMITER = "==============="
                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            # nothing was learned in this round: more rounds cannot help
            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                break

            if current_iteration == self.__max_iterations:
                break

            if NEW_VOC_SIZE >= self.__max_vocabulary:
                break

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
        """Run one fitting round over ``data``, consuming the list.

        Returns:
            tuple: the encoder, the re-encoded pieces that still have at least
            two elements, and the frequency memory of the round.
        """
        DATA_LEN = len(data)
        PROGRESS_STEP = 1_000_000
        NEW_DATA = []
        counter = 0
        # honour the configured threshold (it used to be hard-coded to 0 here)
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        while len(data) > 0:
            counter += 1
            # the encoder learns its new couple only on the last piece
            last_batch = len(data) == 1
            piece = data.pop()

            bpe, memory, output = bpe.fit(piece, memory, last_batch)

            if counter % PROGRESS_STEP == 0:
                print(f"Fitted: {counter}/{DATA_LEN}")

            # a piece with fewer than two elements has no couple left to count
            if len(output) < 2:
                continue

            NEW_DATA.append(output)

        return (bpe, NEW_DATA, memory)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:
        """Read the corpus and return its BPE pieces as lists of UTF-8 bytes."""
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
        DATA: list[list[int]] = []

        with open(path, "r", encoding="utf-8") as FILE:
            file_string = FILE.read()

        for piece, token_type in SPLITTER.split_text(file_string):
            # special tokens are not part of the BPE training data
            if token_type != TokenType.BPE:
                continue
            DATA.append(self.__make_list_ids(piece))

        return DATA

    def __increment_counter(self, counter: int):
        """Return ``counter + 1`` (Python integers cannot overflow)."""
        return counter + 1

    def __make_list_ids(self, corpus: str):
        """Return the UTF-8 bytes of ``corpus`` as a list of integers."""
        return list(corpus.encode("utf-8"))

View File

@ -1,248 +0,0 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTrainer:
    """Train a NanoSocrates BPE vocabulary streaming the corpus from disk.

    Every round reads its input in chunks, fits the encoder and writes the
    produced text to one of two alternating cache files, which becomes the
    input of the following round. After each round the vocabulary and a small
    JSON status record are saved in the cache directory too.
    """

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        chunk_size: int,
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        """Store the training limits.

        Args:
            max_vocabulary (int): total vocabulary budget, reserved tokens included.
            special_vocabulary (list[str]): special tokens used to split the corpus.
            chunk_size (int): maximum number of characters per chunk read from disk.
            merge_treshold (int): minimum frequency a couple needs to be learned.
            max_iterations (int): round at which training stops; the counter is
                incremented before the check, so 0 never matches (no limit).
            print_after_iterations (int): print a report every this many rounds;
                0 (or less) disables the report.
        """
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        # NOTE(review): NanoSocratesBPE.vocabulary_size appears to already count
        # the 256 byte tokens, so they may be subtracted twice here - confirm.
        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__chunk_size = chunk_size
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        cache_dir: Path,
        bpe: NanoSocratesBPE | None = None,
        resume_from_iter: int = 0,
    ) -> NanoSocratesBPE:
        """Fit ``bpe`` (or a fresh encoder) on the corpus stored at ``path``.

        Args:
            path (Path): corpus text file.
            cache_dir (Path): existing directory for the intermediate files.
            bpe (NanoSocratesBPE | None): encoder to keep training, if any.
            resume_from_iter (int): when not 0, restart from the cache files
                written by that iteration; the vocabulary is reloaded from the
                cache and replaces ``bpe``.

        Returns:
            NanoSocratesBPE: the trained encoder.

        Raises:
            FileNotFoundError: ``path`` is not an existing file.
            NotADirectoryError: ``cache_dir`` is not an existing directory.
        """
        if not path.is_file():
            raise FileNotFoundError(f"Corpus file not found: {path}")

        if not cache_dir.is_dir():
            raise NotADirectoryError(f"Cache directory not found: {cache_dir}")

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        # ">=" so an encoder that is already at the limit is not trained past it
        if BPE.vocabulary_size >= self.__max_vocabulary:
            return BPE

        cached = False
        current_iteration = 0
        input_path = path

        NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0
        PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
        MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)

        if resume_from_iter != 0:
            cached = True
            current_iteration = resume_from_iter
            input_path = next(PATH_GEN)
            # UGLY: fixes a bug immediately, unfortunately
            # (the first pair of cache paths is skipped on purpose)
            _, _ = next(MEMORY_PATH_GEN)
            _, voc_cache_path = next(MEMORY_PATH_GEN)
            vocabulary = load_nanos_vocabulary(voc_cache_path)
            BPE = NanoSocratesBPE(vocabulary)

        while True:
            out_path = next(PATH_GEN)
            internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)

            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None
            # UTF-8 to match the chunker that reads this file back; the context
            # manager closes the handle even when a round fails half-way
            with open(out_path, "w", encoding="utf-8") as FILE:
                for _, memory, output in self.__round_train(input_path, BPE, cached):
                    last_memory = memory
                    FILE.write(output)

            internal_cache = {
                "finished_iter": current_iteration,
                "read_from": f"{input_path}",
                "wrote_to": f"{out_path}",
                "at": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S.%f"
                )[:-3],
            }

            VOCABULARY = BPE.vocabulary

            save_json(internal_cache, internal_cache_path)
            save_nanos_vocabulary(VOCABULARY, vocabulary_cache)

            # from now on the input is a file written by a previous round
            cached = True
            input_path = out_path

            NEW_VOC_SIZE = BPE.vocabulary_size

            if (
                self.__print_after_iterations > 0
                and current_iteration % self.__print_after_iterations == 0
            ):
                # last_memory stays None when the round produced no output
                FREQUENCIES = last_memory.frequencies if last_memory is not None else {}
                DELIMITER = "==============="
                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{FREQUENCIES}\n",
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            # nothing was learned in this round: more rounds cannot help
            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                break

            if current_iteration == self.__max_iterations:
                break

            if NEW_VOC_SIZE >= self.__max_vocabulary:
                break

        return BPE

    def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):
        """Fit ``bpe`` over the file at ``path``, one piece at a time.

        Yields:
            tuple: the encoder, the frequency memory and the text to write to
            the next cache file (the special token itself, or the string form
            of the encoded piece).
        """
        CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        BPE = bpe
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))

        for chunk, last_chunk in CHUNKER_GENERATOR:
            PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))

            for piece, last_piece in PIECE_GENERATOR:
                # the encoder learns its new couple only on the very last piece
                LAST_BATCH = last_chunk and last_piece
                PIECE, TOKEN_TYPE = piece

                if TOKEN_TYPE != TokenType.BPE:
                    # an empty fit still lets the last batch trigger the learning
                    BPE.fit([], memory, LAST_BATCH)
                    yield (BPE, memory, PIECE)
                    continue

                PIECE_DATA = self.__make_list_ids(PIECE, cached)
                _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)
                yield (BPE, memory, f"{out}")

    def __increment_counter(self, counter: int):
        """Return ``counter + 1`` (Python integers cannot overflow)."""
        return counter + 1

    def __make_list_ids(self, corpus: str, cached: bool):
        """Turn a piece into integers.

        Raw text becomes its UTF-8 bytes; a cached piece is parsed as a
        bracketed, comma separated list of integers.
        """
        if not cached:
            return list(corpus.encode("utf-8"))

        # Skip these chars "[" "]"
        INTS = corpus[1 : len(corpus) - 1]
        return list(map(int, INTS.split(",")))

    def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):
        """Yield the two corpus cache paths alternately, forever.

        An odd ``initial_iteration`` starts from the second file.
        """
        CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
        CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"

        switch = initial_iteration % 2 != 1

        while True:
            yield CORPUS_TMP_1 if switch else CORPUS_TMP_2
            switch = not switch

    def __switch_memory(self, cache_path: Path, initial_iteration: int):
        """Yield the (status, vocabulary) cache path pairs alternately, forever.

        An odd ``initial_iteration`` starts from the first pair.
        """
        INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
        INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"

        VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
        VOCAB_TMP_2 = cache_path / "voc-tmp2.json"

        switch = initial_iteration % 2 == 1

        while True:
            if switch:
                yield (INTERNAL_TMP_1, VOCAB_TMP_1)
            else:
                yield (INTERNAL_TMP_2, VOCAB_TMP_2)
            switch = not switch

View File

@ -1,280 +0,0 @@
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
def split(a, n):
    """Lazily cut ``a`` into ``n`` consecutive slices of near-equal length.

    The first ``len(a) % n`` slices hold one element more than the others.
    """
    base, extra = divmod(len(a), n)

    def edge(position):
        # start offset of the slice number ``position``
        return position * base + min(position, extra)

    return (a[edge(part) : edge(part + 1)] for part in range(n))
def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):
    """Fit one job: count the couple frequencies of every piece of a chunk.

    The chunk list is consumed from its end. No piece is treated as the last
    batch, so the encoder only counts here and learns nothing.

    Args:
        object: the (encoder, pieces) pair of the job.

    Returns:
        tuple: the encoder, the original pieces whose encoded form still has
        at least two elements, and the frequency memory of the job.
    """
    encoder, pending = object
    survivors: list[list[int]] = []
    job_memory = NanoSocratesBatchMemoryBPE({}, 0)

    while pending:
        raw_piece = pending.pop()
        encoder, job_memory, encoded = encoder.fit(raw_piece, job_memory, False)

        if len(encoded) >= 2:
            # We are sure of its type
            survivors.append(raw_piece)  # type: ignore

    return (encoder, survivors, job_memory)
def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):
    """Filter one job: keep the pieces that still encode to two or more ids.

    Args:
        object: the (encoder, pieces) pair of the job.

    Returns:
        list[list[int]]: the original (not encoded) pieces that survived.
    """
    encoder, pieces = object
    survivors: list[list[int]] = [
        raw_piece
        for raw_piece in pieces
        if len(encoder.encode_intermediate(raw_piece)) >= 2
    ]
    return survivors
class NanoSocraTrainerPool:
    """Train a NanoSocrates BPE vocabulary in RAM using a process pool.

    Every round splits the pieces across worker processes, sums the couple
    frequencies they return and lets the encoder learn one couple from them.
    The vocabulary is saved to the cache file after each round.
    """

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        """Store the training limits.

        Args:
            max_vocabulary (int): total vocabulary budget, reserved tokens included.
            special_vocabulary (list[str]): special tokens used to split the corpus.
            merge_treshold (int): minimum frequency a couple needs to be learned.
            max_iterations (int): round at which training stops; the counter is
                incremented before the check, so 0 never matches (no limit).
            print_after_iterations (int): print a report every this many rounds;
                0 (or less) disables the report.
        """
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        # NOTE(review): NanoSocratesBPE.vocabulary_size appears to already count
        # the 256 byte tokens, so they may be subtracted twice here - confirm.
        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    # TODO: add a resume function
    def trainBPE(
        self,
        path: Path,
        cache_file: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:
        """Fit ``bpe`` (or a fresh encoder) on the corpus stored at ``path``.

        Args:
            path (Path): corpus text file.
            cache_file (Path): file receiving the vocabulary after every
                round; it is created empty when missing.
            bpe (NanoSocratesBPE | None): encoder to keep training, if any.

        Returns:
            NanoSocratesBPE: the trained encoder.

        Raises:
            FileNotFoundError: ``path`` is not an existing file.
        """
        if not path.is_file():
            raise FileNotFoundError(f"Corpus file not found: {path}")

        if not cache_file.is_file():
            # create the (empty) cache file
            with cache_file.open("w"):
                pass

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size >= self.__max_vocabulary:
            return BPE

        current_iteration = 0
        data = self.__gather_data_from_file(path)
        # drop the pieces the given encoder already reduces below two ids
        data = self.__encode_from_cache(BPE, data)

        while True:
            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            start = time.time_ns()
            _, data, _ = self.__round_train(BPE, data)
            end = time.time_ns()

            NEW_VOC_SIZE = BPE.vocabulary_size

            VOCABULARY = BPE.vocabulary
            save_nanos_vocabulary(VOCABULARY, cache_file)

            if (
                self.__print_after_iterations > 0
                and current_iteration % self.__print_after_iterations == 0
            ):
                DELIMITER = "==============="
                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
                        f"\tTime elapsed: {(end - start)/1E9}s",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            # nothing was learned in this round: more rounds cannot help
            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                break

            if current_iteration == self.__max_iterations:
                break

            # ">=" so the limit cannot be stepped over
            if NEW_VOC_SIZE >= self.__max_vocabulary:
                break

        return BPE

    def __make_jobs(self, bpe: NanoSocratesBPE, data: list[list[int]]):
        """Split ``data`` into one job per usable CPU.

        Returns:
            tuple: the CPU count and the list of (encoder copy, chunk) jobs.

        Raises:
            RuntimeError: the CPU count cannot be determined.
        """
        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise RuntimeError("Unable to determine the number of usable CPUs")

        VOCABULARY = bpe.vocabulary
        JOBS = [
            (NanoSocratesBPE(VOCABULARY), chunk) for chunk in split(data, CPU_COUNT)
        ]
        return (CPU_COUNT, JOBS)

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
        """Run one parallel fitting round and learn at most one new couple.

        Returns:
            tuple: the encoder, the surviving pieces and the merged memory.
        """
        NEW_DATA: list[list[int]] = []
        MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CPU_COUNT, JOBS = self.__make_jobs(bpe, data)

        JOB_RESULTS: list[
            tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
        ]

        with Pool() as pool:
            JOB_RESULTS = pool.map(split_fit, JOBS)

        for joined, (_, job_output, job_memory) in enumerate(JOB_RESULTS, start=1):
            NEW_DATA.extend(job_output)

            # sum the per-job frequencies into the round memory
            for key, value in job_memory.frequencies.items():
                MEMORY.frequencies[key] = MEMORY.frequencies.get(key, 0) + value

            print(f"Joined {joined} out of {CPU_COUNT}")

        # Get new token
        bpe.fit([], MEMORY, True)

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return (bpe, NEW_DATA, MEMORY)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:
        """Read the corpus and return its BPE pieces as lists of UTF-8 bytes."""
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
        DATA: list[list[int]] = []

        with open(path, "r", encoding="utf-8") as FILE:
            file_string = FILE.read()

        for piece, token_type in SPLITTER.split_text(file_string):
            # special tokens are not part of the BPE training data
            if token_type != TokenType.BPE:
                continue
            DATA.append(self.__make_list_ids(piece))

        return DATA

    def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):
        """Filter ``data`` in parallel, keeping the pieces ``split_encode`` keeps."""
        NEW_DATA: list[list[int]] = []

        CPU_COUNT, JOBS = self.__make_jobs(bpe, data)

        JOB_RESULTS: list[list[list[int]]]

        with Pool() as pool:
            JOB_RESULTS = pool.map(split_encode, JOBS)

        for joined, job_output in enumerate(JOB_RESULTS, start=1):
            NEW_DATA.extend(job_output)
            print(f"Joined {joined} out of {CPU_COUNT}")

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return NEW_DATA

    def __increment_counter(self, counter: int):
        """Return ``counter + 1`` (Python integers cannot overflow)."""
        return counter + 1

    def __make_list_ids(self, corpus: str):
        """Return the UTF-8 bytes of ``corpus`` as a list of integers."""
        return list(corpus.encode("utf-8"))

View File

@ -1,219 +0,0 @@
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException
# ABOUT THE DICTIONARY:
# the string is converted into UTF-8 bytes, that is: each char is represented by a sequence of 1 to 4 bytes.
# Each byte is cast to an integer, so that an integer whose value is lower than 256
# represents a UTF-8 byte, while any other value is a token ID.
class NanoSocratesBatchMemoryBPE:
    """State carried across the batches of one training round.

    It holds the frequency of every couple of tokens seen so far and the
    merge threshold.
    """

    def __init__(
        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
    ) -> None:
        # both values are stored as given (the dictionary is not copied)
        self.frequencies, self.merge_treshold = frequencies, merge_treshold
class NanoSocratesBPE(Encoder):
    """Byte-pair encoder working on lists of integers.

    Values below 256 are raw UTF-8 bytes; every learned couple of values is
    mapped to a token id starting from 256.
    """

    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
        """Build the encoder, optionally from an existing vocabulary.

        Raises:
            OutOfDictionaryException: a token id of ``vocabulary`` is below 256.
        """
        super().__init__()

        self.__vocabulary: dict[tuple[int, int], int] = {}
        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}

        if vocabulary is not None:
            for couple, token_id in vocabulary.items():
                # values under 256 are used for unpaired char
                if token_id < 256:
                    raise OutOfDictionaryException()
                # TODO: check if they are in order
                self.__vocabulary[couple] = token_id
                self.__reverse_vocabulary[token_id] = couple

    @property
    def vocabulary_size(self):
        """Number of learned couples plus the 256 byte values."""
        return 256 + len(self.__vocabulary)

    @property
    def vocabulary(self):
        """The couple -> token id dictionary (the internal object, not a copy)."""
        return self.__vocabulary

    @property
    def __next_id(self) -> int:
        """
        Gets the next id

        Returns:
            int: the id to give to the next learned couple
        """
        return self.vocabulary_size

    def fit(
        self,
        chunk_data: list[int],
        memory: NanoSocratesBatchMemoryBPE,
        last_batch: bool,
    ):
        """Count the couples of a chunk and, on the last batch, learn one.

        Args:
            chunk_data (list[int]): the piece to fit on.
            memory (NanoSocratesBatchMemoryBPE): frequencies shared across
                batches; updated in place.
            last_batch (bool): when True the most frequent couple is learned,
                unless its frequency is below the merge threshold.

        Returns:
            tuple: this encoder, the memory and the encoded chunk.
        """
        encoded = self.encode_intermediate(chunk_data)
        outcome = (self, memory, encoded)

        # update frequency of each couple of adjacent elements
        counts = memory.frequencies
        for couple in zip(encoded, encoded[1:]):
            counts[couple] = counts.get(couple, 0) + 1

        if not last_batch or len(counts) < 1:
            return outcome

        best_couple = max(counts, key=counts.get)

        if counts[best_couple] < memory.merge_treshold:
            return outcome

        self.__learn_word(best_couple)
        return outcome

    def encode(self, piece: str) -> list[int]:
        """Encode a string into token IDs: it is first converted into UTF-8
        bytes, then the list of integers goes through encode_intermediate()

        Args:
            piece (str): text to encode

        Returns:
            list[int]: token ids
        """
        return self.encode_intermediate(list(piece.encode("utf-8")))

    def encode_intermediate(self, piece: list[int]) -> list[int]:
        """Encode a piece (as list of integers) as far as possible

        Args:
            piece (list[int]): piece to encode

        Returns:
            list[int]: piece encoded
        """
        current = piece
        while True:
            shorter = self.__round_encode(current)
            # a round that does not shorten the piece merged nothing
            if len(shorter) == len(current):
                return current
            current = shorter

    def __round_encode(self, piece: list[int]):
        """One left-to-right pass of encoding. Several passes are needed for a full encode: \n
        1) "ABAB" -> "XX"
        2) "XX" -> "Y"

        Args:
            piece (list[int]): the object to encode as a list of integers

        Returns:
            (list[int]): the object after a single pass
        """
        if len(piece) == 1:
            return piece

        last_index = len(piece) - 1
        merged: list[int] = []
        cursor = 0

        while cursor < last_index:
            token = self.__vocabulary.get((piece[cursor], piece[cursor + 1]))

            if token is None:
                # no token for this couple: keep its first element
                merged.append(piece[cursor])
                cursor += 1
            else:
                # the couple is replaced by its token
                merged.append(token)
                cursor += 2

            # a single trailing element has no partner: copy it as is
            if cursor == last_index:
                merged.append(piece[cursor])

        return merged

    # TODO: Remake decode to take a list of token IDs
    def decode(self, token_ids: list[int]) -> str:
        """Expand token ids back to bytes and decode them as UTF-8.

        Invalid byte sequences are ignored.

        Raises:
            OutOfDictionaryException: a token id of 256 or more is unknown.
        """
        # stack with the next id to process on top
        pending: list[int] = list(token_ids)[::-1]
        raw_bytes = bytearray()

        while pending:
            token_id = pending.pop()

            if token_id < 256:
                raw_bytes.append(token_id)
                continue

            left_token, right_token = self.__token_decode(token_id)
            # the left part must be processed first, so it goes on top
            pending.append(right_token)
            pending.append(left_token)

        return raw_bytes.decode("utf-8", errors="ignore")

    def __token_decode(self, token_id: int) -> tuple[int, int]:
        """Return the couple behind ``token_id``.

        Raises:
            OutOfDictionaryException: the id is not in the vocabulary.
        """
        couple = self.__reverse_vocabulary.get(token_id)
        if couple is not None:
            return couple
        raise OutOfDictionaryException()

    def __learn_word(self, words: tuple[int, int]):
        """Learn a new couple, giving it the next free token id

        Args:
            words (tuple[int, int]): the pair of elements to replace with a new token id

        Raises:
            DuplicateWordException: the couple is already in the vocabulary
        """
        new_id = self.__next_id

        if self.__vocabulary.get(words) is not None:
            raise DuplicateWordException()

        self.__vocabulary[words] = new_id
        self.__reverse_vocabulary[new_id] = words

View File

@ -1,70 +0,0 @@
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException
class NanoSocratesChunker:
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
self.__max_size: int = max_size
self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = ""
# max theorethical size of chars
# between special tokens:
# - min: size - len(longest_token)
# - MAX: size - len(shortest_token)
def chunk(self, file_path: Path):
# read_file
FILE = open(file_path, "r", encoding="utf-8")
exit = False
while not exit:
REMAINING_SIZE = self.__max_size - len(self.__residual)
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
FILE_CHUNK = FILE.read(READ_SIZE)
if len(FILE_CHUNK) == 0:
exit = True
continue
CHUNK = self.__append_residuals(FILE_CHUNK)
boundaries = self.__identify_boudaries(CHUNK)
if boundaries is None:
# boundaries not found in 2 chunks,
if len(CHUNK) > self.__max_size - 1:
raise DelimiterNotFoundException()
if exit:
yield CHUNK
self.__set_residual(0, CHUNK)
continue
start, end = boundaries
self.__set_residual(end, CHUNK)
yield CHUNK[start:end]
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
end = 0
for match in self.__special_token_regex.finditer(corpus):
# print(match)
end = match.end()
if end == 0:
return None
return (0, end)
def __append_residuals(self, corpus: str) -> str:
RESIDUAL = self.__residual
self.__residual = ""
return RESIDUAL + corpus
def __set_residual(self, index: int, corpus: str):
self.__residual = corpus[index:]

View File

@ -1,68 +0,0 @@
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException
class NanoSocratesSpecial(Encoder):
    """Encoder of the special tokens: each token text maps to a single id.

    Ids are assigned after the BPE ones, starting from
    ``bpe_vocabulary_size + 1``.
    """

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: list[str] | None = None
    ) -> None:
        """
        Args:
            bpe_vocabulary_size (int): size of the BPE vocabulary, used as offset.
            special_tokens (list[str] | None): tokens to register, in order;
                None (the default) registers nothing.
        """
        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        # "or []" replaces the former mutable default argument
        for index, TOKEN in enumerate(special_tokens or []):
            CANDIDATE_ID = self.__bpe_offset + index + 1
            self.__vocabulary[TOKEN] = CANDIDATE_ID
            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

    @property
    def __next_id(self):
        """First id above every id handed out so far.

        Based on the highest id in use, so that duplicated tokens (which make
        the vocabulary shorter than the id range) cannot cause a collision.
        """
        return max(self.__reverse_vocabulary, default=self.__bpe_offset) + 1

    @property
    def vocabulary_size(self) -> int:
        """Number of distinct special tokens."""
        return len(self.vocabulary)

    @property
    def vocabulary(self) -> dict[str, int]:
        """The token -> id dictionary (the internal object, not a copy)."""
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        """The id -> token dictionary (the internal object, not a copy)."""
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str):
        """Register ``word`` with the next free id; a known word is left untouched."""
        if word in self.__vocabulary:
            # re-adding would leave a stale id that still decodes to this word
            return

        CANDIDATE_INDEX = self.__next_id
        self.__vocabulary[word] = CANDIDATE_INDEX
        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

    def encode(self, word: str) -> list[int]:
        """Return the id of ``word`` as a one-element list.

        Raises:
            OutOfDictionaryException: ``word`` is not a known special token.
        """
        ID = self.__vocabulary.get(word)

        if ID is None:
            raise OutOfDictionaryException()

        return [ID]

    def decode(self, token_id: list[int]) -> str:
        """Return the token whose id is the only element of ``token_id``.

        Raises:
            OutOfDictionaryException: the list does not hold exactly one id,
                or the id is unknown.
        """
        if len(token_id) != 1:
            raise OutOfDictionaryException()

        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD

View File

@ -1,98 +0,0 @@
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType
class NanoSocratesSplitter:
    """Split text, or token ids, into BPE pieces and special tokens."""

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        """
        Args:
            special_token_regex (re.Pattern): compiled pattern of the special tokens.
            max_bpe_token_id (int): ids above this value are special tokens.
        """
        # attention: the regex is already compiled
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text using the given regex

        Text that follows the last special token is not yielded.

        Args:
            corpus (str): the whole string to split

        Yields:
            Generator[tuple[str, TokenType]]: a piece of the split text and its TokenType. \n
                TokenType tells if the string is for the BPE or a special token [BPE, SPECIAL]
        """
        bpe_start = 0

        for special_token_start, special_token_end in self.__find_boundaries(corpus):
            # FIND BPE: everything between the previous special token and this one
            BPE_TOKEN_TEXT = corpus[bpe_start:special_token_start]
            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # the next BPE text starts where this special token ends
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find the start and the end (not included) of every special token

        Args:
            corpus (str): the string where the special tokens are searched

        Yields:
            Generator[tuple[int, int]]: Note the end is not included
        """
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())

    def __split_words(self, bpe_piece: str) -> Generator[str]:
        """Cut ``bpe_piece`` before every space: a space opens the next word."""
        bound_start = 0

        for i, CANDIDATE_CHAR in enumerate(bpe_piece):
            if CANDIDATE_CHAR != " ":
                continue

            yield bpe_piece[bound_start:i]
            bound_start = i

        yield bpe_piece[bound_start:]

    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
        """Group token ids into runs of BPE ids and single special ids.

        Args:
            corpus (list[int]): token ids to split

        Yields:
            Generator[tuple[list[int], TokenType]]: a list of ids with its type;
                a special token always comes alone in its list
        """
        not_special_token_list: list[int] = []

        for token in corpus:
            if token > self.__max_bpe_token_id:
                # flush the BPE ids gathered before this special token
                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)

        # flush the ids after the last special token (they used to be dropped)
        if len(not_special_token_list) > 0:
            yield (not_special_token_list, TokenType.BPE)

View File

@ -1,8 +0,0 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TokeNano:
    """Placeholder class: its constructor takes no arguments and stores no state."""

    def __init__(self):
        return None

View File

@ -1,84 +0,0 @@
from pathlib import Path
from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType
from ..Enums import SpecialToken
class TokeNanoCore:
    """Tokenizer combining the BPE encoder with the special token encoder."""

    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
    ):
        """
        Args:
            bpe_vocabulary: learned couples and their token ids.
            special_token_list: texts of the special tokens.
        """
        bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
        special_regex = special_regex_maker(special_token_list)
        bpe_size = bpe_encoder.vocabulary_size

        self.__bpe_encoder = bpe_encoder
        # ids above the BPE vocabulary size are treated as special tokens
        self.__splitter = NanoSocratesSplitter(special_regex, bpe_size)
        self.__special_encoder = NanoSocratesSpecial(bpe_size, special_token_list)

    @property
    def vocabulary_size(self):
        """Sum of the BPE and special vocabulary sizes, plus one."""
        sizes = (
            self.__bpe_encoder.vocabulary_size,
            self.__special_encoder.vocabulary_size,
        )
        return sum(sizes) + 1

    def encode(self, corpus: str) -> list[int]:
        """Encode ``corpus`` into token ids, piece by piece."""
        token_ids: list[int] = []

        for piece, token_type in self.__splitter.split_text(corpus):
            if token_type == TokenType.SPECIAL:
                token_ids.extend(self.__special_encoder.encode(piece))
            elif token_type == TokenType.BPE:
                token_ids.extend(self.__bpe_encoder.encode(piece))

        return token_ids

    def encode_incomplete_string(self, corpus: str) -> list[int]:
        """
        Encode a string which doesn't end with a special token

        The corpus end token is appended before encoding and its id is
        removed from the result.
        """
        closed_corpus = corpus + SpecialToken.CORPUS_END.value
        return self.encode(closed_corpus)[:-1]

    def decode(self, corpus: list[int]) -> str:
        """Decode token ids back into text."""
        fragments: list[str] = []

        # every group is a list of ids; a special token comes alone in its list
        for tokens, token_type in self.__splitter.split_tokens(corpus):
            if token_type == TokenType.SPECIAL:
                fragments.append(self.__special_encoder.decode(tokens))
            elif token_type == TokenType.BPE:
                fragments.append(self.__bpe_encoder.decode(tokens))

        return "".join(fragments)

View File

@ -1,20 +0,0 @@
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
from .TokeNanoCore import TokeNanoCore
from .TokeNano import TokeNano
# Public API of the package: every class imported above is exported
# (NanoSocratesBatchMemoryBPE and NanoSocratesSpecial were missing).
__all__ = [
    "NanoSocratesChunker",
    "NanoSocratesSplitter",
    "NanoSocratesBPE",
    "NanoSocratesBatchMemoryBPE",
    "NanoSocraTrainer",
    "NanoSocraTraineRam",
    "NanoSocraTrainerPool",
    "NanoSocratesSpecial",
    "TokeNanoCore",
    "TokeNano",
]

View File

@ -1,27 +0,0 @@
from enum import Enum
class SpecialToken(Enum):
    """Special token strings; each member's value is the literal token text."""

    # (Enum, str) -> throws an error
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    ## Tasks' Token
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"
    # BPE Training:
    # NanoSocrates
    START = "<START>"
    CORPUS_END = "<END>"
    START_OF_SEQUENCE = "<SOS>"
    END_OF_SEQUENCE = "<EOS>"
    PAD = "<PAD>"

View File

@ -1,6 +0,0 @@
from enum import Enum, auto
class TokenType(Enum):
    """Kind of a split piece: a special token or text for the BPE."""

    SPECIAL = auto()  # the piece is a special token
    BPE = auto()  # the piece is plain text / ids handled by the BPE

View File

@ -1,6 +0,0 @@
from .TokenType import TokenType
from .SpecialToken import SpecialToken
# Public API of the package: both imported enums are exported
# (TokenType was missing).
__all__ = [
    "SpecialToken",
    "TokenType",
]

View File

@ -1,4 +0,0 @@
class DelimiterNotFoundException(Exception):
    """Error signalling that an expected delimiter was not found.

    It takes the same positional arguments as ``Exception``.
    """

View File

@ -1,4 +0,0 @@
class DuplicateWordException(Exception):
    """Error signalling that a word is already present where it must be new.

    It takes the same positional arguments as ``Exception``.
    """

View File

@ -1,4 +0,0 @@
class OutOfDictionaryException(Exception):
    """Error signalling a token, id or entry that is outside the dictionary.

    It takes the same positional arguments as ``Exception``.
    """

View File

@ -1,4 +0,0 @@
class SentenceTooLongException(Exception):
    """Error type for a sentence that is too long.

    It takes the same positional arguments as ``Exception``.
    """

View File

@ -1,11 +0,0 @@
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException
__all__ = [
"DelimiterNotFoundException",
"OutOfDictionaryException",
"DuplicateWordException",
"SentenceTooLongException"
]

View File

@ -1,15 +0,0 @@
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
from .special_regex_maker import special_regex_maker
from .default_special_tokens import default_special_tokens
__all__ = [
"special_regex_maker",
"iterator_with_checks",
"save_nanos_vocabulary",
"load_nanos_vocabulary",
"save_json", "load_json",
"default_special_tokens"
]

View File

@ -1,4 +0,0 @@
from ..Enums import SpecialToken
def default_special_tokens() -> list[str]:
    """Return the value of every ``SpecialToken`` member, in declaration order."""
    token_texts: list[str] = []
    for member in SpecialToken:
        token_texts.append(member.value)
    return token_texts

View File

@ -1,18 +0,0 @@
import json
from pathlib import Path
def save_json(dictionary: dict, path: Path):
    """Serialize ``dictionary`` as JSON and write it to ``path``.

    The file is overwritten. The context manager closes it even when the
    write fails.

    Args:
        dictionary (dict): JSON-serializable data.
        path (Path): destination file.
    """
    json_string = json.dumps(dictionary)

    with open(path, "w", encoding="utf-8") as FILE:
        FILE.write(json_string)
def load_json(path: Path) -> dict:
    """Read ``path`` and return its content parsed with ``json.loads``.

    Args:
        path: JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: if the file cannot be opened or read.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # The context manager closes the handle even when reading fails;
    # JSON text is read as UTF-8 regardless of the platform locale.
    with open(path, "r", encoding="utf-8") as file:
        json_string = file.read()
    return json.loads(json_string)

View File

@ -1,27 +0,0 @@
from collections import deque
from typing import Generator, TypeVar
T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")


def iterator_with_checks(
    generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:
    """Yield ``(element, is_last)`` pairs for every element of ``generator``.

    The wrapper always reads one element ahead, so it can tell whether the
    element it is about to yield is the final one.

    Args:
        generator: source of elements; it is advanced with ``next``.

    Yields:
        ``(element, False)`` for every element except the final one and
        ``(element, True)`` for the final one.

    Returns:
        The value carried by the source's ``StopIteration`` (``None`` for
        plain iterators).
    """
    # A StopIteration escaping a generator body is converted into a
    # RuntimeError (PEP 479), so an empty source must be handled explicitly.
    try:
        pending = next(generator)
    except StopIteration as stop:
        return stop.value

    while True:
        try:
            upcoming = next(generator)
        except StopIteration as stop:
            # Nothing follows ``pending``: it is the last element.
            yield (pending, True)
            return stop.value
        yield (pending, False)
        pending = upcoming

View File

@ -1,15 +0,0 @@
import re
def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
    """Compile a regex that matches any of the given special tokens.

    Every token is treated as literal text: it is escaped before being
    joined, so a token containing regex metacharacters (``.``, ``|``,
    ``?``, ...) cannot alter the pattern.

    Args:
        special_tokens (list[str]): the special tokens to match. Alternatives
            are tried in list order. An empty list produces the empty
            pattern, which matches the empty string at every position.

    Returns:
        re.Pattern: the compiled alternation of the escaped tokens.
    """
    REGEX_STR = "|".join(re.escape(token) for token in special_tokens)
    return re.compile(REGEX_STR)

View File

@ -1,49 +0,0 @@
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException
def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:
    """Serialize a vocabulary keyed by tuples into a JSON string.

    JSON object keys must be strings, so each tuple key is replaced by its
    textual form (for example ``"(1, 2)"``) before dumping.
    """
    serializable: dict[str, int] = {
        str(pair): token_id for pair, token_id in vocabulary.items()
    }
    return json.dumps(serializable)
def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:
    """Parse a JSON string back into a vocabulary keyed by integer pairs.

    Each key is expected to look like ``"(a, b)"``: the first and last
    characters are dropped and the remainder is split on commas.

    Raises:
        OutOfDictionaryException: if a key does not contain exactly two
            comma-separated integers.
        ValueError: if a piece of a key cannot be converted with ``int``.
    """
    parsed: dict[str, int] = json.loads(json_string)
    vocabulary: dict[tuple[int, int], int] = {}

    for text_key, token_id in parsed.items():
        # Strip the surrounding parentheses, then read the numbers.
        inner = text_key[1:-1]
        pair = tuple(int(piece) for piece in inner.split(","))

        if len(pair) != 2:
            raise OutOfDictionaryException()

        # The length check above guarantees a 2-tuple.
        vocabulary[pair] = token_id  # type: ignore

    return vocabulary
def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):
    """Write ``vocabulary`` to ``path`` as JSON text.

    The text is produced by ``nanos_vocabulary2json_str``.

    Args:
        vocabulary: mapping to store.
        path: destination file; it is created or overwritten.

    Raises:
        OSError: if the file cannot be opened or written.
    """
    json_string = nanos_vocabulary2json_str(vocabulary)
    # The context manager closes the handle even when the write fails;
    # the encoding is pinned so the result does not depend on the locale.
    with open(path, "w", encoding="utf-8") as file:
        file.write(json_string)
def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
    """Read ``path`` and rebuild the vocabulary stored in it.

    The file content is decoded by ``nanos_json_str2vocabulary``.

    Args:
        path: file previously written as JSON text.

    Returns:
        The vocabulary produced by ``nanos_json_str2vocabulary``.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the handle even when reading fails.
    with open(path, "r", encoding="utf-8") as file:
        json_string = file.read()
    return nanos_json_str2vocabulary(json_string)

View File

@ -1,9 +0,0 @@
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *
from . import Classes
from . import Enums
from . import Errors
from . import Utils

View File

@ -1,243 +0,0 @@
import random
import sys
from typing import Any, Generator
import pandas as pd
from pathlib import Path
from ..Enums import TaskType
import Project_Model.Libs.BPE as BPE
# from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.Transformer import (
SpannedMasker,
truncate_rdf_list,
normalize_sequence,
)
from Project_Model.Libs.BPE import SpecialToken
class Batcher:
    """Turn a CSV dataset into tokenized, normalized batches for four tasks.

    For every chunk read from the CSV, ``batch`` yields one tuple per task,
    in this order: RDF2TXT, TEXT2RDF, MASKING, COMPLETATION.
    """

    def __init__(
        self,
        dataset_path: Path,
        max_length: int,
        tokenizer: BPE.TokeNanoCore,
        masker: SpannedMasker,
        seed: int = 0,
        debug = False
    ) -> None:
        """Store the collaborators and settings used by ``batch``.

        Args:
            dataset_path: CSV file read by ``batch``; the "Abstract" and
                "RDFs" columns are used.
            max_length: length handed to ``normalize_sequence`` and used to
                cut RDF token lists.
            tokenizer: object whose ``encode`` / ``decode`` are called here.
            masker: object whose ``reseed`` / ``mask_sequence`` are called here.
            seed: seed for the per-run RNG and for the masker.
            debug: stored, but not read anywhere in this class.
        """
        # Dataset columns: ABSTRACT, TRIPLE
        # Tasks:
        #   rdf2text:    X: TRIPLE,            Y: ABSTRACT
        #   text2rdf:    X: ABSTRACT,          Y: TRIPLE
        #   masking (calls the masker): X: incomplete triple, Y: complete triple
        #   completion:  X: TRIPLE subset,     Y: related TRIPLE subset
        # Sequences are cut to max_length; the masker is injected by the
        # caller, the truncator is the module-level truncate_rdf_list.
        self._dataset_path = dataset_path
        self._tokenizer = tokenizer
        self._masker = masker
        self.__max_length = max_length
        self._seed = seed
        # self._token_completation = TokenCompletationTransformer(sotl,eos)
        self._completation_task_token_truncator = truncate_rdf_list
        self.__debug = debug

    def batch(self, batch_size) -> Generator[
        tuple[
            list[list[int]],
            list[list[int]],
            list[list[int]],
            list[list[int]],
            TaskType
        ],
        Any,
        Any,
    ]:
        """Read the dataset in chunks and yield one batch per task and chunk.

        Args:
            batch_size: number of CSV rows per chunk (``chunksize`` of
                ``pd.read_csv``).

        Yields:
            ``(X, Y, padding_X, padding_Y, task)`` four times per chunk, with
            ``task`` being RDF2TXT, TEXT2RDF, MASKING and COMPLETATION in
            that order.
        """
        # A fresh RNG and a reseeded masker on every call: iterating twice
        # with the same seed replays the same random choices.
        RNG = random.Random(self._seed)
        self._masker.reseed(self._seed)

        for batch in pd.read_csv(self._dataset_path, chunksize=batch_size):

            tokenized_batch = pd.DataFrame()
            # encode both text columns element by element
            tokenized_batch[["Abstract", "RDFs"]] = batch[["Abstract", "RDFs"]].map(
                lambda t: self._tokenizer.encode(t)
            )

            X, Y, padding_X, padding_Y = self.__rdf2txt_transformation(tokenized_batch)
            yield X, Y, padding_X, padding_Y, TaskType.RDF2TXT
            (
                X,
                Y,
                padding_X,
                padding_Y,
            ) = self.__txt2rdf_transformation(tokenized_batch)
            yield X, Y, padding_X, padding_Y, TaskType.TEXT2RDF
            (
                X,
                Y,
                padding_X,
                padding_Y,
            ) = self.__masking_trasformation(tokenized_batch)
            yield X, Y, padding_X, padding_Y, TaskType.MASKING
            # The completion task gets a new seed per chunk, drawn from RNG.
            (
                X,
                Y,
                padding_X,
                padding_Y,
            ) = self.__token_completation_task(
                tokenized_batch, RNG.randint(0, sys.maxsize)
            )
            yield X, Y, padding_X, padding_Y, TaskType.COMPLETATION

            # output = pd.concat([rdf2txt_batch,txt2rdf_batch,completation_batch],ignore_index=True)
            # output = output.sample(frac=1).reset_index(drop=True)
            # self.decode_debug(output)
            # yield output

    def __random_subset_rdfs(self, batch: pd.DataFrame, seed=0):
        """Work in progress: split each "RDFs" cell into a list of pieces.

        The cell is split on the START_TRIPLE token text and the piece before
        the first occurrence is dropped. ``batch`` is modified in place and
        nothing is returned; ``rng`` is created but not used yet.
        """
        # WIP
        rng = random.Random(seed)

        def to_list(x):
            return x.split(SpecialToken.START_TRIPLE.value)[1:]

        batch["RDFs"] = batch["RDFs"].map(to_list)

    def decode_debug(self, batch: pd.DataFrame):
        """Decode the "X" and "Y" columns with the tokenizer and print them."""
        decoded = pd.DataFrame()
        decoded[["X", "Y"]] = batch[["X", "Y"]].map(lambda t: self._tokenizer.decode(t))
        print(decoded)

    def __normalization(
        self, X: list[list[int]], Y: list[list[int]]
    ) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
        """Run ``normalize_sequence`` on every sequence of ``X`` and ``Y``.

        Returns:
            ``(out_X, out_Y, padding_X, padding_Y)``: the normalized sequences
            and, for each of them, the second value returned by
            ``normalize_sequence``.
        """
        # Token ids of the PAD and END_OF_SEQUENCE special tokens.
        pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
        end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
        out_X = []
        padding_X = []
        out_Y = []
        padding_Y = []

        for x in X:
            # NOTE(review): the meaning of the trailing True flag is defined
            # by normalize_sequence, which is not visible here.
            out_x, padding_x = normalize_sequence(
                x, self.__max_length, pad_token, end_token, True
            )
            out_X.append(out_x)
            padding_X.append(padding_x)

        for y in Y:
            out_y, padding_y = normalize_sequence(
                y, self.__max_length, pad_token, end_token, True
            )
            out_Y.append(out_y)
            padding_Y.append(padding_y)

        return out_X, out_Y, padding_X, padding_Y

    def __rdf2txt_transformation(self, batch: pd.DataFrame):
        """Build the RDF -> text task: X = task token + RDFs, Y = Abstract."""
        X: list[list[int]]
        task_token = self._tokenizer.encode(SpecialToken.RDF_TO_TEXT.value)
        out = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]
        # Prepend the task token ids to every input sequence.
        out["X"] = [task_token + x for x in out["X"]]
        return self.__normalization(out["X"].to_list(), out["Y"].to_list())

    def __txt2rdf_transformation(self, batch: pd.DataFrame):
        """Build the text -> RDF task: X = task token + Abstract, Y = RDFs."""
        task_token = self._tokenizer.encode(SpecialToken.TEXT_TO_RDF.value)
        out = batch.rename(columns={"Abstract": "X", "RDFs": "Y"})[["X", "Y"]]
        # Prepend the task token ids to every input sequence.
        out["X"] = [task_token + x for x in out["X"]]
        return self.__normalization(out["X"].to_list(), out["Y"].to_list())

    def __masking_trasformation(self, batch: pd.DataFrame):
        """Build the masking task from the "RDFs" column.

        Each RDF token list is cut to ``max_length`` and handed to the
        masker, which returns the (x, y) pair for that row.
        """
        X = []
        Y = []
        for rdf in batch["RDFs"]:
            x, y = self._masker.mask_sequence(rdf[:self.__max_length])
            X.append(x)
            Y.append(y)
        return self.__normalization(X, Y)

    def __token_completation_task(self, batch: pd.DataFrame, minibatch_seed: int):
        """Build the completion task from the "RDFs" column.

        Each RDF token list is cut to ``max_length`` and split into (x, y) by
        the truncator stored in ``__init__``.
        """
        continue_triple_token = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[
            0
        ]
        eot = self._tokenizer.encode(SpecialToken.END_TRIPLE.value)[0]
        X = []
        Y = []
        for rdf in batch["RDFs"]:
            # first cut to max_length
            rdf = rdf[: self.__max_length]  # the truncator works on "eot", so the cut is not a problem
            # NOTE(review): 0.5 is passed to the truncator as a fixed ratio;
            # its exact meaning is defined by truncate_rdf_list.
            x, y = self._completation_task_token_truncator(
                rdf, 0.5, continue_triple_token, eot, minibatch_seed
            )
            X.append(x)
            Y.append(y)
        return self.__token_cmpletation_task_special_normalization(X, Y)

    def __token_cmpletation_task_special_normalization(self, X: list[list[int]], Y: list[list[int]]
    ) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
        """Normalize the completion task with a custom padding mask for X.

        Y is handled as in ``__normalization``. For X the padding returned by
        ``normalize_sequence`` is discarded and replaced by a mask that is
        False up to and including the first CONTINUE_RDF token and True
        after it.
        """

        def continue_rdf_padding(sequence: list[int], pad_token: int):
            # ``pad_token`` is the marker to look for; the caller passes the
            # CONTINUE_RDF id, not the PAD id.
            for i, x in enumerate(sequence):
                if x == pad_token:
                    i = i+1  # the marker itself stays unmasked (False)
                    # fill the tail with True and stop
                    return [False] * i + [True] * (len(sequence) - i)
            return [False] * len(sequence)  # marker not found: nothing is masked

        pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
        end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
        continue_rdf = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[0]

        out_X = []
        padding_X = []
        out_Y = []
        padding_Y = []

        for x in X:
            out_x, _ = normalize_sequence(
                x, self.__max_length, pad_token, end_token, True
            )
            out_X.append(out_x)
            # padding_X.append(padding_x)
            special_padding = continue_rdf_padding(out_x,continue_rdf)
            padding_X.append(special_padding)

        for y in Y:
            out_y, padding_y = normalize_sequence(
                y, self.__max_length, pad_token, end_token, True
            )
            out_Y.append(out_y)
            # special padding
            # special_padding = continue_rdf_padding(out_y,continue_rdf)
            # padding_Y.append(special_padding)
            padding_Y.append(padding_y)

        return out_X, out_Y, padding_X, padding_Y
if __name__ == "__main__":
    # Manual smoke run: load a vocabulary, build tokenizer and masker, encode
    # one sample sentence and print every batch produced from the dataset.
    # Both paths are relative, so the working directory matters.

    DATASET_PATH = Path("Assets/Dataset/Tmp/rdf_text.csv")
    VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"

    # NOTE(review): Path is already used on the line above, so it must be
    # imported earlier in the file as well; this import is redundant.
    from pathlib import Path

    VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
    SPECIAL_LIST = BPE.default_special_tokens()
    TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
    # Token ids obtained by encoding all special tokens concatenated.
    SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))

    MASKER = SpannedMasker(TOKENANO.vocabulary_size, SPECIAL_TOKENS)

    # "prova" is Italian for "test": a sample abstract to encode.
    prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
    print(TOKENANO.encode(prova))
    # max_length = 256, batches of 8 CSV rows
    batcher = Batcher(DATASET_PATH,256, TOKENANO, MASKER)
    for batch in batcher.batch(8):
        print(batch)

View File

@ -1,33 +0,0 @@
class TokenCompletationTransformer:
    """Split a token sequence into an input part and a part to complete.

    The split happens at one of the occurrences of the SOTL token: a share
    of the SOTL-delimited groups (``input_percent``) becomes the input, the
    rest becomes the target.
    """

    def __init__(self, SOTL_token, EOS_token, input_percent: float = 0.5) -> None:
        """Store the marker tokens and the share of groups used as input.

        Args:
            SOTL_token: token id counted to locate the split point.
            EOS_token: token id appended at the end of the input part.
            input_percent: fraction of the SOTL occurrences (rounded down)
                that ends up in the input part.
        """
        self.__SOTL_token = SOTL_token
        self.__EOS_token = EOS_token
        self.__input_percent = input_percent

    def get_completation_tuple(
        self,
        token_sequence: list[int],
    ) -> tuple[list[int], list[int]]:
        """Return ``(x_list, y_list)`` for ``token_sequence``.

        With ``k = int(count(SOTL) * input_percent)``, the sequence is cut
        right before the k-th SOTL token: everything before it plus the EOS
        token is ``x_list``, everything from it onwards is ``y_list``.
        When ``k`` is not positive nothing is assigned to the input, so
        ``x_list`` is just ``[EOS]`` and ``y_list`` is the whole sequence.

        The argument is never modified; both results are new lists.

        Raises:
            IndexError: if ``input_percent`` is greater than 1, because more
                SOTL tokens are requested than the sequence contains.
        """
        sotl_count = int(
            token_sequence.count(self.__SOTL_token) * self.__input_percent
        )

        # Without this guard the index below would become -1 and the slice
        # would hand all tokens but the last one to the input part.
        if sotl_count <= 0:
            return ([self.__EOS_token], token_sequence[:])

        seen_sotl = 0
        cursor = 0
        while seen_sotl < sotl_count:
            if token_sequence[cursor] == self.__SOTL_token:
                seen_sotl += 1
            cursor += 1

        # ``cursor`` is one past the k-th SOTL token; step back onto it so
        # that the token opens the target part.
        split_index = cursor - 1

        x_list = token_sequence[:split_index]
        x_list.append(self.__EOS_token)
        y_list = token_sequence[split_index:]
        return (x_list, y_list)

View File

@ -1,2 +0,0 @@
from .Batcher import Batcher
from .TokenCompletation import TokenCompletationTransformer

View File

@ -1,8 +0,0 @@
from enum import Enum, auto
class TaskType(Enum):
    """Enumeration of the four task labels.

    Members are ``RDF2TXT``, ``TEXT2RDF``, ``MASKING`` and ``COMPLETATION``.
    """

    # Values come from ``auto()``, i.e. they follow declaration order.
    RDF2TXT = auto()
    TEXT2RDF = auto()
    MASKING = auto()
    COMPLETATION = auto()

View File

@ -1,5 +0,0 @@
from .TaskType import TaskType
__all__ = [
"TaskType"
]

View File

@ -1,5 +0,0 @@
from .Classes import *
from .Enums import *
from . import Classes
from . import Enums

View File

@ -1,23 +0,0 @@
import torch
from ..Utils import fixed_positional_encoding
# WIP FOR BATCHING
class NanoSocratesEmbedder(torch.nn.Module):
    """Token embedding followed by the addition of fixed positional encodings."""

    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        """Create the embedding table.

        Args:
            vocabulary_size: number of rows of the embedding table.
            embedding_size: size of each embedding vector.
        """
        super().__init__()
        self.__embedder = torch.nn.Embedding(vocabulary_size, embedding_size)

    def forward(self, tokenized_sentence: torch.Tensor) -> torch.Tensor:
        """Embed the token ids and add the positional encodings.

        Args:
            tokenized_sentence: token ids; the embedded result must be
                three-dimensional (batch, sentence, embedding), because its
                shape is unpacked into three values.

        Returns:
            The embeddings plus the positional encodings, broadcast over the
            first (batch) dimension.
        """
        computed_embeddings: torch.Tensor = self.__embedder(tokenized_sentence)

        _, SENTENCE_LENGHT, EMBEDDING_SIZE = computed_embeddings.shape  # for batching

        POSITIONAL_ENCODINGS = fixed_positional_encoding(
            SENTENCE_LENGHT, EMBEDDING_SIZE
        )
        # The encodings are built on the default device; move them next to
        # the embeddings, otherwise the sum fails when the module lives on
        # another device (a no-op when both already share the device).
        POSITIONAL_ENCODINGS = POSITIONAL_ENCODINGS.to(computed_embeddings.device)

        computed_embeddings = computed_embeddings + POSITIONAL_ENCODINGS  # for batching
        return computed_embeddings

View File

@ -1,5 +0,0 @@
from .NanoSocratesEmbedder import NanoSocratesEmbedder
__all__ = [
"NanoSocratesEmbedder"
]

View File

@ -1,5 +0,0 @@
from .fixed_positional_encoding import fixed_positional_encoding
__all__ = [
"fixed_positional_encoding"
]

View File

@ -1,28 +0,0 @@
import torch
def fixed_positional_encoding(
    sentence_dimension: int,
    embedding_dimension: int,
) -> torch.Tensor:
    """Build sinusoidal positional encodings.

    Column ``i`` holds ``sin(pos / 10000 ** (2 * (i // 2) / d))`` when ``i``
    is even and the cosine of the same argument when ``i`` is odd, with
    ``d = embedding_dimension`` and ``pos`` ranging over the positions.

    Args:
        sentence_dimension: number of positions (rows of the result).
        embedding_dimension: number of columns of the result.

    Returns:
        A tensor of shape ``(sentence_dimension, embedding_dimension)``.
    """
    base = int(1e4)
    positions = torch.tensor(list(range(sentence_dimension)))

    columns: list[torch.Tensor] = []
    for column in range(embedding_dimension):
        # Note: the original paper did not spell out that the exponent uses
        # the column index rounded down to an even number, hence ``// 2``.
        exponent = (2 * (column // 2)) / embedding_dimension
        scaled_positions = positions / (base ** exponent)

        wave = torch.sin if column % 2 == 0 else torch.cos
        columns.append(wave(scaled_positions))

    # Stacking gives (embedding, sentence); swap to (sentence, embedding).
    return torch.stack(columns).transpose(0, 1)

View File

@ -1,7 +0,0 @@
from .Utils import *
from .Classes import *
from . import Utils
from . import Classes

View File

@ -1,70 +0,0 @@
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
class Evaluator():
    """Collection of text-based and token-based evaluation helpers.

    Text metrics (ROUGE, BLEU, METEOR) are computed through the ``evaluate``
    package; the token-based helpers are private and still marked as TODO.
    """

    def __init__(self, pad_token: int = 7000) -> None:
        """Load the metrics.

        Args:
            pad_token: id of the padding token used to trim reference
                sequences. The default keeps the previously hard-coded value.
        """
        # text based evaluators
        self.__rouge = evaluate.load("rouge")
        # rougeLsum expects sentences separated by newlines, so it may score
        # poorly on text that is not split that way.
        self.__rouge_types = ["rougeLsum", "rouge1", "rouge2"]
        self._bleu = evaluate.load("bleu")
        self._meteor = evaluate.load("meteor")
        # token based evaluators
        self.__acc_m = evaluate.load("accuracy")
        self.__prec_m = evaluate.load("precision")
        self.__rec_m = evaluate.load("recall")
        self.__f1_m = evaluate.load("f1")
        self.__pad_token = pad_token

    def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]):
        """Return one aggregated score per configured ROUGE type.

        Returns:
            A dict with the keys "rougeLsum", "rouge1" and "rouge2".
        """
        results = self.__rouge.compute(
            predictions=preds, references=refs,
            rouge_types=self.__rouge_types,
            use_stemmer=True,
            use_aggregator=True  # F1
        )
        return {k: float(results[k]) for k in self.__rouge_types}

    def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
        """Return the "bleu" score of ``preds`` against ``refs``."""
        # Each prediction may be scored against several references, hence
        # every reference is wrapped in its own list: [[ref]].
        # NOTE(review): the metric loaded is "bleu", not "sacrebleu"; confirm
        # which scale the callers expect.
        results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
        return float(results["bleu"])

    def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
        """Return the "meteor" score of ``preds`` against ``refs``."""
        # References are wrapped the same way as for BLEU.
        res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
        return float(res["meteor"])

    def __my_accuracy(self, preds: list[list[int]], refs: list[list[int]]):
        """Return the share of predictions identical to their reference.

        The comparison is made on whole token sequences, not on single
        tokens. An empty batch scores 0.0 instead of dividing by zero.
        """
        total = len(preds)
        if total == 0:
            return 0.0
        correct = sum(int(p == r) for p, r in zip(preds, refs))
        return correct / total

    def __accuracy(self, preds, refs):
        """Return ``accuracy_score`` with the references as ground truth."""
        # scikit-learn expects (y_true, y_pred).
        return accuracy_score(refs, preds)

    def __clean_batch_by_pad(self, preds: list[list[int]], refs: list[list[int]]):
        """Cut every (prediction, reference) pair at the reference's first pad.

        Returns:
            ``(output_preds, output_refs)`` with each pair truncated to the
            index of the first pad token found in the reference; pairs whose
            reference has no pad token are cut to the reference length.
        """
        output_preds = []
        output_refs = []
        pad_token: int = self.__pad_token
        for pred, ref in zip(preds, refs):
            try:
                i = ref.index(pad_token)  # first time the pad token appears
            except ValueError:
                i = len(ref)
            output_preds.append(pred[:i])
            output_refs.append(ref[:i])

        return output_preds, output_refs

    def __precision_recall(self, preds: list[list[int]], refs: list[list[int]]):
        """Return precision, recall and F1 of ``preds`` against ``refs``."""
        # TODO: average="binary" expects flat binary labels; revisit before
        # using this with token sequences.
        # scikit-learn expects (y_true, y_pred): passing them the other way
        # round swaps precision and recall.
        p, r, f1, _ = precision_recall_fscore_support(
            refs, preds, average="binary", zero_division=0
        )
        return {"precision": float(p), "recall": float(r), "f1": float(f1)}

View File

@ -1,5 +0,0 @@
from .get_default_device import get_default_device
__all__ = [
"get_default_device"
]

View File

@ -1,17 +0,0 @@
import torch
def get_default_device() -> torch.device:
    """Return the preferred available device.

    The order of preference is "cuda", "xpu", "mps" and finally "cpu".
    Back-ends that the installed torch build does not expose are skipped
    instead of raising ``AttributeError``.
    """
    # CUDA or ROCm
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Intel GPUs (``torch.xpu`` is missing in some torch builds)
    xpu = getattr(torch, "xpu", None)
    if xpu is not None and xpu.is_available():
        return torch.device("xpu")

    # Apple GPUs (``torch.backends.mps`` is missing in some torch builds)
    mps = getattr(torch.backends, "mps", None)
    if mps is not None and mps.is_available():
        return torch.device("mps")

    return torch.device("cpu")

View File

@ -1,7 +0,0 @@
from .Utils import *
from .Utils import get_default_device
__all__ = [
"get_default_device"
]

View File

@ -1,43 +0,0 @@
import torch
class LogitsCollector:
    """Accumulate per-step logits and turn them into greedy token ids."""

    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
        """Remember the special ids and the tokenizer.

        Args:
            pad_token: id dropped from the sequences returned by ``tokens``.
            end_token: stored only; the stop-at-END logic is disabled.
            tokenizer: object exposing ``decode(list[int]) -> str``.
        """
        self.__pad_token = pad_token
        self.__end_token = end_token
        self.__tokenizer = tokenizer
        # One [B, V] tensor per recorded step.
        self.__steps: list[torch.Tensor] = []

    def reset(self) -> None:
        """Forget every recorded step."""
        self.__steps.clear()

    def add(self, logits_step: torch.Tensor) -> None:
        """Record the logits of one step.

        A three-dimensional input is reduced to its last position along
        dimension 1 (e.g. [B, 1, V] -> [B, V]); the tensor is detached
        before being stored.
        """
        step = logits_step[:, -1, :] if logits_step.dim() == 3 else logits_step
        self.__steps.append(step.detach())

    def tokens(self) -> list[list[int]]:
        """Return the greedy token ids per batch row, without PAD ids.

        Returns:
            One list per batch row holding the arg-max id of each recorded
            step; an empty list when nothing was recorded.
        """
        if len(self.__steps) == 0:
            return []

        stacked = torch.stack(self.__steps, dim=0)  # [T, B, V]
        probabilities = torch.softmax(stacked, dim=-1)  # over the vocabulary
        greedy_ids = probabilities.argmax(dim=-1).transpose(0, 1)  # [B, T]

        pad = self.__pad_token
        # END is intentionally kept: only PAD ids are filtered out.
        return [[tok for tok in row if tok != pad] for row in greedy_ids.tolist()]

    def print_decoded(self) -> None:
        """Print every collected sequence, decoded when possible."""
        for index, seq in enumerate(self.tokens()):
            try:
                text = self.__tokenizer.decode(seq)
            except Exception:
                # Best effort: show the raw ids when decoding fails.
                text = str(seq)
            print(f"[{index}] {text}")

View File

@ -1,20 +0,0 @@
import os
from pathlib import Path
class Log:
    """Append-only CSV log of per-epoch loss values."""

    def __init__(self, path):
        """Remember ``path`` and create the file with a header if needed.

        An existing file is left untouched, so logging can continue across
        runs.
        """
        self.path = path
        columns = ["epoch","avg_txt","avg_enc","avg_dec","txt_loss","masking_loss","prediction_loss"]

        if not Path(path).is_file():
            with open(self.path, "w", encoding="utf-8", newline="") as handle:
                handle.write(",".join(columns) + "\n")

    def write(self, loss: list[float]):
        """Append one row with the given values converted to ``float``."""
        row = ",".join(map(str, map(float, loss))) + "\n"
        with open(self.path, "a", encoding="utf-8", newline="") as handle:
            handle.write(row)
            handle.flush()
            # Force the row to disk so it survives a sudden crash.
            os.fsync(handle.fileno())

View File

@ -1,19 +0,0 @@
import torch
class DeToken(torch.nn.Module):
    """Linear projection from the embedding space to the vocabulary space."""

    def __init__(self, embedding_size: int, vocabulary_size: int) -> None:
        """Create the projection layer (embedding_size -> vocabulary_size)."""
        super().__init__()
        self.__linear = torch.nn.Linear(embedding_size, vocabulary_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw scores over the vocabulary for ``x``.

        No softmax is applied: the output is left unnormalized.
        """
        return self.__linear(x)

View File

@ -1,115 +0,0 @@
from typing import Optional
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
from ..Utils.attention_mask import get_causal_attention_mask, get_prefix_causal_mask_from_padding_mask
# B, L(T), E_D
class Decoder(nn.Module):
    """One transformer decoder layer.

    The layer applies masked self-attention, optional cross-attention and a
    position-wise feed-forward network; each step is followed by dropout, a
    residual connection and layer normalization.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        """Build the sub-layers.

        Args:
            embedding_dimension: size of the token representations.
            feed_forward_hidden_layer_dimension: hidden size of the
                feed-forward network.
            number_of_attention_heads: heads of both attention blocks.
        """
        super().__init__()

        self.__attention_heads = number_of_attention_heads

        self.__masked_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )

        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        self.__cross_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        self.__dropout = nn.Dropout(0.1)

        self.__feed_forward_network = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)

    def forward(
        self,
        args: tuple[
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            Optional[bool]
        ]
    ):  # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
        """Run the layer on one packed argument tuple.

        Args:
            args: ``(x, k_x, v_x, src_padding_mask, tgt_padding_mask,
                decoder_only)``. The last item may be omitted, in which case
                it defaults to ``False``.

        Returns:
            A tuple with the same six items, where ``x`` has been replaced by
            the output of the layer, so that layers can be chained.
        """
        # WARNING: a single tuple argument is needed so that the layers can
        # be chained sequentially.
        if len(args) < 6:
            # ``(False,)`` is a one-element tuple; ``(False)`` would be a bare
            # bool and the concatenation would raise TypeError.
            args = args + (False,)
        x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only = args

        # Build the attention mask.
        # TODO: create a prefix causal mask if needed
        if decoder_only:
            # NOTE(review): the original author notes that the target padding
            # mask would be the correct input here; the source one is used.
            attention_mask = get_prefix_causal_mask_from_padding_mask(x.size(1),src_padding_mask,self.__attention_heads)
        else:
            attention_mask = get_causal_attention_mask(x.size(1))

        # 1) Masked self-attention -> 2) dropout -> 3) residual -> 4) norm
        MASKED_ATTENTION = self.__masked_attention(
            x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=attention_mask
        )
        x = x + self.__dropout(MASKED_ATTENTION)
        del MASKED_ATTENTION
        x = self.__layer_norm_1(x)

        if not decoder_only:
            # 5) Encoder-decoder (cross) attention -> 6) dropout
            # -> 7) residual -> 8) norm
            CROSS_ATTENTION = self.__cross_attention(
                x, k_x, v_x, key_padding_mask=src_padding_mask
            )
            x = x + self.__dropout(CROSS_ATTENTION)
            del CROSS_ATTENTION
            x = self.__layer_norm_2(x)

        # 9) Position-wise feed-forward -> 10) dropout -> 11) residual
        # -> 12) norm
        FEED_FORWARD = self.__feed_forward_network(x)
        x = x + self.__dropout(FEED_FORWARD)
        del FEED_FORWARD
        x = self.__layer_norm_3(x)

        return (x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only)

    # use eval() to disable dropout etc.
# use eval to disable dropout ecc

View File

@ -1,73 +0,0 @@
import torch
import torch.nn as nn
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
TorchMultiHeadAttention as MultiHeadAttention,
)
class Encoder(nn.Module):
    """One transformer encoder layer.

    Subclassing ``nn.Module`` exposes the usual training primitives. The
    layer chains: self-attention -> dropout -> add & normalize ->
    feed-forward -> dropout -> add & normalize.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        """Build the sub-layers.

        Args:
            embedding_dimension: size of the token representations.
            feed_forward_hidden_layer_dimension: hidden size of the
                feed-forward network.
            number_of_attention_heads: heads of the attention block.
        """
        super().__init__()

        self.__attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        # Normalization of the first "add & normalize" step.
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
        self.__feed_forward = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        # Normalization of the second "add & normalize" step.
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)
        self.__dropout = nn.Dropout(0.1)

    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
        """Run the layer on ``(x, padding_mask)``.

        Returns:
            ``(x, padding_mask)`` with ``x`` replaced by the layer output, so
            that layers can be chained.
        """
        # WARNING: a single tuple argument is needed so that the layers can
        # be chained sequentially.
        x, padding_mask = args

        # Self-attention block with residual connection and normalization.
        attended = self.__attention(x, x, x, key_padding_mask=padding_mask)
        x = self.__layer_norm_1(x + self.__dropout(attended))

        # Feed-forward block with residual connection and normalization.
        transformed = self.__feed_forward(x)
        x = self.__layer_norm_2(x + self.__dropout(transformed))

        return (x, padding_mask)

    # use eval() to disable dropout etc.
# use eval to disable dropout ecc

Some files were not shown because too many files have changed in this diff Show More