From 8adacdb08c2c708c071d78ee975fdaaa652fb361 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 7 Oct 2025 20:45:04 +0200 Subject: [PATCH] Added new playgrounds --- Playgrounds/nanosocrates-sanity-check.ipynb | 4 +- Playgrounds/nanosocrates-train-toy.ipynb | 243 ++++++++++++++++++++ Playgrounds/prova.ipynb | 73 +++++- 3 files changed, 318 insertions(+), 2 deletions(-) create mode 100644 Playgrounds/nanosocrates-train-toy.ipynb diff --git a/Playgrounds/nanosocrates-sanity-check.ipynb b/Playgrounds/nanosocrates-sanity-check.ipynb index 7fd1ca7..ab56598 100644 --- a/Playgrounds/nanosocrates-sanity-check.ipynb +++ b/Playgrounds/nanosocrates-sanity-check.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "f5762da9", "metadata": {}, "outputs": [ @@ -127,6 +127,8 @@ "\n", "\n", "\n", + "\n", + "\n", "\n" ] } diff --git a/Playgrounds/nanosocrates-train-toy.ipynb b/Playgrounds/nanosocrates-train-toy.ipynb new file mode 100644 index 0000000..f69a836 --- /dev/null +++ b/Playgrounds/nanosocrates-train-toy.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "adbd9598", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n", + " return func(*args, **kwargs)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPOCH 0\n", + "\tCurrent Loss: 8.951058387756348\n", + "EPOCH 10\n", + "\tCurrent Loss: 8.913984298706055\n", + "EPOCH 20\n", + "\tCurrent Loss: 8.911956787109375\n", + "EPOCH 30\n", + "\tCurrent Loss: 8.911856651306152\n", + "EPOCH 40\n", + "\tCurrent Loss: 8.911840438842773\n", + "EPOCH 50\n", + "\tCurrent Loss: 8.911835670471191\n", + "EPOCH 60\n", + "\tCurrent Loss: 8.911831855773926\n", + "EPOCH 70\n", + "\tCurrent Loss: 8.91179084777832\n", + "EPOCH 80\n", + "\tCurrent Loss: 8.899038314819336\n", + "EPOCH 90\n", + "\tCurrent Loss: 8.898558616638184\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 133\u001b[39m\n\u001b[32m 130\u001b[39m LOSS_HISTORY.append(loss)\n\u001b[32m 132\u001b[39m loss.backward()\n\u001b[32m--> \u001b[39m\u001b[32m133\u001b[39m \u001b[43moptimizer\u001b[49m\u001b[43m.\u001b[49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 134\u001b[39m \u001b[38;5;66;03m# scheduler.step()\u001b[39;00m\n\u001b[32m 136\u001b[39m most_probable_tokens = torch.argmax(logits, \u001b[32m2\u001b[39m)\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\optimizer.py:516\u001b[39m, in \u001b[36mOptimizer.profile_hook_step..wrapper\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 511\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 512\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[32m 513\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m must return None or a tuple of (new_args, new_kwargs), but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 514\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m516\u001b[39m out = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 517\u001b[39m \u001b[38;5;28mself\u001b[39m._optimizer_step_code()\n\u001b[32m 519\u001b[39m \u001b[38;5;66;03m# call optimizer step post hooks\u001b[39;00m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\optimizer.py:81\u001b[39m, in \u001b[36m_use_grad_for_differentiable.._use_grad\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 79\u001b[39m torch.set_grad_enabled(\u001b[38;5;28mself\u001b[39m.defaults[\u001b[33m\"\u001b[39m\u001b[33mdifferentiable\u001b[39m\u001b[33m\"\u001b[39m])\n\u001b[32m 80\u001b[39m torch._dynamo.graph_break()\n\u001b[32m---> \u001b[39m\u001b[32m81\u001b[39m ret = \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 82\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 83\u001b[39m torch._dynamo.graph_break()\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\adam.py:247\u001b[39m, in \u001b[36mAdam.step\u001b[39m\u001b[34m(self, closure)\u001b[39m\n\u001b[32m 235\u001b[39m beta1, beta2 = group[\u001b[33m\"\u001b[39m\u001b[33mbetas\u001b[39m\u001b[33m\"\u001b[39m]\n\u001b[32m 237\u001b[39m has_complex = \u001b[38;5;28mself\u001b[39m._init_group(\n\u001b[32m 238\u001b[39m group,\n\u001b[32m 239\u001b[39m params_with_grad,\n\u001b[32m (...)\u001b[39m\u001b[32m 244\u001b[39m state_steps,\n\u001b[32m 245\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m247\u001b[39m \u001b[43madam\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 248\u001b[39m \u001b[43m \u001b[49m\u001b[43mparams_with_grad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 249\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 250\u001b[39m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 251\u001b[39m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 252\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 253\u001b[39m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 254\u001b[39m \u001b[43m \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mamsgrad\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 255\u001b[39m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 256\u001b[39m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 257\u001b[39m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 258\u001b[39m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mlr\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 259\u001b[39m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mweight_decay\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 260\u001b[39m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43meps\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 261\u001b[39m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mmaximize\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 262\u001b[39m \u001b[43m \u001b[49m\u001b[43mforeach\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mforeach\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 263\u001b[39m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mcapturable\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 264\u001b[39m \u001b[43m \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdifferentiable\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 265\u001b[39m \u001b[43m \u001b[49m\u001b[43mfused\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfused\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 266\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mgrad_scale\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 267\u001b[39m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mfound_inf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 268\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecoupled_weight_decay\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mdecoupled_weight_decay\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 269\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 271\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\optimizer.py:149\u001b[39m, in \u001b[36m_disable_dynamo_if_unsupported..wrapper..maybe_fallback\u001b[39m\u001b[34m(*args, **kwargs)\u001b[39m\n\u001b[32m 147\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m disabled_func(*args, **kwargs)\n\u001b[32m 148\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m149\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\adam.py:949\u001b[39m, in \u001b[36madam\u001b[39m\u001b[34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, decoupled_weight_decay, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[39m\n\u001b[32m 946\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 947\u001b[39m func = _single_tensor_adam\n\u001b[32m--> \u001b[39m\u001b[32m949\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 950\u001b[39m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 951\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 952\u001b[39m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 953\u001b[39m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 954\u001b[39m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 955\u001b[39m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 956\u001b[39m \u001b[43m \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[43m=\u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 957\u001b[39m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[43m=\u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 958\u001b[39m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 959\u001b[39m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[43m=\u001b[49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 960\u001b[39m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[43m=\u001b[49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 961\u001b[39m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[43m=\u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 962\u001b[39m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[43m=\u001b[49m\u001b[43meps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 963\u001b[39m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[43m=\u001b[49m\u001b[43mmaximize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 964\u001b[39m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcapturable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 965\u001b[39m \u001b[43m \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 966\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 967\u001b[39m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[43m=\u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 968\u001b[39m \u001b[43m \u001b[49m\u001b[43mdecoupled_weight_decay\u001b[49m\u001b[43m=\u001b[49m\u001b[43mdecoupled_weight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 969\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\adam.py:755\u001b[39m, in \u001b[36m_multi_tensor_adam\u001b[39m\u001b[34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, has_complex, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable, decoupled_weight_decay)\u001b[39m\n\u001b[32m 752\u001b[39m torch._foreach_addcdiv_(device_params, device_exp_avgs, exp_avg_sq_sqrt)\n\u001b[32m 753\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 754\u001b[39m bias_correction1 = [\n\u001b[32m--> \u001b[39m\u001b[32m755\u001b[39m \u001b[32m1\u001b[39m - beta1 ** \u001b[43m_get_value\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstep\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mfor\u001b[39;00m step \u001b[38;5;129;01min\u001b[39;00m device_state_steps\n\u001b[32m 756\u001b[39m ]\n\u001b[32m 757\u001b[39m bias_correction2 = [\n\u001b[32m 758\u001b[39m \u001b[32m1\u001b[39m - beta2 ** _get_value(step) \u001b[38;5;28;01mfor\u001b[39;00m step \u001b[38;5;129;01min\u001b[39;00m device_state_steps\n\u001b[32m 759\u001b[39m ]\n\u001b[32m 761\u001b[39m step_size = _stack_if_compiling([(lr / bc) * -\u001b[32m1\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m bc \u001b[38;5;129;01min\u001b[39;00m bias_correction1])\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\optimizer.py:96\u001b[39m, in \u001b[36m_get_value\u001b[39m\u001b[34m(x)\u001b[39m\n\u001b[32m 94\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m x\n\u001b[32m 95\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m---> \u001b[39m\u001b[32m96\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mx\u001b[49m\u001b[43m.\u001b[49m\u001b[43mitem\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(x, torch.Tensor) \u001b[38;5;28;01melse\u001b[39;00m x\n", + "\u001b[36mFile \u001b[39m\u001b[32mc:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103\u001b[39m, in \u001b[36mDeviceContext.__torch_function__\u001b[39m\u001b[34m(self, func, types, args, kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m _device_constructors() \u001b[38;5;129;01mand\u001b[39;00m kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 102\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mself\u001b[39m.device\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "import random\n", + "import torch\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import Project_Model.Libs.Embedder as Embedder\n", + "import Project_Model.Libs.BPE as BPE\n", + "import Project_Model.Libs.Transformer as Transformer\n", + "import Project_Model.Libs.TorchShims as torch_shims\n", + "\n", + "# set a fixed seed\n", + "torch.manual_seed(0)\n", + "random.seed(0)\n", + "DEVICE = torch_shims.get_default_device()\n", + "torch.set_default_device(DEVICE)\n", + "\n", + "# set a default device\n", + "\n", + "# BPE Init\n", + "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n", + "SPECIAL_VOC = BPE.default_special_tokens()\n", + "\n", + "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n", + "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n", + "\n", + "\n", + "# Constants\n", + "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n", + "EMBEDDED_SIZE = 256\n", + "FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n", + "ATTENTION_HEADS = 4\n", + "SENTENCE_LENGTH = 256\n", + "NUMBER_OF_BLOCKS = 2\n", + "MAX_EPOCHS = int(1e3)\n", + "\n", + "\n", + "# Model Init\n", + "ENCODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n", + "DECODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n", + "\n", + "ENCODERS = [\n", + " Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, ATTENTION_HEADS)\n", + "] * NUMBER_OF_BLOCKS\n", + "ENCODER = torch.nn.Sequential(*ENCODERS)\n", + "\n", + "DECODERS = [\n", + " Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, ATTENTION_HEADS)\n", + "] * NUMBER_OF_BLOCKS\n", + "DECODER = torch.nn.Sequential(*DECODERS)\n", + "\n", + "DETOKENER = Transformer.DeToken(EMBEDDED_SIZE, TOKEN_SPACE_SIZE)\n", + "\n", + "PAD_TOKEN = TOKENANO.encode(\"\")[0]\n", + "END_TOKEN = TOKENANO.encode(\"\")[0]\n", + "\n", + "\n", + "# Load CSV\n", + "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n", + "\n", + "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n", + "\n", + "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n", + "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n", + "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n", + "TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n", + "\n", + "\n", + "for index, row in TOY_DATASET.iterrows():\n", + "\n", + " RDFs: str = row[\"RDFs\"]\n", + " Abstract: str = row[\"Abstract\"]\n", + "\n", + " input_tokens = TOKENANO.encode(RDFs)\n", + " output_tokens = TOKENANO.encode(Abstract)\n", + " decoder_default_tokens = TOKENANO.encode(\"\")\n", + "\n", + " input_tokens, padding = Transformer.normalize_sequence(\n", + " input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " )\n", + " output_tokens, _ = Transformer.normalize_sequence(\n", + " output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " )\n", + " decoder_default_tokens, _ = Transformer.normalize_sequence(\n", + " decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " )\n", + "\n", + " TOY_BATCH_INPUT_LIST.append(input_tokens)\n", + " TOY_BATCH_PADDING_LIST.append(padding)\n", + " TOY_BATCH_TARGET_LIST.append(output_tokens)\n", + " TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n", + "\n", + "# Training loop\n", + "LOSS_HISTORY = []\n", + "NANOSOCRATES = torch.nn.ModuleList([\n", + " ENCODER_EMBEDDER,\n", + " ENCODER,\n", + " DECODER_EMBEDDER,\n", + " DECODER,\n", + " DETOKENER\n", + "])\n", + "cross_entropy = torch.nn.CrossEntropyLoss()\n", + "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n", + "# scheduler = torch.optim.lr_scheduler.LRScheduler(optimizer)\n", + "last_loss = 0\n", + "current_epoch = 0\n", + "\n", + "while current_epoch < MAX_EPOCHS:\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " INPUT_LIST = TOY_BATCH_INPUT_LIST[:]\n", + " TARGET_LIST = TOY_BATCH_TARGET_LIST[:]\n", + " # Transform target into logits\n", + " target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])\n", + " DECODER_DEFAULT_LIST = TOY_BATCH_DECODER_DEFAULT[:]\n", + " PADDINGS = torch.tensor(TOY_BATCH_PADDING_LIST, dtype=torch.bool)\n", + " ENCODER_INPUTS = ENCODER_EMBEDDER(INPUT_LIST)\n", + " DECODER_INPUTS = DECODER_EMBEDDER(DECODER_DEFAULT_LIST)\n", + "\n", + " for _ in range(0, SENTENCE_LENGTH):\n", + "\n", + " optimizer.zero_grad()\n", + "\n", + " \n", + "\n", + " encoder_output, _ = ENCODER((ENCODER_INPUTS, PADDINGS))\n", + "\n", + " decoder_output, _, _, _ = DECODER(\n", + " (DECODER_INPUTS, encoder_output, encoder_output, None)\n", + " )\n", + "\n", + " logits: torch.Tensor = DETOKENER(decoder_output)\n", + " logits = logits.permute(0, 2, 1)\n", + "\n", + "\n", + " loss: torch.Tensor= cross_entropy(logits, target_logits)\n", + " last_loss = loss\n", + " LOSS_HISTORY.append(loss)\n", + "\n", + " loss.backward()\n", + " optimizer.step()\n", + " # scheduler.step()\n", + "\n", + " most_probable_tokens = torch.argmax(logits, 2)\n", + "\n", + " if current_epoch % 10 == 0:\n", + "\n", + " print(f\"EPOCH {current_epoch}\\n\\tCurrent Loss: {last_loss}\")\n", + "\n", + " current_epoch += 1\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deep_learning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Playgrounds/prova.ipynb b/Playgrounds/prova.ipynb index a4996bb..60f4ef5 100644 --- a/Playgrounds/prova.ipynb +++ b/Playgrounds/prova.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "4ae47336", "metadata": {}, "outputs": [], @@ -15,6 +15,77 @@ "mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n", "y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n" ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e38e3fb5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n", + " [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],\n", + "\n", + " [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n", + " [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.nn.functional.one_hot(torch.tensor([\n", + " [4, 1, 9],\n", + " [2,4,5]\n", + "]))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7119ad53", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "device(type='cpu')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "torch.get_default_device()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8c95691a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xpu\n" + ] + } + ], + "source": [ + "from Project_Model.Libs.TorchShims import get_default_device\n", + "\n", + "print(get_default_device())" + ] } ], "metadata": {