diff --git a/Playgrounds/doctor.ipynb b/Playgrounds/doctor.ipynb new file mode 100644 index 0000000..75b45f9 --- /dev/null +++ b/Playgrounds/doctor.ipynb @@ -0,0 +1,196 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ddfb4457", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPOCH 1\n", + "\tLoss: 7.424792\n", + "[0] \n", + "[1] \n", + "[2] \n", + "[3] \n", + "[4] \n", + "[5] \n", + "[6] \n", + "[7] \n", + "[8] \n", + "[9] \n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "---------------------------------------------------------------------------", + "KeyboardInterrupt                         Traceback (most recent call last)", + "Cell In[1], line 114", + "    112 loss_t = cross_entropy(logits_t, tgt[:, t])  # CE expects raw logits; PAD ignored", + "    113 loss_t.backward()  # backprop for this step", + "--> 114 optimizer.step()  # update params", + "    115 scheduler.step()  # Noam/warmup: step per optimizer step", + "[... torch/optim internals elided: Optimizer.step -> Adam.step -> _single_tensor_adam ...]", + "KeyboardInterrupt: " + ] + } + ], + "source": [ + "import random\n", + "import torch\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import Project_Model.Libs.Embedder as Embedder\n", + "import Project_Model.Libs.BPE as BPE\n", + "import Project_Model.Libs.Transformer as Transformer\n", + "import Project_Model.Libs.TorchShims as torch_shims\n", + "from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr\n", + "from Project_Model.Libs.Training.logistic_collector import LogitsCollector # import the external collector\n", + "\n", + "# set a fixed seed\n", + "torch.manual_seed(0)\n", + "random.seed(0)\n", + "DEVICE = torch_shims.get_default_device()\n", + "torch.set_default_device(DEVICE)\n", + "\n", + "# BPE Init\n", + "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n", + "SPECIAL_VOC = BPE.default_special_tokens()\n", + "\n", + "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n", + "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n", + "\n", + "# Constants\n", + "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n", + "EMBEDDED_SIZE = 256\n", + "FEED_FORWARD_MULTIPLIER = 4\n", + "ATTENTION_HEADS = 4\n", + "SENTENCE_LENGTH = 256\n", + "NUMBER_OF_BLOCKS = 2\n", + "MAX_EPOCHS = int(1e3)\n", + "\n", + "PAD_TOKEN = TOKENANO.encode(\"\")[0]\n", + "END_TOKEN = TOKENANO.encode(\"\")[0]\n", + "\n", + "# Load CSV\n", + "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n", + "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n", + "\n", + "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n", + "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n", + "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n", + "TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n", + "\n", + "for index, row in TOY_DATASET.iterrows():\n", + " RDFs: str = row[\"RDFs\"]\n", + " Abstract: str = 
row[\"Abstract\"]\n", + "\n", + " input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n", + " output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n", + " decoder_default_tokens = TOKENANO.encode(\"\") # decoder input starts with \n", + "\n", + " input_tokens, padding = Transformer.normalize_sequence(\n", + " input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " ) # pad/trim + end token\n", + " output_tokens, _ = Transformer.normalize_sequence(\n", + " output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " ) # pad/trim + end token\n", + " decoder_default_tokens = Transformer.pad_sequence(\n", + " decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n", + " ) # pad with PAD up to SENTENCE_LENGTH\n", + "\n", + " TOY_BATCH_INPUT_LIST.append(input_tokens)\n", + " TOY_BATCH_PADDING_LIST.append(padding)\n", + " TOY_BATCH_TARGET_LIST.append(output_tokens)\n", + " TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n", + "\n", + "# Training loop\n", + "LOSS_HISTORY = []\n", + "NANOSOCRATES = Transformer.TrainingModel(\n", + " TOKEN_SPACE_SIZE,\n", + " EMBEDDED_SIZE,\n", + " FEED_FORWARD_MULTIPLIER,\n", + " ATTENTION_HEADS,\n", + " NUMBER_OF_BLOCKS,\n", + ")\n", + "\n", + "collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n", + "\n", + "NANOSOCRATES.train()\n", + "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n", + "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n", + "scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step\n", + "\n", + "current_epoch = 0\n", + "BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n", + "\n", + "while current_epoch < MAX_EPOCHS:\n", + " # simple fixed mini-batch from the top; later you can shuffle/slice\n", + " enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n", + " pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n", + " tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n", + "\n", + " # decoder prefix buffer: at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n", + " dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n", + "\n", + " total_loss = 0.0\n", + " collector.reset() # start fresh for this epoch\n", + "\n", + " T = tgt.size(1) # sequence length\n", + " for t in range(T):\n", + " optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n", + "\n", + " prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n", + " dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n", + "\n", + " # one-step logits given prefix (trainer model expects 4 args now)\n", + " logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t\n", + " collector.add(logits_t) # store logits for decoding later\n", + "\n", + " loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored\n", + " loss_t.backward() # backprop for this step\n", + " optimizer.step() # update params\n", + " scheduler.step() # Noam/warmup: step per optimizer step\n", + "\n", + " total_loss = float(loss_t.detach()) # keep last step loss for logging\n", + "\n", + " # teacher forcing: reveal the correct token for next position\n", + " if t < T - 1:\n", + " dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n", + "\n", + " 
current_epoch += 1\n", + " print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n", + " collector.print_decoded() # print decoded predictions for the batch\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deep_learning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Playgrounds/doctor.py b/Playgrounds/doctor.py new file mode 100644 index 0000000..a201eaf --- /dev/null +++ b/Playgrounds/doctor.py @@ -0,0 +1,125 @@ +import random +import torch +import pandas as pd +from pathlib import Path +import Project_Model.Libs.Embedder as Embedder +import Project_Model.Libs.BPE as BPE +import Project_Model.Libs.Transformer as Transformer +import Project_Model.Libs.TorchShims as torch_shims +from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr +from Project_Model.Libs.Training.logistic_collector import LogitsCollector # import the external collector + +# set a fixed seed +torch.manual_seed(0) +random.seed(0) +DEVICE = torch_shims.get_default_device() +torch.set_default_device(DEVICE) + +# BPE Init +VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json") +SPECIAL_VOC = BPE.default_special_tokens() + +VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH) +TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC) + +# Constants +TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1 +EMBEDDED_SIZE = 256 +FEED_FORWARD_MULTIPLIER = 4 +ATTENTION_HEADS = 4 +SENTENCE_LENGTH = 256 +NUMBER_OF_BLOCKS = 2 +MAX_EPOCHS = int(1e3) + +PAD_TOKEN = TOKENANO.encode("")[0] +END_TOKEN = TOKENANO.encode("")[0] + +# Load CSV +TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv") +TOY_DATASET = pd.read_csv(TOY_DATASET_PATH) + +TOY_BATCH_INPUT_LIST: list[list[int]] = [] +TOY_BATCH_PADDING_LIST: list[list[bool]] = [] +TOY_BATCH_TARGET_LIST: list[list[int]] = [] +TOY_BATCH_DECODER_DEFAULT: list[list[int]] = [] + +for index, row in TOY_DATASET.iterrows(): + RDFs: str = row["RDFs"] + Abstract: str = row["Abstract"] + + input_tokens = TOKENANO.encode(RDFs) # encoder input ids + output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left) + decoder_default_tokens = TOKENANO.encode("") # decoder input starts with + + input_tokens, padding = Transformer.normalize_sequence( + input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN + ) # pad/trim + end token + output_tokens, _ = Transformer.normalize_sequence( + output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN + ) # pad/trim + end token + decoder_default_tokens = Transformer.pad_sequence( + decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN + ) # pad with PAD up to SENTENCE_LENGTH + + TOY_BATCH_INPUT_LIST.append(input_tokens) + TOY_BATCH_PADDING_LIST.append(padding) + TOY_BATCH_TARGET_LIST.append(output_tokens) + TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens) + +# Training loop +LOSS_HISTORY = [] +NANOSOCRATES = Transformer.TrainingModel( + TOKEN_SPACE_SIZE, + EMBEDDED_SIZE, + FEED_FORWARD_MULTIPLIER, + ATTENTION_HEADS, + NUMBER_OF_BLOCKS, +) + +collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes + +NANOSOCRATES.train() +cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN) +optimizer = 
torch.optim.AdamW(NANOSOCRATES.parameters()) +scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step + +current_epoch = 0 +BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize + +while current_epoch < MAX_EPOCHS: + # simple fixed mini-batch from the top; later you can shuffle/slice + enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids + pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present + tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth) + + # decoder prefix buffer: at pos 0, PAD elsewhere (no shift here) # we will fill it step by step + dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T] + + total_loss = 0.0 + collector.reset() # start fresh for this epoch + + T = tgt.size(1) # sequence length + for t in range(T): + optimizer.zero_grad(set_to_none=True) # clear grads for this token step + + prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix + dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix + + # one-step logits given prefix (trainer model expects 4 args now) + logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t + collector.add(logits_t) # store logits for decoding later + + loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored + loss_t.backward() # backprop for this step + optimizer.step() # update params + scheduler.step() # Noam/warmup: step per optimizer step + + total_loss = float(loss_t.detach()) # keep last step loss for logging + + # teacher forcing: reveal the correct token for next position + if t < T - 1: + dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot + + current_epoch += 1 + print(f"EPOCH {current_epoch}\n\tLoss: {total_loss:.6f}") # simple log + collector.print_decoded() # print decoded predictions for the batch diff --git a/Playgrounds/locistic_test.ipynb b/Playgrounds/locistic_test.ipynb new file mode 100644 index 0000000..8bcc721 --- /dev/null +++ b/Playgrounds/locistic_test.ipynb @@ -0,0 +1,221 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "c8741a8f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPOCH 1\n", + "\tLoss: 7.424792\n", + "[0] \n", + "[1] \n", + "[2] \n", + "[3] \n", + "[4] \n", + "[5] \n", + "[6] \n", + "[7] \n", + "[8] \n", + "[9] \n" + ] + } + ], + "source": [ + "import random\n", + "import torch\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import Project_Model.Libs.Embedder as Embedder\n", + "import Project_Model.Libs.BPE as BPE\n", + "import Project_Model.Libs.Transformer as Transformer\n", + "import Project_Model.Libs.TorchShims as torch_shims\n", + "from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr\n", + "\n", + "import torch\n", + "\n", + "class LogitsCollector:\n", + " def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:\n", + " self.__pad_token = pad_token # used to skip PAD\n", + " self.__end_token = end_token # used to stop at END\n", + " self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str\n", + " self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]\n", + "\n", + " def reset(self) -> None:\n", + " self.__steps.clear() # clear history\n", + "\n", + " def add(self, logits_step: torch.Tensor) -> None:\n", + " if 
logits_step.dim() == 3: # handle [B,1,V]\n", + " logits_step = logits_step[:, -1, :] # -> [B,V]\n", + " self.__steps.append(logits_step.detach()) # store raw logits (detached)\n", + "\n", + " def tokens(self) -> list[list[int]]:\n", + " if not self.__steps:\n", + " return []\n", + " stack = torch.stack(self.__steps, dim=0) # [T,B,V]\n", + " probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]\n", + " ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]\n", + " out: list[list[int]] = []\n", + " for row in ids.tolist():\n", + " seq: list[int] = []\n", + " for tok in row:\n", + " if tok == self.__end_token: # stop on END\n", + " break\n", + " if tok == self.__pad_token: # skip PAD\n", + " continue\n", + " seq.append(tok)\n", + " out.append(seq)\n", + " return out\n", + "\n", + " def print_decoded(self) -> None:\n", + " for i, seq in enumerate(self.tokens()):\n", + " try:\n", + " text = self.__tokenizer.decode(seq) # decode tokens to string\n", + " except Exception:\n", + " text = str(seq) # fallback to ids\n", + " print(f\"[{i}] {text}\") # simple print\n", + "\n", + "\n", + "# set a fixed seed\n", + "torch.manual_seed(0)\n", + "random.seed(0)\n", + "DEVICE = torch_shims.get_default_device()\n", + "torch.set_default_device(DEVICE)\n", + "\n", + "# BPE Init\n", + "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n", + "SPECIAL_VOC = BPE.default_special_tokens()\n", + "\n", + "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n", + "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n", + "\n", + "# Constants\n", + "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n", + "EMBEDDED_SIZE = 256\n", + "FEED_FORWARD_MULTIPLIER = 4\n", + "ATTENTION_HEADS = 4\n", + "SENTENCE_LENGTH = 256\n", + "NUMBER_OF_BLOCKS = 2\n", + "MAX_EPOCHS = int(1e3)\n", + "\n", + "PAD_TOKEN = TOKENANO.encode(\"\")[0]\n", + "END_TOKEN = TOKENANO.encode(\"\")[0]\n", + "\n", + "# Load CSV\n", + "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n", + "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n", + "\n", + "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n", + "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n", + "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n", + "TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n", + "\n", + "for index, row in TOY_DATASET.iterrows():\n", + " RDFs: str = row[\"RDFs\"]\n", + " Abstract: str = row[\"Abstract\"]\n", + "\n", + " input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n", + " output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n", + " decoder_default_tokens = TOKENANO.encode(\"\") # decoder input starts with \n", + "\n", + " input_tokens, padding = Transformer.normalize_sequence(\n", + " input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " ) # pad/trim + end token\n", + " output_tokens, _ = Transformer.normalize_sequence(\n", + " output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " ) # pad/trim + end token\n", + " decoder_default_tokens = Transformer.pad_sequence(\n", + " decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n", + " ) # pad with PAD up to SENTENCE_LENGTH\n", + "\n", + " TOY_BATCH_INPUT_LIST.append(input_tokens)\n", + " TOY_BATCH_PADDING_LIST.append(padding)\n", + " TOY_BATCH_TARGET_LIST.append(output_tokens)\n", + " TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n", + "\n", + "# Training loop\n", + "LOSS_HISTORY = []\n", + "NANOSOCRATES = Transformer.TrainingModel(\n", + " TOKEN_SPACE_SIZE,\n", + " EMBEDDED_SIZE,\n", + " FEED_FORWARD_MULTIPLIER,\n", 
+ " ATTENTION_HEADS,\n", + " NUMBER_OF_BLOCKS,\n", + ")\n", + "\n", + "collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n", + "\n", + "NANOSOCRATES.train()\n", + "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n", + "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n", + "scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step\n", + "\n", + "current_epoch = 0\n", + "BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n", + "\n", + "while current_epoch < MAX_EPOCHS:\n", + " # simple fixed mini-batch from the top; later you can shuffle/slice\n", + " enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n", + " pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n", + " tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n", + "\n", + " # decoder prefix buffer: at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n", + " dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n", + "\n", + " total_loss = 0.0\n", + " collector.reset() # start fresh for this epoch\n", + "\n", + " T = tgt.size(1) # sequence length\n", + " for t in range(T):\n", + " optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n", + "\n", + " prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n", + " dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n", + "\n", + " # one-step logits given prefix (trainer model expects 4 args now)\n", + " logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t\n", + " collector.add(logits_t) # store logits for decoding later\n", + "\n", + " loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored\n", + " loss_t.backward() # backprop for this step\n", + " optimizer.step() # update params\n", + " scheduler.step() # Noam/warmup: step per optimizer step\n", + "\n", + " total_loss = float(loss_t.detach()) # keep last step loss for logging\n", + "\n", + " # teacher forcing: reveal the correct token for next position\n", + " if t < T - 1:\n", + " dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n", + "\n", + " current_epoch += 1\n", + " print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n", + " collector.print_decoded() # print decoded predictions for the batch\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deep_learning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Playgrounds/model-teacher-forcing.ipynb b/Playgrounds/model-teacher-forcing.ipynb new file mode 100644 index 0000000..dc8a0b0 --- /dev/null +++ b/Playgrounds/model-teacher-forcing.ipynb @@ -0,0 +1,205 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "0afbf498", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "EPOCH 1\n", + "\tLoss: 9.174470901489258\n", + "EPOCH 2\n", + "\tLoss: 9.20919132232666\n", + "EPOCH 3\n", + "\tLoss: 9.227106094360352\n", + 
"EPOCH 4\n", + "\tLoss: 9.172086715698242\n", + "EPOCH 5\n", + "\tLoss: 9.180150985717773\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 116\u001b[39m\n\u001b[32m 113\u001b[39m step_target = target_logits[:, i] \u001b[38;5;66;03m# [B]\u001b[39;00m\n\u001b[32m 115\u001b[39m loss = cross_entropy(step_logits,step_target) \u001b[38;5;66;03m# now loss is without softmax\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m116\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\u001b[39;00m\n\u001b[32m 117\u001b[39m last_loss = loss\n\u001b[32m 118\u001b[39m optimizer.step()\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:638\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 595\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Computes the gradient of current tensor wrt graph leaves.\u001b[39;00m\n\u001b[32m 596\u001b[39m \n\u001b[32m 597\u001b[39m \u001b[33;03mThe graph is differentiated using the chain rule. If the tensor is\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 635\u001b[39m \u001b[33;03m used to compute the :attr:`tensors`. Defaults to ``None``.\u001b[39;00m\n\u001b[32m 636\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhandle_torch_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43mTensor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 641\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 642\u001b[39m \u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 643\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 644\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 647\u001b[39m torch.autograd.backward(\n\u001b[32m 648\u001b[39m \u001b[38;5;28mself\u001b[39m, gradient, retain_graph, create_graph, inputs=inputs\n\u001b[32m 649\u001b[39m )\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/overrides.py:1725\u001b[39m, in \u001b[36mhandle_torch_function\u001b[39m\u001b[34m(public_api, 
relevant_args, *args, **kwargs)\u001b[39m\n\u001b[32m 1721\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _is_torch_function_mode_enabled():\n\u001b[32m 1722\u001b[39m \u001b[38;5;66;03m# if we're here, the mode must be set to a TorchFunctionStackMode\u001b[39;00m\n\u001b[32m 1723\u001b[39m \u001b[38;5;66;03m# this unsets it and calls directly into TorchFunctionStackMode's torch function\u001b[39;00m\n\u001b[32m 1724\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _pop_mode_temporarily() \u001b[38;5;28;01mas\u001b[39;00m mode:\n\u001b[32m-> \u001b[39m\u001b[32m1725\u001b[39m result = \u001b[43mmode\u001b[49m\u001b[43m.\u001b[49m\u001b[43m__torch_function__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpublic_api\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1726\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m:\n\u001b[32m 1727\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/utils/_device.py:103\u001b[39m, in \u001b[36mDeviceContext.__torch_function__\u001b[39m\u001b[34m(self, func, types, args, kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m _device_constructors() \u001b[38;5;129;01mand\u001b[39;00m kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 102\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mself\u001b[39m.device\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:647\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 639\u001b[39m Tensor.backward,\n\u001b[32m 640\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 645\u001b[39m inputs=inputs,\n\u001b[32m 646\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m647\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + 
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/__init__.py:354\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 349\u001b[39m retain_graph = create_graph\n\u001b[32m 351\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m354\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/graph.py:829\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 827\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 828\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m829\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 830\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 831\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 832\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 833\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n", + "\u001b[31mKeyboardInterrupt\u001b[39m: " + ] + } + ], + "source": [ + "import random\n", + "import torch\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "import Project_Model.Libs.Embedder as Embedder\n", + "import Project_Model.Libs.BPE as BPE\n", + "import Project_Model.Libs.Transformer as Transformer\n", + "import Project_Model.Libs.TorchShims as torch_shims\n", + "\n", + "# set a fixed seed\n", + "torch.manual_seed(0)\n", + "random.seed(0)\n", + "DEVICE = torch_shims.get_default_device()\n", + 
"torch.set_default_device(DEVICE)\n", + "\n", + "# set a default device\n", + "\n", + "# BPE Init\n", + "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n", + "SPECIAL_VOC = BPE.default_special_tokens()\n", + "\n", + "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n", + "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n", + "\n", + "\n", + "# Constants\n", + "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n", + "EMBEDDED_SIZE = 256\n", + "FEED_FORWARD_MULTIPLIER = 4\n", + "ATTENTION_HEADS = 4\n", + "SENTENCE_LENGTH = 256\n", + "NUMBER_OF_BLOCKS = 2\n", + "MAX_EPOCHS = int(1e3)\n", + "\n", + "\n", + "PAD_TOKEN = TOKENANO.encode(\"\")[0]\n", + "END_TOKEN = TOKENANO.encode(\"\")[0]\n", + "\n", + "\n", + "# Load CSV\n", + "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n", + "\n", + "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n", + "\n", + "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n", + "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n", + "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n", + "TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n", + "\n", + "\n", + "for index, row in TOY_DATASET.iterrows():\n", + "\n", + " RDFs: str = row[\"RDFs\"]\n", + " Abstract: str = row[\"Abstract\"]\n", + "\n", + " input_tokens = TOKENANO.encode(RDFs)\n", + " output_tokens = TOKENANO.encode(Abstract)[1:]\n", + " decoder_default_tokens = TOKENANO.encode(\"\")\n", + "\n", + " input_tokens, padding = Transformer.normalize_sequence(\n", + " input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " )\n", + " output_tokens, _ = Transformer.normalize_sequence(\n", + " output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " )\n", + " decoder_default_tokens, _ = Transformer.normalize_sequence(\n", + " decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", + " )\n", + "\n", + " TOY_BATCH_INPUT_LIST.append(input_tokens)\n", + " TOY_BATCH_PADDING_LIST.append(padding)\n", + " TOY_BATCH_TARGET_LIST.append(output_tokens)\n", + " TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n", + "\n", + "# Training loop\n", + "LOSS_HISTORY = []\n", + "NANOSOCRATES = Transformer.TrainingModel(\n", + " TOKEN_SPACE_SIZE,\n", + " EMBEDDED_SIZE,\n", + " FEED_FORWARD_MULTIPLIER,\n", + " ATTENTION_HEADS,\n", + " NUMBER_OF_BLOCKS\n", + ")\n", + "\n", + "NANOSOCRATES.train() # nothing important, activates dropout etc \n", + "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n", + "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n", + "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n", + "\n", + "last_loss = 0\n", + "\n", + "current_epoch = 0\n", + "while current_epoch < MAX_EPOCHS:\n", + "\n", + " encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n", + " decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n", + " padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n", + " target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]]) # Transform target into logits\n", + "\n", + " optimizer.zero_grad() # to clear gradient\n", + "\n", + " last_loss = 0.0\n", + "\n", + " for i in range(0, SENTENCE_LENGTH):\n", + "\n", + " # optimizer.zero_grad()\n", + " # forward \n", + " logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n", + " # probabilities = torch.softmax(logits,2)\n", + " \n", + "\n", + " step_logits = logits[:, i, :] # [B, V]\n", + " step_target = target_logits[:, i] # [B]\n", + "\n", + " loss = cross_entropy(step_logits,step_target) # now loss is without softmax\n", 
+ " loss.backward() # DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\n", + " last_loss = loss\n", + " optimizer.step()\n", + " optimizer.zero_grad()\n", + " scheduler.step()\n", + " \n", + " probabilities = torch.softmax(logits,2)\n", + " most_probable_tokens = torch.argmax(probabilities, 2) \n", + " if i < SENTENCE_LENGTH - 1:\n", + " decoder_list[:,i+1] = most_probable_tokens[:,i]\n", + "\n", + "\n", + " current_epoch += 1\n", + "\n", + " if current_epoch % 1 == 0:\n", + " print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "deep_learning", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Playgrounds/trainer.ipynb b/Playgrounds/trainer.ipynb new file mode 100644 index 0000000..e69de29 diff --git a/Project_Model/Libs/Batch/Classes/BatchEmbedder.py b/Project_Model/Libs/Batch/Classes/BatchEmbedder.py new file mode 100644 index 0000000..ee4c5a0 --- /dev/null +++ b/Project_Model/Libs/Batch/Classes/BatchEmbedder.py @@ -0,0 +1,11 @@ +from ....Libs.Embedder.Classes.NanoSocratesEmbedder import NanoSocratesEmbedder +import torch + +class BatchEmbedder(torch.nn.Module): + + def __init__(self, vocabulary_size: int, embedding_size: int) -> None: + super().__init__() + self.__embedder = NanoSocratesEmbedder(vocabulary_size,embedding_size) + + + def forward(self, ) \ No newline at end of file diff --git a/Project_Model/Libs/Training/logistic_collector.py b/Project_Model/Libs/Training/logistic_collector.py new file mode 100644 index 0000000..2e1ad36 --- /dev/null +++ b/Project_Model/Libs/Training/logistic_collector.py @@ -0,0 +1,42 @@ +import torch + +class LogitsCollector: + def __init__(self, pad_token: int, end_token: int, tokenizer) -> None: + self.__pad_token = pad_token # used to skip PAD + self.__end_token = end_token # used to stop at END + self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str + self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V] + + def reset(self) -> None: + self.__steps.clear() # clear history + + def add(self, logits_step: torch.Tensor) -> None: + if logits_step.dim() == 3: # handle [B,1,V] + logits_step = logits_step[:, -1, :] # -> [B,V] + self.__steps.append(logits_step.detach()) # store raw logits (detached) + + def tokens(self) -> list[list[int]]: + if not self.__steps: + return [] + stack = torch.stack(self.__steps, dim=0) # [T,B,V] + probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V] + ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T] + out: list[list[int]] = [] + for row in ids.tolist(): + seq: list[int] = [] + for tok in row: + if tok == self.__end_token: # stop on END + break + if tok == self.__pad_token: # skip PAD + continue + seq.append(tok) + out.append(seq) + return out + + def print_decoded(self) -> None: + for i, seq in enumerate(self.tokens()): + try: + text = self.__tokenizer.decode(seq) # decode tokens to string + except Exception: + text = str(seq) # fallback to ids + print(f"[{i}] {text}") # simple print diff --git a/Project_Model/Libs/Training/training.py b/Project_Model/Libs/Training/training.py new file mode 100644 index 0000000..e69de29 diff --git 
diff --git a/Project_Model/Libs/Transformer/Classes/DeToken.py b/Project_Model/Libs/Transformer/Classes/DeToken.py index c0b961e..1742374 100644 --- a/Project_Model/Libs/Transformer/Classes/DeToken.py +++ b/Project_Model/Libs/Transformer/Classes/DeToken.py @@ -14,6 +14,6 @@ class DeToken(torch.nn.Module): x = self.__linear(x) # 2) Go to logits - x = torch.softmax(x, 2) + # x = torch.softmax(x, 2) return x diff --git a/Project_Model/Libs/Transformer/Classes/Decoder.py b/Project_Model/Libs/Transformer/Classes/Decoder.py index 8074da1..3b080bf 100644 --- a/Project_Model/Libs/Transformer/Classes/Decoder.py +++ b/Project_Model/Libs/Transformer/Classes/Decoder.py @@ -41,11 +41,12 @@ class Decoder(nn.Module): torch.Tensor, torch.Tensor, torch.Tensor, + torch.Tensor, torch.Tensor ] ): # -> list[torch.Tensor]: # k_x = v_x, while x_q = x # WARNING: args is needed to have sequential - x, k_x, v_x, padding_mask = args + x, k_x, v_x, padding_mask, encoder_padding_mask = args # build the causal attention mask attention_mask = get_causal_attention_mask(x.size(1)) @@ -68,7 +69,7 @@ class Decoder(nn.Module): # 5) Encoder–decoder (cross) attention CROSS_ATTENTION = self.__cross_attention( - x, k_x, v_x, key_padding_mask=padding_mask + x, k_x, v_x, key_padding_mask=encoder_padding_mask ) # 6) Dropout @@ -96,7 +97,7 @@ class Decoder(nn.Module): # 12) Layer Normalization x = self.__layer_norm_3(x) - return (x, k_x, v_x, padding_mask) + return (x, k_x, v_x, padding_mask, encoder_padding_mask) # use eval to disable dropout etc.
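The `Decoder` change above is the key fix in this diff: in cross-attention, `key_padding_mask` must mark PAD positions of the keys/values, i.e. the encoder memory, not of the decoder queries, which is why a separate `encoder_padding_mask` is now threaded through. A small illustration with `torch.nn.MultiheadAttention` (toy sizes only, not repository code):

import torch

mha = torch.nn.MultiheadAttention(embed_dim=8, num_heads=2, batch_first=True)
q = torch.randn(1, 3, 8)    # decoder queries [B, T_dec, E]
kv = torch.randn(1, 5, 8)   # encoder memory  [B, T_enc, E]
enc_pad = torch.tensor([[False, False, False, True, True]])  # True = PAD key
out, _ = mha(q, kv, kv, key_padding_mask=enc_pad)  # mask length matches kv, not q
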
diff --git a/Project_Model/Libs/Transformer/Classes/NanoSocrates.py b/Project_Model/Libs/Transformer/Classes/NanoSocrates.py new file mode 100644 index 0000000..560e7bd --- /dev/null +++ b/Project_Model/Libs/Transformer/Classes/NanoSocrates.py @@ -0,0 +1,23 @@ +import torch +from .NanoSocratesCore import NanoSocratesCore + +class NanoSocrates(torch.nn.Module): + + def __init__(self, + embedded_size: int, + feed_forward_dim: int, + encoder_layers: int, + decoder_layers: int, + attention_heads: int, + vocab_size: int) -> None: + + super().__init__() + + self._model = NanoSocratesCore( + embedded_size, + feed_forward_dim, + encoder_layers, + decoder_layers, + attention_heads, + vocab_size) + \ No newline at end of file diff --git a/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py b/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py index fca307a..541609b 100644 --- a/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py +++ b/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py @@ -16,8 +16,11 @@ class NanoSocratesCore(torch.nn.Module): num_encoder_layers: int = 2, num_decoder_layers: int = 2, num_attention_heads: int = 4, + pad_token: int = 0, ) -> None: + super().__init__() + self.__pad_token = pad_token feed_forward_dim = embedding_size * feed_forward_multiplier self.__sentence_length = sentence_length @@ -43,69 +46,64 @@ class NanoSocratesCore(torch.nn.Module): self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size) self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size) + @torch.no_grad() # inference only def forward( self, encoder_input: list[list[int]], - decoder_input: list[list[int]], - encoder_padding_mask: list[list[int]], + decoder_input: list[list[int]], # must start with and PAD elsewhere + encoder_padding_mask: list[list[bool]], # True where encoder is PAD ): - - if len(encoder_padding_mask) != len(encoder_input): - raise Exception("Mismatch in received_dimensions") - - # TODO: check for tensor in input to embedder # 1) Embed User-Input for encoders - ENCODER_INPUT = self.__input_embeder(encoder_input) + ENCODER_INPUT = self.__input_embeder(encoder_input) # [B,S,E] # 2) Encode User-Input - ENCODER_OUTPUT, _ = self.__encoder_sequence(ENCODER_INPUT, encoder_padding_mask) + ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence( + (ENCODER_INPUT, encoder_padding_mask) # as tuple + ) # [B,S,E], [B,S] del ENCODER_INPUT - exit_loop = False - decoder_token_list = decoder_input[:] + # 3) Autoregressive Output (greedy) + LOGITS_HISTORY: list[torch.Tensor] = [] # keep per-step distributions + decoder_token_list = [row[:] for row in decoder_input] # copy tokens decoder_phase = 0 + exit_loop = False - LOGITS_HISTORY: list[torch.Tensor] = [] - - # 3) Autoregressive Output while not exit_loop: + decoder_phase += 1 # move to next position - # 3.0) Increment Counter - decoder_phase += 1 + # 3.1) Build decoder key padding mask from current tokens (True where PAD) + DECODER_KEY_PADDING_MASK: list[list[bool]] = [ + [tok == self.__pad_token for tok in row] for row in decoder_token_list + ] # [B,T] - # 3.1) Embed Decoder Input - decoder_input = self.__output_embedder(decoder_token_list) + # 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside) + DECODER_INPUT = self.__output_embedder(decoder_token_list) # [B,T,E] - # 3.2) Decode Decoder Input - DECODER_OUTPUT, _, _, _ = self.__decoder_sequence( - decoder_input, ENCODER_OUTPUT, ENCODER_OUTPUT - ) + # 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks) + DECODER_OUTPUT, _, _, _, _ = self.__decoder_sequence( # Decoder now returns a 5-tuple + (DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT, + DECODER_KEY_PADDING_MASK, encoder_padding_mask) + ) # [B,T,E] + del DECODER_INPUT - # 3.3) Go back to Token space - # TODO: change name - LOGITS = self.__linear(DECODER_OUTPUT) + # 3.4) Project to token space + LOGITS = self.__linear(DECODER_OUTPUT) # [B,T,V] del DECODER_OUTPUT - # 3.4) Transform in probabilities - # TODO: change name - TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1) - del LOGITS + # 3.5) Probabilities and greedy pick at current step + TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1) # [B,T,V] + LOGITS_HISTORY.append(TOKEN_PROBABILITIES) # store for this step - LOGITS_HISTORY.append(TOKEN_PROBABILITIES) + step_idx = decoder_phase - 1 # 0-based + TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist() # [B] -> list[int] - # 3.5) Take most probable tokens - TOKEN_IDS = torch.argmax(TOKEN_PROBABILITIES, -1) + # 3.6) Write prediction into next slot (the slot is PAD) + if step_idx + 1 < self.__sentence_length: + for b, tok in enumerate(TOKEN_IDS): + decoder_token_list[b][step_idx + 1] = tok # feed next position - # TODO: check for dimensions and for efficiency - DECODER_TOKEN_TENSOR = torch.tensor(decoder_token_list) - DECODER_TOKEN_TENSOR[:, decoder_phase] = TOKEN_IDS - decoder_token_list = DECODER_TOKEN_TENSOR.tolist() - - del TOKEN_IDS - del DECODER_TOKEN_TENSOR - - # 3.6) Check if we generated all tokens + # 3.7) Stop when we filled the sequence if decoder_phase == self.__sentence_length - 1: exit_loop = True - return LOGITS_HISTORY + return LOGITS_HISTORY # list of per-step [B,T,V] probability tensors diff --git a/Project_Model/Libs/Transformer/Models/TrainingModel.py b/Project_Model/Libs/Transformer/Models/TrainingModel.py index 2a72717..dd192dc 100644 --- a/Project_Model/Libs/Transformer/Models/TrainingModel.py +++ b/Project_Model/Libs/Transformer/Models/TrainingModel.py @@ -24,32 +24,49 @@ class TrainingModel(torch.nn.Module): vocabulary_size, latent_space ) - TMP_ENCODERS = [ + # do NOT share layer weights + enc_layers = 
[ Encoder(latent_space, feed_forward_latent_space, attention_heads) - ] * layer_number - - TMP_DECODERS = [ + for _ in range(layer_number) + ] + dec_layers = [ Decoder(latent_space, feed_forward_latent_space, attention_heads) - ] * layer_number + for _ in range(layer_number) + ] - self.__encoder = torch.nn.Sequential(*TMP_ENCODERS) - self.__decoder = torch.nn.Sequential(*TMP_DECODERS) + self.__encoder = torch.nn.Sequential(*enc_layers) + self.__decoder = torch.nn.Sequential(*dec_layers) self.__detokener = DeToken(latent_space, vocabulary_size) - def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]): - - encoder_embedder_input, padding_tensor, decoder_embedder_input = args + def forward( + self, + args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor] + ): + # returns logits for the LAST decoder position only -> [B, V] + ( + encoder_embedder_input, # [B,S] encoder tokens + encoder_padding_mask, # [B,S] True where encoder is PAD + decoder_embedder_prefix, # [B,Tp] decoder prefix (e.g., + tokens so far) + decoder_padding_mask, # [B,Tp] True where decoder prefix has PAD + ) = args - encoder_tensor = self.__encoder_embedder(encoder_embedder_input) - decoder_tensor = self.__decoder_embedder(decoder_embedder_input) + # 1) embeddings + encoder_tensor = self.__encoder_embedder(encoder_embedder_input) # [B,S,E] + decoder_tensor = self.__decoder_embedder(decoder_embedder_prefix) # [B,Tp,E] - encoder_output, _ = self.__encoder((encoder_tensor, padding_tensor)) + # 2) encode + encoder_output, _ = self.__encoder((encoder_tensor, encoder_padding_mask)) # [B,S,E], [B,S] - decoder_output, _, _, _ = self.__decoder( - (decoder_tensor, encoder_tensor, encoder_tensor, None) - ) + # 3) decode (causal mask is built inside the decoder) + decoder_output, _, _, _, _ = self.__decoder( + (decoder_tensor, encoder_output, encoder_output, + decoder_padding_mask, encoder_padding_mask) + ) # [B,Tp,E], ... - logits: torch.Tensor = self.__detokener(decoder_output) + # 4) project only the last time step + last_hidden = decoder_output[:, -1:, :] # [B,1,E] + step_logits = self.__detokener(last_hidden) # [B,1,V] + step_logits = step_logits[:, -1, :] # [B,V] - return logits + return step_logits # logits for one token
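A closing note: the notebooks above construct their scheduler as `Custom_lr(EMBEDDED_SIZE, 4000)` and call `scheduler.step()` once per optimizer step, but `Project_Model/Libs/Training/learning_rade_shedulers.py` itself is not part of this diff. The "Noam/warmup" comments refer to the learning-rate schedule from "Attention Is All You Need"; the sketch below shows the usual formula, assuming `Custom_lr` implements it (its real signature and behaviour may differ):

def noam_lr(step: int, d_model: int, warmup_steps: int) -> float:
    # lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    step = max(step, 1)  # guard against step 0
    return (d_model ** -0.5) * min(step ** -0.5, step * warmup_steps ** -1.5)

# With d_model=256 and warmup_steps=4000, the rate ramps up linearly for the
# first 4000 steps and then decays proportionally to step^-0.5.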