From d2fdeb18a25a6ec2d96a99df6ad9e7a4b694299f Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Thu, 9 Oct 2025 12:41:47 +0200
Subject: [PATCH] Add doctor debug notebook and encoder-pretraining stub

---
 Playgrounds/doctor.ipynb           | 193 +++++++++++++++++++++++++++++
 Playgrounds/encoder-pretraining.py |   0
 2 files changed, 193 insertions(+)
 create mode 100644 Playgrounds/doctor.ipynb
 create mode 100644 Playgrounds/encoder-pretraining.py

diff --git a/Playgrounds/doctor.ipynb b/Playgrounds/doctor.ipynb
new file mode 100644
index 0000000..33e435c
--- /dev/null
+++ b/Playgrounds/doctor.ipynb
@@ -0,0 +1,193 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ddfb4457",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "AssertionError",
+     "evalue": "target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 126\u001b[39m\n\u001b[32m 124\u001b[39m \u001b[38;5;66;03m# sanity guard (helps debug vocab mismatches fast)\u001b[39;00m\n\u001b[32m 125\u001b[39m max_seen = tgt[:, :Tp].max().item()\n\u001b[32m--> \u001b[39m\u001b[32m126\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m max_seen < V \u001b[38;5;129;01mor\u001b[39;00m (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n\u001b[32m 127\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtarget id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmax_seen\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m >= V (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mV\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m). Fix TOKEN_SPACE_SIZE.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 129\u001b[39m \u001b[38;5;66;03m# CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\u001b[39;00m\n\u001b[32m 130\u001b[39m loss_t = cross_entropy(\n\u001b[32m 131\u001b[39m logits_btV.reshape(-\u001b[32m1\u001b[39m, V), \u001b[38;5;66;03m# [B*(t+1), V]\u001b[39;00m\n\u001b[32m 132\u001b[39m tgt[:, :Tp].reshape(-\u001b[32m1\u001b[39m) \u001b[38;5;66;03m# [B*(t+1)]\u001b[39;00m\n\u001b[32m 133\u001b[39m )\n",
+      "\u001b[31mAssertionError\u001b[39m: target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE."
+     ]
+    }
+   ],
+   "source": [
+    "import random\n",
+    "import torch\n",
+    "import pandas as pd\n",
+    "from pathlib import Path\n",
+    "import Project_Model.Libs.Embedder as Embedder\n",
+    "import Project_Model.Libs.BPE as BPE\n",
+    "import Project_Model.Libs.Transformer as Transformer\n",
+    "import Project_Model.Libs.TorchShims as torch_shims\n",
+    "from Project_Model.Libs.Training.learning_rade_shedulers import CustomLR\n",
+    "from Project_Model.Libs.Training.logistic_collector import LogitsCollector # external collector\n",
+    "\n",
+    "# set a fixed seed\n",
+    "torch.manual_seed(0)\n",
+    "random.seed(0)\n",
+    "DEVICE = torch_shims.get_default_device()\n",
+    "torch.set_default_device(DEVICE)\n",
+    "\n",
+    "# BPE Init\n",
+    "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
+    "SPECIAL_VOC = BPE.default_special_tokens()\n",
+    "\n",
+    "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
+    "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
+    "\n",
+    "# Constants (TEMP size; will be corrected after dataset scan below)\n",
+    "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
+    "EMBEDDED_SIZE = 256\n",
+    "FEED_FORWARD_MULTIPLIER = 4\n",
+    "ATTENTION_HEADS = 4\n",
+    "SENTENCE_LENGTH = 256\n",
+    "NUMBER_OF_BLOCKS = 2\n",
+    "MAX_EPOCHS = int(1e4)\n",
+    "\n",
+    "PAD_TOKEN = TOKENANO.encode(\"\")[0]\n",
+    "END_TOKEN = TOKENANO.encode(\"\")[0]\n",
+    "\n",
+    "# Load CSV\n",
+    "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
+    "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
+    "\n",
+    "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
+    "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
+    "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
+    "TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
+    "\n",
+    "for index, row in TOY_DATASET.iterrows():\n",
+    "    RDFs: str = row[\"RDFs\"]\n",
+    "    Abstract: str = row[\"Abstract\"]\n",
+    "\n",
+    "    input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
+    "    output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
+    "    decoder_default_tokens = TOKENANO.encode(\"\") # decoder input starts with the start token\n",
+    "\n",
+    "    input_tokens, padding = Transformer.normalize_sequence(\n",
+    "        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
+    "    ) # pad/trim + end token\n",
+    "    output_tokens, _ = Transformer.normalize_sequence(\n",
+    "        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
+    "    ) # pad/trim + end token\n",
+    "    decoder_default_tokens = Transformer.pad_sequence(\n",
+    "        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
+    "    ) # pad with PAD up to SENTENCE_LENGTH\n",
+    "\n",
+    "    TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
+    "    TOY_BATCH_PADDING_LIST.append(padding)\n",
+    "    TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
+    "    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
+    "\n",
+    "# fix V to cover ALL ids (including specials) # <- important\n",
+    "max_enc_id = max(max(row) for row in TOY_BATCH_INPUT_LIST) if TOY_BATCH_INPUT_LIST else 0\n",
+    "max_tgt_id = max(max(row) for row in TOY_BATCH_TARGET_LIST) if TOY_BATCH_TARGET_LIST else 0\n",
+    "TOKEN_SPACE_SIZE = max(TOKEN_SPACE_SIZE, max(PAD_TOKEN, END_TOKEN, max_enc_id, max_tgt_id) + 1)\n",
+    "\n",
+    "# Training loop\n",
+    "LOSS_HISTORY = []\n",
+    "NANOSOCRATES = Transformer.TrainingModel(\n",
+    "    TOKEN_SPACE_SIZE,\n",
+    "    EMBEDDED_SIZE,\n",
+    "    FEED_FORWARD_MULTIPLIER,\n",
+    "    ATTENTION_HEADS,\n",
+    "    NUMBER_OF_BLOCKS,\n",
+    ")\n",
+    "\n",
+    "collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
+    "\n",
+    "NANOSOCRATES.train()\n",
+    "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
+    "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters(), lr=1.0) # base lr works as factor\n",
+    "scheduler = CustomLR(optimizer, EMBEDDED_SIZE, warmup_steps=4000, factor=1.0) # step each optimizer step\n",
+    "\n",
+    "current_epoch = 0\n",
+    "BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
+    "\n",
+    "while current_epoch < MAX_EPOCHS:\n",
+    "    # simple fixed mini-batch from the top; later you can shuffle/slice\n",
+    "    enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
+    "    pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
+    "    tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
+    "\n",
+    "    # decoder prefix buffer: start token at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
+    "    dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
+    "\n",
+    "    total_loss = 0.0\n",
+    "    collector.reset() # start fresh for this epoch\n",
+    "\n",
+    "    T = tgt.size(1) # sequence length\n",
+    "    for t in range(T):\n",
+    "        # skip all-PAD steps to avoid CE divide-by-zero late in the sequence\n",
+    "        if (tgt[:, t] == PAD_TOKEN).all(): # all PAD at this timestep\n",
+    "            break\n",
+    "\n",
+    "        optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
+    "\n",
+    "        prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
+    "        dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
+    "\n",
+    "        # now decoder returns all steps up to t -> [B, t+1, V]\n",
+    "        logits_btV: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # full logits for learning\n",
+    "        collector.add(logits_btV) # collector will take the last step\n",
+    "\n",
+    "        Tp = logits_btV.size(1) # t+1\n",
+    "        V = logits_btV.size(-1) # vocab size\n",
+    "\n",
+    "        # sanity guard (helps debug vocab mismatches fast)\n",
+    "        max_seen = tgt[:, :Tp].max().item()\n",
+    "        assert max_seen < V or (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n",
+    "            f\"target id {max_seen} >= V ({V}). Fix TOKEN_SPACE_SIZE.\"\n",
+    "\n",
+    "        # CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\n",
+    "        loss_t = cross_entropy(\n",
+    "            logits_btV.reshape(-1, V), # [B*(t+1), V]\n",
+    "            tgt[:, :Tp].reshape(-1) # [B*(t+1)]\n",
+    "        )\n",
+    "\n",
+    "        loss_t.backward() # backprop for this step\n",
+    "        optimizer.step() # update params\n",
+    "        scheduler.step() # Noam/warmup: step per optimizer step\n",
+    "\n",
+    "        total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
+    "\n",
+    "        # teacher forcing: reveal the correct token for next position\n",
+    "        if t < T - 1:\n",
+    "            dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
+    "\n",
+    "    current_epoch += 1\n",
+    "    print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
+    "    collector.print_decoded() # print decoded predictions for the batch\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Playgrounds/encoder-pretraining.py b/Playgrounds/encoder-pretraining.py
new file mode 100644
index 0000000..e69de29