doctor and model test
This commit is contained in:
parent b805dc538e
commit 1de2cc59db
196
Playgrounds/doctor.ipynb
Normal file
File diff suppressed because one or more lines are too long
125
Playgrounds/doctor.py
Normal file
@@ -0,0 +1,125 @@
import random
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims
from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr
from Project_Model.Libs.Training.logistic_collector import LogitsCollector # import the external collector

# set a fixed seed
torch.manual_seed(0)
random.seed(0)
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()

VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = int(1e3)

PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]

# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)

TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []

for index, row in TOY_DATASET.iterrows():
    RDFs: str = row["RDFs"]
    Abstract: str = row["Abstract"]

    input_tokens = TOKENANO.encode(RDFs) # encoder input ids
    output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)
    decoder_default_tokens = TOKENANO.encode("<SOS>") # decoder input starts with <SOS>

    input_tokens, padding = Transformer.normalize_sequence(
        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    ) # pad/trim + end token
    output_tokens, _ = Transformer.normalize_sequence(
        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    ) # pad/trim + end token
    decoder_default_tokens = Transformer.pad_sequence(
        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN
    ) # pad with PAD up to SENTENCE_LENGTH

    TOY_BATCH_INPUT_LIST.append(input_tokens)
    TOY_BATCH_PADDING_LIST.append(padding)
    TOY_BATCH_TARGET_LIST.append(output_tokens)
    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)

# Training loop
LOSS_HISTORY = []
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes

NANOSOCRATES.train()
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step

current_epoch = 0
BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize

while current_epoch < MAX_EPOCHS:
    # simple fixed mini-batch from the top; later you can shuffle/slice
    enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids
    pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present
    tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)

    # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step
    dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]

    total_loss = 0.0
    collector.reset() # start fresh for this epoch

    T = tgt.size(1) # sequence length
    for t in range(T):
        optimizer.zero_grad(set_to_none=True) # clear grads for this token step

        prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix
        dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix

        # one-step logits given prefix (trainer model expects 4 args now)
        logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t
        collector.add(logits_t) # store logits for decoding later

        loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored
        loss_t.backward() # backprop for this step
        optimizer.step() # update params
        scheduler.step() # Noam/warmup: step per optimizer step

        total_loss = float(loss_t.detach()) # keep last step loss for logging

        # teacher forcing: reveal the correct token for next position
        if t < T - 1:
            dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot

    current_epoch += 1
    print(f"EPOCH {current_epoch}\n\tLoss: {total_loss:.6f}") # simple log
    collector.print_decoded() # print decoded predictions for the batch
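Note on the scheduler used above: Custom_lr(EMBEDDED_SIZE, 4000) is stepped once per optimizer step and the inline comment calls it a Noam/warmup schedule, but its implementation is not part of this commit. Below is only a minimal sketch of that kind of schedule, assuming the usual lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5) rule; the class name NoamWarmupLR and the explicit optimizer argument are illustrative assumptions, not the project's API.

import torch

class NoamWarmupLR:
    """Sketch of a Noam-style warmup schedule (assumed, not the real Custom_lr)."""

    def __init__(self, optimizer: torch.optim.Optimizer, d_model: int, warmup_steps: int = 4000) -> None:
        self._optimizer = optimizer
        self._d_model = d_model
        self._warmup = warmup_steps
        self._step_num = 0

    def step(self) -> None:
        self._step_num += 1
        lr = self._d_model ** -0.5 * min(self._step_num ** -0.5,
                                         self._step_num * self._warmup ** -1.5)
        for group in self._optimizer.param_groups:
            group["lr"] = lr  # overwrite the learning rate on every optimizer step

model = torch.nn.Linear(8, 8)                      # toy model just to exercise the sketch
opt = torch.optim.AdamW(model.parameters())
sched = NoamWarmupLR(opt, d_model=256, warmup_steps=4000)
sched.step()                                       # call once per optimizer.step(), as in the loop above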
221
Playgrounds/locistic_test.ipynb
Normal file
@@ -0,0 +1,221 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "c8741a8f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EPOCH 1\n",
"\tLoss: 7.424792\n",
"[0] \n",
"[1] \n",
"[2] \n",
"[3] \n",
"[4] \n",
"[5] \n",
"[6] \n",
"[7] \n",
"[8] \n",
"[9] \n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr\n",
"\n",
"import torch\n",
"\n",
"class LogitsCollector:\n",
"    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:\n",
"        self.__pad_token = pad_token # used to skip PAD\n",
"        self.__end_token = end_token # used to stop at END\n",
"        self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str\n",
"        self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]\n",
"\n",
"    def reset(self) -> None:\n",
"        self.__steps.clear() # clear history\n",
"\n",
"    def add(self, logits_step: torch.Tensor) -> None:\n",
"        if logits_step.dim() == 3: # handle [B,1,V]\n",
"            logits_step = logits_step[:, -1, :] # -> [B,V]\n",
"        self.__steps.append(logits_step.detach()) # store raw logits (detached)\n",
"\n",
"    def tokens(self) -> list[list[int]]:\n",
"        if not self.__steps:\n",
"            return []\n",
"        stack = torch.stack(self.__steps, dim=0) # [T,B,V]\n",
"        probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]\n",
"        ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]\n",
"        out: list[list[int]] = []\n",
"        for row in ids.tolist():\n",
"            seq: list[int] = []\n",
"            for tok in row:\n",
"                if tok == self.__end_token: # stop on END\n",
"                    break\n",
"                if tok == self.__pad_token: # skip PAD\n",
"                    continue\n",
"                seq.append(tok)\n",
"            out.append(seq)\n",
"        return out\n",
"\n",
"    def print_decoded(self) -> None:\n",
"        for i, seq in enumerate(self.tokens()):\n",
"            try:\n",
"                text = self.__tokenizer.decode(seq) # decode tokens to string\n",
"            except Exception:\n",
"                text = str(seq) # fallback to ids\n",
"            print(f\"[{i}] {text}\") # simple print\n",
"\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
"    RDFs: str = row[\"RDFs\"]\n",
"    Abstract: str = row[\"Abstract\"]\n",
"\n",
"    input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
"    output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
"    decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
"\n",
"    input_tokens, padding = Transformer.normalize_sequence(\n",
"        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
"    ) # pad/trim + end token\n",
"    output_tokens, _ = Transformer.normalize_sequence(\n",
"        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
"    ) # pad/trim + end token\n",
"    decoder_default_tokens = Transformer.pad_sequence(\n",
"        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
"    ) # pad with PAD up to SENTENCE_LENGTH\n",
"\n",
"    TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
"    TOY_BATCH_PADDING_LIST.append(padding)\n",
"    TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
"    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
"    TOKEN_SPACE_SIZE,\n",
"    EMBEDDED_SIZE,\n",
"    FEED_FORWARD_MULTIPLIER,\n",
"    ATTENTION_HEADS,\n",
"    NUMBER_OF_BLOCKS,\n",
")\n",
"\n",
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
"\n",
"NANOSOCRATES.train()\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step\n",
"\n",
"current_epoch = 0\n",
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
"    # simple fixed mini-batch from the top; later you can shuffle/slice\n",
"    enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
"    pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
"    tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
"\n",
"    # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
"    dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
"\n",
"    total_loss = 0.0\n",
"    collector.reset() # start fresh for this epoch\n",
"\n",
"    T = tgt.size(1) # sequence length\n",
"    for t in range(T):\n",
"        optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
"\n",
"        prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
"        dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
"\n",
"        # one-step logits given prefix (trainer model expects 4 args now)\n",
"        logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t\n",
"        collector.add(logits_t) # store logits for decoding later\n",
"\n",
"        loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored\n",
"        loss_t.backward() # backprop for this step\n",
"        optimizer.step() # update params\n",
"        scheduler.step() # Noam/warmup: step per optimizer step\n",
"\n",
"        total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
"\n",
"        # teacher forcing: reveal the correct token for next position\n",
"        if t < T - 1:\n",
"            dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
"\n",
"    current_epoch += 1\n",
"    print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
"    collector.print_decoded() # print decoded predictions for the batch\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
205
Playgrounds/model-teacher-forcing.ipynb
Normal file
@@ -0,0 +1,205 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0afbf498",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EPOCH 1\n",
"\tLoss: 9.174470901489258\n",
"EPOCH 2\n",
"\tLoss: 9.20919132232666\n",
"EPOCH 3\n",
"\tLoss: 9.227106094360352\n",
"EPOCH 4\n",
"\tLoss: 9.172086715698242\n",
"EPOCH 5\n",
"\tLoss: 9.180150985717773\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 116\u001b[39m\n\u001b[32m 113\u001b[39m step_target = target_logits[:, i] \u001b[38;5;66;03m# [B]\u001b[39;00m\n\u001b[32m 115\u001b[39m loss = cross_entropy(step_logits,step_target) \u001b[38;5;66;03m# now loss is without softmax\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m116\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\u001b[39;00m\n\u001b[32m 117\u001b[39m last_loss = loss\n\u001b[32m 118\u001b[39m optimizer.step()\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:638\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 595\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Computes the gradient of current tensor wrt graph leaves.\u001b[39;00m\n\u001b[32m 596\u001b[39m \n\u001b[32m 597\u001b[39m \u001b[33;03mThe graph is differentiated using the chain rule. If the tensor is\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 635\u001b[39m \u001b[33;03m used to compute the :attr:`tensors`. Defaults to ``None``.\u001b[39;00m\n\u001b[32m 636\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhandle_torch_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43mTensor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 641\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 642\u001b[39m \u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 643\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 644\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 647\u001b[39m torch.autograd.backward(\n\u001b[32m 648\u001b[39m \u001b[38;5;28mself\u001b[39m, gradient, retain_graph, create_graph, inputs=inputs\n\u001b[32m 649\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/overrides.py:1725\u001b[39m, in \u001b[36mhandle_torch_function\u001b[39m\u001b[34m(public_api, relevant_args, *args, **kwargs)\u001b[39m\n\u001b[32m 1721\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _is_torch_function_mode_enabled():\n\u001b[32m 1722\u001b[39m \u001b[38;5;66;03m# if we're here, the mode must be set to a TorchFunctionStackMode\u001b[39;00m\n\u001b[32m 1723\u001b[39m \u001b[38;5;66;03m# this unsets it and calls directly into TorchFunctionStackMode's torch function\u001b[39;00m\n\u001b[32m 1724\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _pop_mode_temporarily() \u001b[38;5;28;01mas\u001b[39;00m mode:\n\u001b[32m-> \u001b[39m\u001b[32m1725\u001b[39m result = \u001b[43mmode\u001b[49m\u001b[43m.\u001b[49m\u001b[43m__torch_function__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpublic_api\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1726\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m:\n\u001b[32m 1727\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/utils/_device.py:103\u001b[39m, in \u001b[36mDeviceContext.__torch_function__\u001b[39m\u001b[34m(self, func, types, args, kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m _device_constructors() \u001b[38;5;129;01mand\u001b[39;00m kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 102\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mself\u001b[39m.device\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:647\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 639\u001b[39m Tensor.backward,\n\u001b[32m 640\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 645\u001b[39m inputs=inputs,\n\u001b[32m 646\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m647\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/__init__.py:354\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 349\u001b[39m retain_graph = create_graph\n\u001b[32m 351\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m354\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/graph.py:829\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 827\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 828\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m829\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 830\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 831\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 832\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 833\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: "
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# set a default device\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
"\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
"\n",
"    RDFs: str = row[\"RDFs\"]\n",
"    Abstract: str = row[\"Abstract\"]\n",
"\n",
"    input_tokens = TOKENANO.encode(RDFs)\n",
"    output_tokens = TOKENANO.encode(Abstract)[1:]\n",
"    decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
"    input_tokens, padding = Transformer.normalize_sequence(\n",
"        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
"    )\n",
"    output_tokens, _ = Transformer.normalize_sequence(\n",
"        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
"    )\n",
"    decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
"        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
"    )\n",
"\n",
"    TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
"    TOY_BATCH_PADDING_LIST.append(padding)\n",
"    TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
"    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
"    TOKEN_SPACE_SIZE,\n",
"    EMBEDDED_SIZE,\n",
"    FEED_FORWARD_MULTIPLIER,\n",
"    ATTENTION_HEADS,\n",
"    NUMBER_OF_BLOCKS\n",
")\n",
"\n",
"NANOSOCRATES.train() # nothing important, activates dropout etc \n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
"\n",
"last_loss = 0\n",
"\n",
"current_epoch = 0\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
"    encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
"    decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
"    padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
"    target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]]) # Transform target into logits\n",
"\n",
"    optimizer.zero_grad() # to clear gradient\n",
"\n",
"    last_loss = 0.0\n",
"\n",
"    for i in range(0, SENTENCE_LENGTH):\n",
"\n",
"        # optimizer.zero_grad()\n",
"        # forward \n",
"        logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
"        # probabilities = torch.softmax(logits,2)\n",
"        \n",
"\n",
"        step_logits = logits[:, i, :] # [B, V]\n",
"        step_target = target_logits[:, i] # [B]\n",
"\n",
"        loss = cross_entropy(step_logits,step_target) # now loss is without softmax\n",
"        loss.backward() # DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\n",
"        last_loss = loss\n",
"        optimizer.step()\n",
"        optimizer.zero_grad()\n",
"        scheduler.step()\n",
"        \n",
"        probabilities = torch.softmax(logits,2)\n",
"        most_probable_tokens = torch.argmax(probabilities, 2) \n",
"        if i < SENTENCE_LENGTH - 1:\n",
"            decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
"\n",
"\n",
"    current_epoch += 1\n",
"\n",
"    if current_epoch % 1 == 0:\n",
"        print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
0
Playgrounds/trainer.ipynb
Normal file
11
Project_Model/Libs/Batch/Classes/BatchEmbedder.py
Normal file
@@ -0,0 +1,11 @@
from ....Libs.Embedder.Classes.NanoSocratesEmbedder import NanoSocratesEmbedder
import torch

class BatchEmbedder(torch.nn.Module):

    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        super().__init__()
        self.__embedder = NanoSocratesEmbedder(vocabulary_size,embedding_size)


    def forward(self, )
42
Project_Model/Libs/Training/logistic_collector.py
Normal file
@@ -0,0 +1,42 @@
import torch

class LogitsCollector:
    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
        self.__pad_token = pad_token # used to skip PAD
        self.__end_token = end_token # used to stop at END
        self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str
        self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]

    def reset(self) -> None:
        self.__steps.clear() # clear history

    def add(self, logits_step: torch.Tensor) -> None:
        if logits_step.dim() == 3: # handle [B,1,V]
            logits_step = logits_step[:, -1, :] # -> [B,V]
        self.__steps.append(logits_step.detach()) # store raw logits (detached)

    def tokens(self) -> list[list[int]]:
        if not self.__steps:
            return []
        stack = torch.stack(self.__steps, dim=0) # [T,B,V]
        probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]
        ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]
        out: list[list[int]] = []
        for row in ids.tolist():
            seq: list[int] = []
            for tok in row:
                if tok == self.__end_token: # stop on END
                    break
                if tok == self.__pad_token: # skip PAD
                    continue
                seq.append(tok)
            out.append(seq)
        return out

    def print_decoded(self) -> None:
        for i, seq in enumerate(self.tokens()):
            try:
                text = self.__tokenizer.decode(seq) # decode tokens to string
            except Exception:
                text = str(seq) # fallback to ids
            print(f"[{i}] {text}") # simple print
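A minimal usage sketch of the collector defined above; ToyTokenizer, the PAD/END ids and the random logits are invented for illustration only.

import torch

class ToyTokenizer:
    def decode(self, ids: list[int]) -> str:
        return " ".join(str(i) for i in ids)  # a real tokenizer would map ids back to text

PAD_TOKEN, END_TOKEN = 0, 1
collector = LogitsCollector(PAD_TOKEN, END_TOKEN, ToyTokenizer())
for _ in range(3):                    # three decoding steps
    collector.add(torch.randn(2, 5))  # [B=2, V=5] per-step logits
collector.print_decoded()             # greedy argmax per step; END stops, PAD is skipped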
0
Project_Model/Libs/Training/training.py
Normal file
@@ -14,6 +14,6 @@ class DeToken(torch.nn.Module):
x = self.__linear(x)

# 2) Go to logits
x = torch.softmax(x, 2)
# x = torch.softmax(x, 2)

return x
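The DeToken change above comments out the final softmax so the head returns raw logits. torch.nn.CrossEntropyLoss applies log-softmax internally, so feeding it probabilities would effectively apply softmax twice; a small self-contained check with arbitrary numbers:

import torch

logits = torch.tensor([[2.0, 0.5, -1.0]])
target = torch.tensor([0])
ce = torch.nn.CrossEntropyLoss()
print(ce(logits, target))                         # intended: loss computed on raw logits
print(ce(torch.softmax(logits, dim=-1), target))  # double softmax: flatter, misleading loss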
@@ -41,11 +41,12 @@ class Decoder(nn.Module):
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor
]
): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
# WARNING: args is needed to have sequential
x, k_x, v_x, padding_mask = args
x, k_x, v_x, padding_mask,encoder_padding_mask = args

# build of attention mask
attention_mask = get_causal_attention_mask(x.size(1))
@@ -68,7 +69,7 @@

# 5) Encoder–decoder (cross) attention
CROSS_ATTENTION = self.__cross_attention(
x, k_x, v_x, key_padding_mask=padding_mask
x, k_x, v_x, key_padding_mask=encoder_padding_mask
)

# 6) Dropout
@@ -96,7 +97,7 @@
# 12) Layer Normalization
x = self.__layer_norm_3(x)

return (x, k_x, v_x, padding_mask)
return (x, k_x, v_x, padding_mask, encoder_padding_mask)


# use eval to disable dropout ecc
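The decoder above relies on get_causal_attention_mask(x.size(1)), which is not shown in this diff. As a rough sketch only, assuming the usual boolean upper-triangular convention where True means "do not attend" (the real helper may instead return an additive float mask):

import torch

def get_causal_attention_mask(size: int) -> torch.Tensor:
    # True above the diagonal -> position i may not attend to positions j > i
    return torch.triu(torch.ones(size, size, dtype=torch.bool), diagonal=1)

print(get_causal_attention_mask(4))  # 4x4 mask for a length-4 prefix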
23
Project_Model/Libs/Transformer/Classes/NanoSocrates.py
Normal file
@@ -0,0 +1,23 @@
import torch
from NanoSocratesCore import NanoSocratesCore

class NanoSocrates(torch.nn.Module):

    def __init__(self,
                 embedded_size: int,
                 feed_forward_dim: int,
                 encoder_layers: int,
                 decoder_layers:int,
                 attention_heads: int,
                 vocab_size: int) -> None:

        super().__init__()

        self._model = NanoSocratesCore(
            embedded_size,
            feed_forward_dim,
            encoder_layers,
            decoder_layers,
            attention_heads,
            vocab_size)
@@ -16,8 +16,11 @@ class NanoSocratesCore(torch.nn.Module):
num_encoder_layers: int = 2,
num_decoder_layers: int = 2,
num_attention_heads: int = 4,
pad_token: int = 0,
) -> None:

super().__init__()
self.__pad_token = pad_token
feed_forward_dim = embedding_size * feed_forward_multiplier

self.__sentence_length = sentence_length
@@ -43,69 +46,64 @@
self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)

@torch.no_grad() # inference only
def forward(
self,
encoder_input: list[list[int]],
decoder_input: list[list[int]],
encoder_padding_mask: list[list[int]],
decoder_input: list[list[int]], # must start with <SOS> and PAD elsewhere
encoder_padding_mask: list[list[bool]], # True where encoder is PAD
):

if len(encoder_padding_mask) != len(encoder_input):
raise Exception("Mismatch in received_dimensions")

# TODO: check for tensor in input to embedder
# 1) Embed User-Input for encoders
ENCODER_INPUT = self.__input_embeder(encoder_input)
ENCODER_INPUT = self.__input_embeder(encoder_input) # [B,S,E]

# 2) Encode User-Input
ENCODER_OUTPUT, _ = self.__encoder_sequence(ENCODER_INPUT, encoder_padding_mask)
ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence(
(ENCODER_INPUT, encoder_padding_mask) # as tuple
) # [B,S,E], [B,S]
del ENCODER_INPUT

exit_loop = False
decoder_token_list = decoder_input[:]
# 3) Autoregressive Output (greedy)
LOGITS_HISTORY: list[torch.Tensor] = [] # keep per-step distributions
decoder_token_list = [row[:] for row in decoder_input] # copy tokens
decoder_phase = 0
exit_loop = False

LOGITS_HISTORY: list[torch.Tensor] = []

# 3) Autoregressive Output
while not exit_loop:
decoder_phase += 1 # move to next position

# 3.0) Increment Counter
decoder_phase += 1
# 3.1) Build decoder key padding mask from current tokens (True where PAD)
DECODER_KEY_PADDING_MASK: list[list[bool]] = [
[tok == self.__pad_token for tok in row] for row in decoder_token_list
] # [B,T]

# 3.1) Embed Decoder Input
decoder_input = self.__output_embedder(decoder_token_list)
# 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside)
DECODER_INPUT = self.__output_embedder(decoder_token_list) # [B,T,E]

# 3.2) Decode Decoder Input
# 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks)
DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
decoder_input, ENCODER_OUTPUT, ENCODER_OUTPUT
)
(DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT,
DECODER_KEY_PADDING_MASK, encoder_padding_mask)
) # [B,T,E]
del DECODER_INPUT

# 3.3) Go back to Token space
# TODO: change name
LOGITS = self.__linear(DECODER_OUTPUT)
# 3.4) Project to token space
LOGITS = self.__linear(DECODER_OUTPUT) # [B,T,V]
del DECODER_OUTPUT

# 3.4) Transform in probabilities
# TODO: change name
TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)
del LOGITS
# 3.5) Probabilities and greedy pick at current step
TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1) # [B,T,V]
LOGITS_HISTORY.append(TOKEN_PROBABILITIES) # store for this step

LOGITS_HISTORY.append(TOKEN_PROBABILITIES)
step_idx = decoder_phase - 1 # 0-based
TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist() # [B] -> list[int]

# 3.5) Take most probable tokens
TOKEN_IDS = torch.argmax(TOKEN_PROBABILITIES, -1)
# 3.6) Write prediction into next slot (the slot is PAD)
if step_idx + 1 < self.__sentence_length:
for b, tok in enumerate(TOKEN_IDS):
decoder_token_list[b][step_idx + 1] = tok # feed next position

# TODO: check for dimensions and for efficiency
DECODER_TOKEN_TENSOR = torch.tensor(decoder_token_list)
DECODER_TOKEN_TENSOR[:, decoder_phase] = TOKEN_IDS
decoder_token_list = DECODER_TOKEN_TENSOR.tolist()

del TOKEN_IDS
del DECODER_TOKEN_TENSOR

# 3.6) Check if we generated all tokens
# 3.7) Stop when we filled the sequence
if decoder_phase == self.__sentence_length - 1:
exit_loop = True

return LOGITS_HISTORY
return LOGITS_HISTORY # list of [B,T,V] (per step)
@@ -24,32 +24,49 @@ class TrainingModel(torch.nn.Module):
vocabulary_size, latent_space
)

TMP_ENCODERS = [
# do NOT share layer weights
enc_layers = [
Encoder(latent_space, feed_forward_latent_space, attention_heads)
] * layer_number

TMP_DECODERS = [
for _ in range(layer_number)
]
dec_layers = [
Decoder(latent_space, feed_forward_latent_space, attention_heads)
] * layer_number
for _ in range(layer_number)
]

self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
self.__encoder = torch.nn.Sequential(*enc_layers)
self.__decoder = torch.nn.Sequential(*dec_layers)

self.__detokener = DeToken(latent_space, vocabulary_size)

def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]):

encoder_embedder_input, padding_tensor, decoder_embedder_input = args
def forward(
self,
args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
):
# returns logits for the LAST decoder position only -> [B, V]
(
encoder_embedder_input, # [B,S] encoder tokens
encoder_padding_mask, # [B,S] True where encoder is PAD
decoder_embedder_prefix, # [B,Tp] decoder prefix (e.g., <SOS> + tokens so far)
decoder_padding_mask, # [B,Tp] True where decoder prefix has PAD
) = args

encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
# 1) embeddings
encoder_tensor = self.__encoder_embedder(encoder_embedder_input) # [B,S,E]
decoder_tensor = self.__decoder_embedder(decoder_embedder_prefix) # [B,Tp,E]

encoder_output, _ = self.__encoder((encoder_tensor, padding_tensor))
# 2) encode
encoder_output, _ = self.__encoder((encoder_tensor, encoder_padding_mask)) # [B,S,E], [B,S]

decoder_output, _, _, _ = self.__decoder(
(decoder_tensor, encoder_tensor, encoder_tensor, None)
)
# 3) decode (causal mask is built inside the decoder)
decoder_output, _, _, _, _ = self.__decoder(
(decoder_tensor, encoder_output, encoder_output,
decoder_padding_mask, encoder_padding_mask)
) # [B,Tp,E], ...

logits: torch.Tensor = self.__detokener(decoder_output)
# 4) project only the last time step
last_hidden = decoder_output[:, -1:, :] # [B,1,E]
step_logits = self.__detokener(last_hidden) # [B,1,V]
step_logits = step_logits[:, -1, :] # [B,V]

return logits
return step_logits # logits for one token
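The TrainingModel change above replaces list multiplication with a comprehension and notes "do NOT share layer weights". List multiplication repeats the same module object, so every copy would share one set of parameters; a tiny stand-in check:

import torch

shared = [torch.nn.Linear(4, 4)] * 2                     # repeats the SAME module object
independent = [torch.nn.Linear(4, 4) for _ in range(2)]  # builds distinct modules

print(shared[0] is shared[1])            # True  -> one set of weights updated twice per step
print(independent[0] is independent[1])  # False -> each layer trains its own weights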