76 Commits

Author SHA1 Message Date
Christian Risi
b79521995c Last fixes 2025-10-17 22:17:24 +02:00
Christian Risi
540b78204c Added epochs 2025-10-17 17:06:42 +02:00
GassiGiuseppe
86a063591e Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-17 16:38:12 +02:00
GassiGiuseppe
f33d4f1db6 Added a loss_saver file to save the losses 2025-10-17 16:37:44 +02:00
Christian Risi
fe62b1edd5 Fixed evaluation 2025-10-16 20:05:35 +02:00
Christian Risi
892f91aad7 Fixes for evaluation 2025-10-16 19:20:23 +02:00
Christian Risi
9ff117f437 Adding best model 2025-10-16 19:20:09 +02:00
Christian Risi
83693f1d4e Fixed a patience bug and added label smoothing 2025-10-14 11:03:15 +02:00
Christian Risi
0b256001fe Changed position to reflect other datasets 2025-10-14 10:51:11 +02:00
Christian Risi
7585f556f8 Added more logging 2025-10-14 10:41:28 +02:00
Christian Risi
4968d79403 Fixed a masking problem 2025-10-14 10:34:14 +02:00
GassiGiuseppe
80fd7fd600 evaluator WIP 2025-10-12 22:59:07 +02:00
GassiGiuseppe
972a73758d added holdout for curated dataset 2025-10-12 19:06:09 +02:00
GassiGiuseppe
b38a011105 added curated dataset, which is 8000 2025-10-12 19:01:28 +02:00
GassiGiuseppe
2bdcd78622 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-12 18:20:04 +02:00
GassiGiuseppe
7dedbc481b evaluator WIP 2025-10-12 18:18:20 +02:00
Christian Risi
76345f8d4f Fixed a visual bug 2025-10-12 16:42:59 +02:00
GassiGiuseppe
2ccec9efb8 typo 2025-10-12 16:41:06 +02:00
GassiGiuseppe
e2231eb3b9 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-12 16:36:09 +02:00
GassiGiuseppe
144f8724d6 Update of the batcher to resolve a bug in the 4th construction 2025-10-12 16:35:42 +02:00
Christian Risi
07130ff489 Fixed several bugs for task 4 2025-10-12 16:30:30 +02:00
Christian Risi
e0f8a36aa5 Added support for fast resuming 2025-10-12 13:53:07 +02:00
Christian Risi
37a2501a79 Added a way to load checkpoints 2025-10-12 12:28:24 +02:00
Christian Risi
4ca1d0a189 Activated Dropout to avoid overfitting 2025-10-12 12:28:06 +02:00
Christian Risi
f463f699cf Fixed a bug over task 4 2025-10-12 12:22:38 +02:00
Christian Risi
ab3d68bc13 fixed patience not quitting 2025-10-12 01:41:34 +02:00
Christian Risi
79438e3d30 Fixed Patience system 2025-10-12 01:22:06 +02:00
Christian Risi
f98f5a2611 Fixed misprint in task 3 2025-10-12 01:16:09 +02:00
Christian Risi
4281f8724b Fixed Validation loss 2025-10-12 00:57:24 +02:00
Christian Risi
71d602e36e Fixed a memory bug 2025-10-12 00:47:20 +02:00
Christian Risi
46ee6055ec Added Colab default values 2025-10-12 00:15:54 +02:00
Christian Risi
e579e1c88b fixed verbosity 2025-10-12 00:15:15 +02:00
Christian Risi
f51ada866f Added verbosity level 2025-10-12 00:13:03 +02:00
Christian Risi
acd978cbc5 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-12 00:05:36 +02:00
Christian Risi
56fbadd55e Fixed training 2025-10-12 00:05:30 +02:00
GassiGiuseppe
14f1c574e7 typo batch size 2025-10-11 22:11:53 +02:00
Christian Risi
d8e65bfb8a Fixed a bug about mismatched batch sizes 2025-10-11 22:09:46 +02:00
Christian Risi
bcc2fe7368 Fixed bugs and added visibility 2025-10-11 21:49:29 +02:00
Christian Risi
160b7dbfc0 V0.0.1 Athene 2025-10-11 19:35:43 +02:00
GassiGiuseppe
49946727d8 updated decoder_input to work without embedder 2025-10-11 16:53:36 +02:00
GassiGiuseppe
1649cd7768 added decoder_input method to build the batch tensor to give in input to the deocder 2025-10-11 16:18:43 +02:00
GassiGiuseppe
443f54fffd WIP decoder with prefix mask 2025-10-11 15:31:43 +02:00
GassiGiuseppe
ff721107b9 typo 2025-10-11 15:26:58 +02:00
GassiGiuseppe
f1886e5be1 added builder for prefix mask 2025-10-11 15:19:09 +02:00
GassiGiuseppe
5e3878ea17 Merge branch 'dev' into dev.train 2025-10-11 11:51:58 +02:00
GassiGiuseppe
79d3fb9ff8 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-10-11 11:51:19 +02:00
GassiGiuseppe
586f021276 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-11 11:28:35 +02:00
GassiGiuseppe
82462078f8 WIP for the new prefix mask 2025-10-11 11:28:15 +02:00
Christian Risi
625f79f7c3 Fixed imports 2025-10-11 11:18:44 +02:00
GassiGiuseppe
3446870291 typo 2025-10-10 22:27:01 +02:00
GassiGiuseppe
e76dbeb9a7 typo 2025-10-10 22:26:06 +02:00
GassiGiuseppe
96610612fe Batcher added 2025-10-10 20:10:08 +02:00
Christian Risi
92ae40013d Added a way to detach models and create them standalone 2025-10-10 18:43:20 +02:00
Christian Risi
15f203cad5 Added boe 16k tokens vocabulary 2025-10-10 18:43:02 +02:00
Christian Risi
31c8541dfb Co-authored-by: GassiGiuseppe <GassiGiuseppe@users.noreply.github.com> 2025-10-10 16:28:09 +02:00
Christian Risi
bed9718f27 Added BPE small vocabulary 2025-10-10 11:40:39 +02:00
GassiGiuseppe
93865bee8a typo 2025-10-09 22:26:17 +02:00
Christian Risi
1c0ddb8753 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 22:23:36 +02:00
Christian Risi
51399f9dc9 commit of toy dataset with whole batch 2025-10-09 22:22:42 +02:00
GassiGiuseppe
d1ba4ae026 last update for collab
( we are gonna run it on a 100 yey)
2025-10-09 21:57:05 +02:00
Christian Risi
db0090981c Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 21:53:45 +02:00
Christian Risi
e1c5649d67 updated to overfit over toy dataset 2025-10-09 21:53:42 +02:00
GassiGiuseppe
0bca241662 update environment yaml 2025-10-09 20:53:45 +02:00
GassiGiuseppe
005d7af6a0 lil update of requirements 2025-10-09 20:30:06 +02:00
GassiGiuseppe
9068db550e Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 19:44:46 +02:00
GassiGiuseppe
d8f81e1a47 that god can have mercy upon us 2025-10-09 19:43:50 +02:00
Christian Risi
a67df9724e Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 18:14:33 +02:00
Christian Risi
c5fd57d854 Updated train playground 2025-10-09 18:14:29 +02:00
GassiGiuseppe
ee253c39f4 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 13:31:37 +02:00
GassiGiuseppe
2036b4015f added logistic collector 2025-10-09 13:31:16 +02:00
Christian Risi
aac7675b30 Pipeline fix and added a util to decode 2025-10-09 13:24:48 +02:00
GassiGiuseppe
d2fdeb18a2 bla bla doctor 2025-10-09 12:41:47 +02:00
Christian Risi
f3b83eda3d Rework 2025-10-09 11:37:46 +02:00
Christian Risi
0158db2dce Fixed a bug where I took encoder embeddings rather than encoder output 2025-10-09 11:37:21 +02:00
Christian Risi
ba592c3480 Disabled Softmax 2025-10-09 11:36:56 +02:00
Christian Risi
1f9c30b531 Added Custom Learning Rate 2025-10-09 11:36:40 +02:00
63 changed files with 3207 additions and 6219 deletions

Binary file not shown.

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:6e0a193f90f2b0efc5185b0db9555178b172268b3eab289225b894ac1493493f
3 size 2471083

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:dd309865b60df86f63f76341858e382a8423297ec63eb6f525ccd28b62caf486
3 size 2494589

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:2949f2e9c6ae2b4784e04405dd7f5a3ec2eb65537b421fdc6751e9d5a19af41d
3 size 19527224

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:bc28507d806df96d6c953fbba1999f62a55e26025001de5135892069df05b9bc
3 size 22021103

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:176f13b63859c4dc0ca42b94d875aa82b74ad1cd88a186c439ef5444f45ed715
3 size 24455751

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:206f83b88b442f617575985ac88f4241071fa1b7d66b5935405178051511a369
3 size 1344466

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:6914b6b1f8f06f8cf73b96b9c27bf556f1ee93256f435b7da0be0df2af093d05
3 size 1334675

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:41e92da8af52ca1c83334ebea7312c63d37fdeacde425ba91b78f44a56e4fb88
3 size 10568092

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
Assets/Model/curated/log_loss.csv LFS Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:203b6cb364cf95cbb6cc0ebbff9e8b80e80dda73ff210ad91edeedf6024f6ab1
3 size 2876

Binary file not shown.

Binary file not shown.

BIN
Assets/Model/small/bpe-small.json LFS Normal file

Binary file not shown.

193
Playgrounds/doctor.ipynb Normal file
View File

@@ -0,0 +1,193 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ddfb4457",
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE.",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 126\u001b[39m\n\u001b[32m 124\u001b[39m \u001b[38;5;66;03m# sanity guard (helps debug vocab mismatches fast)\u001b[39;00m\n\u001b[32m 125\u001b[39m max_seen = tgt[:, :Tp].max().item()\n\u001b[32m--> \u001b[39m\u001b[32m126\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m max_seen < V \u001b[38;5;129;01mor\u001b[39;00m (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n\u001b[32m 127\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtarget id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmax_seen\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m >= V (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mV\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m). Fix TOKEN_SPACE_SIZE.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 129\u001b[39m \u001b[38;5;66;03m# CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\u001b[39;00m\n\u001b[32m 130\u001b[39m loss_t = cross_entropy(\n\u001b[32m 131\u001b[39m logits_btV.reshape(-\u001b[32m1\u001b[39m, V), \u001b[38;5;66;03m# [B*(t+1), V]\u001b[39;00m\n\u001b[32m 132\u001b[39m tgt[:, :Tp].reshape(-\u001b[32m1\u001b[39m) \u001b[38;5;66;03m# [B*(t+1)]\u001b[39;00m\n\u001b[32m 133\u001b[39m )\n",
"\u001b[31mAssertionError\u001b[39m: target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE."
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"from Project_Model.Libs.Training.learning_rade_shedulers import CustomLR\n",
"from Project_Model.Libs.Training.logistic_collector import LogitsCollector # external collector\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"# Constants (TEMP size; will be corrected after dataset scan below)\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e4)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
" output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" decoder_default_tokens = Transformer.pad_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
" ) # pad with PAD up to SENTENCE_LENGTH\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# fix V to cover ALL ids (including specials) # <- important\n",
"max_enc_id = max(max(row) for row in TOY_BATCH_INPUT_LIST) if TOY_BATCH_INPUT_LIST else 0\n",
"max_tgt_id = max(max(row) for row in TOY_BATCH_TARGET_LIST) if TOY_BATCH_TARGET_LIST else 0\n",
"TOKEN_SPACE_SIZE = max(TOKEN_SPACE_SIZE, max(PAD_TOKEN, END_TOKEN, max_enc_id, max_tgt_id) + 1)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS,\n",
")\n",
"\n",
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
"\n",
"NANOSOCRATES.train()\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters(), lr=1.0) # base lr works as factor\n",
"scheduler = CustomLR(optimizer, EMBEDDED_SIZE, warmup_steps=4000, factor=1.0) # step each optimizer step\n",
"\n",
"current_epoch = 0\n",
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
" # simple fixed mini-batch from the top; later you can shuffle/slice\n",
" enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
" pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
" tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
"\n",
" # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
" dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
"\n",
" total_loss = 0.0\n",
" collector.reset() # start fresh for this epoch\n",
"\n",
" T = tgt.size(1) # sequence length\n",
" for t in range(T):\n",
" # skip all-PAD steps to avoid CE divide-by-zero late in the sequence\n",
" if (tgt[:, t] == PAD_TOKEN).all(): # all PAD at this timestep\n",
" break\n",
"\n",
" optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
"\n",
" prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
" dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
"\n",
" # now decoder returns all steps up to t -> [B, t+1, V]\n",
" logits_btV: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # full logits for learning\n",
" collector.add(logits_btV) # collector will take the last step\n",
"\n",
" Tp = logits_btV.size(1) # t+1\n",
" V = logits_btV.size(-1) # vocab size\n",
"\n",
" # sanity guard (helps debug vocab mismatches fast)\n",
" max_seen = tgt[:, :Tp].max().item()\n",
" assert max_seen < V or (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n",
" f\"target id {max_seen} >= V ({V}). Fix TOKEN_SPACE_SIZE.\"\n",
"\n",
" # CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\n",
" loss_t = cross_entropy(\n",
" logits_btV.reshape(-1, V), # [B*(t+1), V]\n",
" tgt[:, :Tp].reshape(-1) # [B*(t+1)]\n",
" )\n",
"\n",
" loss_t.backward() # backprop for this step\n",
" optimizer.step() # update params\n",
" scheduler.step() # Noam/warmup: step per optimizer step\n",
"\n",
" total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
"\n",
" # teacher forcing: reveal the correct token for next position\n",
" if t < T - 1:\n",
" dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
"\n",
" current_epoch += 1\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
" collector.print_decoded() # print decoded predictions for the batch\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

263
Playgrounds/evaluation.py Normal file
View File

@@ -0,0 +1,263 @@
import random
import torch
from pathlib import Path
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
# Get paths
MODEL_DIR = "Assets/Model/curated"
# MODEL_DIR= "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
# TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
MODEL_PATH = Path(f"{MODEL_DIR}/NanoSocrates.zip")
# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<EOS>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
CONTINUTE_TOKEN = TOKENANO.encode("<CONTINUERDF>")[0]
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER, debug=True)
# Model
NANOSOCRATES_TRAIN = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
NANOSOCRATES = Transformer.NanoSocratesCore(
TOKEN_SPACE_SIZE,
SENTENCE_LENGTH,
SOS_TOKEN,
PAD_TOKEN,
END_TOKEN,
CONTINUTE_TOKEN,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
if MODEL_PATH.is_file():
nanosocrates_dict = torch.load(MODEL_PATH, weights_only=True, map_location=DEVICE)
NANOSOCRATES_TRAIN.load_state_dict(nanosocrates_dict)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
NANOSOCRATES = TUtils.train2inference(
NANOSOCRATES_TRAIN,
NANOSOCRATES
)
NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
NANOSOCRATES_TRAIN.eval()
task_1_metrics = []
task_2_metrics = []
task_3_metrics = []
task_4_metrics = []
example_num = 0
with torch.no_grad():
for example in TEST_BATCHER.batch(1):
print(f"DOING Example: {example_num}")
src_x, tgt_y, pad_x, pad_y, tasktype = example
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x[:, 1:] = tgt[:, :-1]
dec_x_pad = dec_x.eq(PAD_TOKEN)
out: torch.Tensor = NANOSOCRATES.inference((enc_x, enc_x_pad), tasktype)
tokens: list[int] = out.tolist()[0]
tokens.append(END_TOKEN)
tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, tokens))
out_string = TOKENANO.decode(tokens)
exp_tokens: list[int] = tgt_y[0]
exp_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, exp_tokens))
exp_string = TOKENANO.decode(exp_tokens)
enc_tokens: list[int] = src_x[0]
enc_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, enc_tokens))
enc_string = TOKENANO.decode(enc_tokens)
print(f"PROMPT:\n{enc_string}")
print(f"EXPECTED:\n{exp_string}")
print(f"ACTUAL:\n{out_string}")
if tasktype == Batch.TaskType.RDF2TXT:
example_num += 1
ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
ref_str = TOKENANO.decode(ref)
pred_str = TOKENANO.decode(pred)
bleu, rouge, meteor = TUtils.rdf2txt([ref_str], [pred_str])
task_1_metrics.append(
[
bleu["bleu"], rouge["rougeL"], meteor["meteor"] # type: ignore
]
)
if tasktype == Batch.TaskType.TEXT2RDF:
ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
pred = TUtils.remove_padding(tokens[1:], PAD_TOKEN, END_TOKEN)
ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
precision, recall = TUtils.txt2rdf(ref, pred)
task_2_metrics.append(
[
precision["precision"], recall["recall"] # type: ignore
]
)
if tasktype == Batch.TaskType.MASKING:
ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
accuracy = TUtils.accuracy(ref, pred)
task_3_metrics.append(
accuracy["accuracy"] # type: ignore
)
if tasktype == Batch.TaskType.COMPLETATION:
ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
precision, recall = TUtils.txt2rdf(ref, pred)
task_4_metrics.append(
[
precision["precision"], recall["recall"] # type: ignore
]
)
bleus = [row[0] for row in task_1_metrics]
rouges = [row[1] for row in task_1_metrics]
meteors = [row[2] for row in task_1_metrics]
prec_1 = [row[0] for row in task_2_metrics]
rec_1 = [row[1] for row in task_2_metrics]
acc = task_3_metrics
prec_2 = [row[0] for row in task_4_metrics]
rec_2 = [row[1] for row in task_4_metrics]
BLEU = TUtils.average(bleus)
ROUGE = TUtils.average(rouges)
METEOR = TUtils.average(meteors)
PREC_1 = TUtils.average(prec_1)
REC_1 = TUtils.average(rec_1)
F1_1 = TUtils.f1(PREC_1, REC_1)
ACC = TUtils.average(acc)
PREC_2 = TUtils.average(prec_2)
REC_2 = TUtils.average(rec_2)
F1_2 = TUtils.f1(PREC_2, REC_2)
SEPARATOR = "**************************************************************************"
OUTPUT = "".join([
f"{SEPARATOR}\n",
"*\tRDF2TXT:\n",
f"*\t\tBLEU: {BLEU} - ROUGE: {ROUGE} - METEOR: {METEOR}\n"
f"{SEPARATOR}\n",
"*\tTXT2RDF:\n",
f"*\t\tPRECISION: {PREC_1} - RECALL: {REC_1} - F1: {F1_1}\n"
f"{SEPARATOR}\n",
"*\tRDF Completion 1:\n",
f"*\t\tACCURACY: {ACC}\n"
f"{SEPARATOR}\n",
"*\tRDF Completion 2:\n",
f"*\t\tPRECISION: {PREC_2} - RECALL: {REC_2} - F1: {F1_2}\n"
f"{SEPARATOR}\n",
""
])
print(OUTPUT)
print("\nDEBUG")
print(task_1_metrics)
print(task_2_metrics)
print(task_3_metrics)
print(task_4_metrics)

View File

@@ -0,0 +1,472 @@
import random
import sys
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
from Project_Model.Libs.Training.loss_saver import Log
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# Get paths
CHECKPOINT_DIR = "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
CHECKPOINT_PATH = Path(f"{CHECKPOINT_DIR}/NanoSocrates.zip")
NANO_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/nano_optim.zip")
ENC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/enc_optim.zip")
DEC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/dec_optim.zip")
LAST_EPOCH_PATH = Path(f"{CHECKPOINT_DIR}/last_epoch.txt")
# log saver:
loss_saver = Log(f"{CHECKPOINT_DIR}/log_loss.csv")
# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = int(300)
PRETRAIN_EPOCHS = int(20)
WARMUP_EPOCHS = int(30)
MINI_BATCH_SIZE = 20
VALIDATION_STEPS = 10
CHECKPOINT_STEPS = VALIDATION_STEPS
PATIENCE = 4
CURRENT_EPOCH = -1 if not LAST_EPOCH_PATH.is_file() else int(LAST_EPOCH_PATH.read_text())
VERBOSE = False
LEARNING_RATE = 0.05
LABEL_SMOOTHING = 0.01
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
# Model
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
if CHECKPOINT_PATH.is_file():
nanosocrates_dict = torch.load(CHECKPOINT_PATH, weights_only=True)
NANOSOCRATES.load_state_dict(nanosocrates_dict)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
# Training constants
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
encoder_ce = torch.nn.CrossEntropyLoss( label_smoothing=LABEL_SMOOTHING)
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters(), LEARNING_RATE)
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters(), LEARNING_RATE)
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters(), LEARNING_RATE)
if NANO_OPTIM_PATH.is_file():
optim_dict = torch.load(NANO_OPTIM_PATH)
nano_optim.load_state_dict(optim_dict)
if ENC_OPTIM_PATH.is_file():
optim_dict = torch.load(ENC_OPTIM_PATH)
encoder_only_optim.load_state_dict(optim_dict)
if DEC_OPTIM_PATH.is_file():
optim_dict = torch.load(DEC_OPTIM_PATH)
decoder_only_optim.load_state_dict(optim_dict)
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH)
encoder_only_scheduler = Transformer.WarmupLR(
encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
)
decoder_only_scheduler = Transformer.WarmupLR(
decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
)
current_epoch = CURRENT_EPOCH + 2
patience = 0
average_loss_validation = {
"txt": float("inf"),
"encoder_only": float("inf"),
"decoder_only": float("inf"),
}
while current_epoch < MAX_EPOCHS:
NANOSOCRATES.train()
ENCODER_ONLY.train()
DECODER_ONLY.train()
text_batch_losses = []
encoder_batch_losses = []
decoder_batch_losses = []
batch_counter = 0
if VERBOSE:
print(f"EPOCH {current_epoch} STARTING")
for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
batch_counter += 1
src_x, tgt_y, pad_x, pad_y, tasktype = batch
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x[:, 1:] = tgt[:, :-1]
dec_x_pad = dec_x.eq(PAD_TOKEN)
if VERBOSE:
for s in TUtils.decode_batch(enc_x, TOKENANO, MASK_TOKEN):
print("Input")
print(s)
for s in TUtils.decode_batch(enc_x_pad, TOKENANO, MASK_TOKEN):
print("Encoder Padding mask")
print(s)
for s in TUtils.decode_batch(tgt, TOKENANO, MASK_TOKEN):
print("Desired Output")
print(s)
a_dx = dec_x[:,:]
a_dx[:, -1]= END_TOKEN
for s in TUtils.decode_batch(a_dx, TOKENANO, MASK_TOKEN):
print("Decoder Input")
print(s)
if VERBOSE:
print(f"\tBATCH {batch_counter} Starting")
# Task 1 and Task 2
if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:
if VERBOSE:
print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
nano_optim.zero_grad()
pred_logits: torch.Tensor = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt)
loss.backward()
nano_optim.step()
text_batch_losses.append(loss)
continue
# Pretrain first
if current_epoch < PRETRAIN_EPOCHS:
continue
# Task 3
if tasktype == Batch.TaskType.MASKING:
if VERBOSE:
print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
encoder_only_optim.zero_grad()
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
# print(torch.max(tgt))
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
loss.backward()
encoder_only_optim.step()
exp_tokens: list[int] = tgt_y[0]
exp_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, exp_tokens))
exp_string = TOKENANO.decode(exp_tokens)
enc_tokens: list[int] = src_x[0]
enc_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, enc_tokens))
enc_string = TOKENANO.decode(enc_tokens)
print(f"PROMPT:\n{enc_string}")
print(f"EXPECTED:\n{exp_string}")
encoder_batch_losses.append(loss.item())
continue
# Task 4
if tasktype == Batch.TaskType.COMPLETATION:
if VERBOSE:
print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
decoder_only_optim.zero_grad()
pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = decoder_ce(pred_logits, tgt)
loss.backward()
decoder_only_optim.step()
decoder_batch_losses.append(
loss
)
continue
nano_scheduler.step()
encoder_only_scheduler.step()
decoder_only_scheduler.step()
current_epoch += 1
if current_epoch % VALIDATION_STEPS == 0:
NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
with torch.no_grad():
txt_avg_batch_losses = []
enc_avg_batch_losses = []
dec_avg_batch_losses = []
for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
src_x, tgt_y, pad_x, pad_y, tasktype = batch
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x[:, 1:] = tgt[:, :-1]
dec_x_pad = dec_x.eq(PAD_TOKEN)
# Task 1 and Task 2
if (
tasktype == Batch.TaskType.RDF2TXT
or tasktype == Batch.TaskType.TEXT2RDF
):
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = nano_cross_entropy(
pred_logits, tgt
)
txt_avg_batch_losses.append(loss)
continue
# Pretrain first
if current_epoch <= PRETRAIN_EPOCHS:
continue
# Task 3
if tasktype == Batch.TaskType.MASKING:
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
enc_avg_batch_losses.append(loss.item())
continue
# Task 4
if tasktype == Batch.TaskType.COMPLETATION:
pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = decoder_ce(pred_logits, tgt)
dec_avg_batch_losses.append(loss)
continue
txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)
enc_avg_loss = float("inf")
dec_avg_loss = float("inf")
if current_epoch > PRETRAIN_EPOCHS:
enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)
dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)
if current_epoch < PRETRAIN_EPOCHS:
if txt_avg_loss < average_loss_validation["txt"]:
average_loss_validation["txt"] = txt_avg_loss
else:
patience += 1
if VERBOSE:
print(f"losing a patience, current irritation: {patience}")
else:
counter = 0
if txt_avg_loss > average_loss_validation["txt"]:
if VERBOSE:
print("txt average is higher than lowest")
counter += 1
else:
average_loss_validation["txt"] = txt_avg_loss
if enc_avg_loss > average_loss_validation["encoder_only"]:
if VERBOSE:
print("masking average is higher than lowest")
counter += 1
else:
average_loss_validation["encoder_only"] = enc_avg_loss
if dec_avg_loss > average_loss_validation["decoder_only"]:
if VERBOSE:
print("decoding only average is higher than lowest")
counter += 1
else:
average_loss_validation["decoder_only"] = dec_avg_loss
if counter > 1:
patience += 1
if VERBOSE:
print(f"losing a patience, current irritation: {patience}")
if counter == 0:
patience = max(0, patience - 1)
if VERBOSE:
print(f"all good, gaining a patience, current irritation: {patience}")
txt_train_avg_loss = sum(text_batch_losses) / len(text_batch_losses)
enc_avg_train_loss = float("inf")
dec_avg_train_loss = float("inf")
if current_epoch > PRETRAIN_EPOCHS:
try:
enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)
dec_avg_train_loss = sum(decoder_batch_losses) / len(decoder_batch_losses)
except:
pass
# write on log
loss_saver.write([current_epoch, txt_train_avg_loss,enc_avg_train_loss,dec_avg_train_loss,txt_avg_loss,enc_avg_loss,dec_avg_loss])
SEPARATOR = "================================================================================================================"
DEBUG_TEXT = "".join(
[
f"{SEPARATOR}\n",
f"EPOCH {current_epoch}\n",
f"{SEPARATOR}\n",
f"Train Losses:\n",
f"\tAvg Losses:\n",
f"\t\tavg_txt: {txt_train_avg_loss} - avg_enc: {enc_avg_train_loss} - avg_dec: {dec_avg_train_loss}\n",
f"{SEPARATOR}\n",
f"Validation Losses:\n",
f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction_loss: {dec_avg_loss}\n",
f"{SEPARATOR}\n",
]
)
print(DEBUG_TEXT)
# Warn about patience
if patience == PATIENCE:
print("Model is likely overfitting, so let's stop here")
# SAVE MODEL
if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
torch.save(nano_optim.state_dict(), NANO_OPTIM_PATH)
torch.save(encoder_only_optim.state_dict(), ENC_OPTIM_PATH)
torch.save(decoder_only_optim.state_dict(), DEC_OPTIM_PATH)
FILE = open(LAST_EPOCH_PATH, "w", encoding="utf-8")
FILE.write(f"{current_epoch}")
FILE.close()
if patience == PATIENCE:
exit(0)

View File

@@ -11,13 +11,7 @@
"output_type": "stream", "output_type": "stream",
"text": [ "text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n", "c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n", " return func(*args, **kwargs)\n"
"252.87s - name 'tensor' is not defined\n",
"Traceback (most recent call last):\n",
" File \"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\debugpy\\_vendored\\pydevd\\_pydevd_bundle\\pydevd_vars.py\", line 636, in change_attr_expression\n",
" value = eval(expression, frame.f_globals, frame.f_locals)\n",
" File \"<string>\", line 1, in <module>\n",
"NameError: name 'tensor' is not defined\n"
] ]
}, },
{ {
@@ -25,15 +19,9 @@
"evalue": "", "evalue": "",
"output_type": "error", "output_type": "error",
"traceback": [ "traceback": [
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel." "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
] "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
}, "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details." "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
] ]
} }
@@ -68,9 +56,9 @@
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n", "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n", "EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n", "FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n", "ATTENTION_HEADS = 8\n",
"SENTENCE_LENGTH = 256\n", "SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n", "NUMBER_OF_BLOCKS = 4\n",
"MAX_EPOCHS = int(1e3)\n", "MAX_EPOCHS = int(1e3)\n",
"\n", "\n",
"\n", "\n",
@@ -105,7 +93,26 @@
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", " output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n", " )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n", " decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n", " decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
" output_tokens = TOKENANO.encode(RDFs)\n",
" input_tokens = TOKENANO.encode(Abstract)[1:]\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
" )\n", " )\n",
"\n", "\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n", " TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
@@ -124,7 +131,7 @@
")\n", ")\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n", "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n", "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n", "scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)\n",
"last_loss = 0\n", "last_loss = 0\n",
"current_epoch = 0\n", "current_epoch = 0\n",
"\n", "\n",
@@ -132,32 +139,44 @@
"\n", "\n",
" optimizer.zero_grad()\n", " optimizer.zero_grad()\n",
"\n", "\n",
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n", " encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])\n",
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n", " decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])\n",
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n", " src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)\n",
"\n", "\n",
" # Transform target into logits\n", " # Transform target into logits\n",
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]])\n", " target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])\n",
"\n", "\n",
" last_loss = 0\n", " last_loss = 0\n",
" last_prediction: torch.Tensor\n",
"\n", "\n",
" for i in range(0, SENTENCE_LENGTH):\n", " for i in range(0, SENTENCE_LENGTH):\n",
"\n", "\n",
" optimizer.zero_grad()\n", " optimizer.zero_grad()\n",
" tgt_padding = decoder_list.eq(PAD_TOKEN)\n",
"\n", "\n",
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n", " logits: torch.Tensor = NANOSOCRATES((encoder_list, src_padding, decoder_list, tgt_padding))\n",
" prob = torch.softmax(logits, 2)\n",
"\n", "\n",
" most_probable_tokens = torch.argmax(logits, 2)\n", " most_probable_tokens = torch.argmax(prob, 2)\n",
" last_prediction = most_probable_tokens\n",
"\n", "\n",
" logits = logits[:,i,:]\n", " logits = logits[:,:i,:]\n",
" logits = logits.permute(0, 2, 1)\n",
"\n",
" loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])\n",
" # loss : torch.Tensor = cross_entropy(logits, target_logits)\n",
"\n", "\n",
" loss = cross_entropy(logits, target_logits[:,i])\n",
" last_loss = loss\n", " last_loss = loss\n",
" loss.backward()\n",
" optimizer.step()\n", " optimizer.step()\n",
" scheduler.step()\n", " scheduler.step()\n",
"\n", "\n",
" if i < SENTENCE_LENGTH - 1:\n", " if i < SENTENCE_LENGTH - 1:\n",
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n", " decoder_list[:,i+1] = target_logits[:,i]\n",
"\n",
"\n",
"\n",
"\n",
"\n", "\n",
"\n", "\n",
" current_epoch += 1\n", " current_epoch += 1\n",
@@ -165,6 +184,14 @@
" if current_epoch % 1 == 0:\n", " if current_epoch % 1 == 0:\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n", " print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
"\n", "\n",
" for encoded_sentence, expected_sentence in zip(\n",
" Transformer.tensor2token(last_prediction[:,:], END_TOKEN), # type: ignore\n",
" Transformer.tensor2token(target_logits[:,:], END_TOKEN)\n",
" ):\n",
" decoded_sentence = TOKENANO.decode(encoded_sentence)\n",
" decoded_target = TOKENANO.decode(expected_sentence)\n",
" print(f\"\\tACTUAL:\\n\\t\\t{decoded_sentence}\\n\\tEXPECTED:\\n\\t\\t{decoded_target}\\n\")\n",
"\n",
"\n", "\n",
"\n", "\n",
"\n", "\n",

View File

@@ -0,0 +1,509 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "adbef43f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n",
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\lr_scheduler.py:192: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate\n",
" warnings.warn(\n"
]
},
{
"ename": "IndexError",
"evalue": "list index out of range",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mIndexError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 383\u001b[39m\n\u001b[32m 381\u001b[39m txt_min_train_losses = text_batch_losses[:][\u001b[32m0\u001b[39m]\n\u001b[32m 382\u001b[39m txt_avg_train_losses = text_batch_losses[:][\u001b[32m1\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m383\u001b[39m txt_max_train_losses = \u001b[43mtext_batch_losses\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 385\u001b[39m txt_min_loss = \u001b[38;5;28mmin\u001b[39m(txt_min_train_losses)\n\u001b[32m 386\u001b[39m txt_avg_min_loss = \u001b[38;5;28msum\u001b[39m(txt_min_train_losses) / \u001b[38;5;28mlen\u001b[39m(txt_min_train_losses)\n",
"\u001b[31mIndexError\u001b[39m: list index out of range"
]
}
],
"source": [
"import random\n",
"import sys\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TransformerUtils as TUtils\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"import Project_Model.Libs.Batch as Batch\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"\n",
"# set a default device\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"\n",
"# Get paths\n",
"VOCABULARY_PATH = Path(\"Assets/Model/small/bpe-small-16.json\")\n",
"TRAIN_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"VALIDATION_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TEST_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"CHECKPOINT_PATH = Path(\"Assets/Dataset/Tmp/NanoSocrates.zip\")\n",
"\n",
"\n",
"# BPE Init\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"MASK_EXTRA_SPACE = 25\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 8\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 4\n",
"MAX_EPOCHS = int(1e3)\n",
"PRETRAIN_EPOCHS = int(2)\n",
"WARMUP_EPOCHS = int(4e3)\n",
"MINI_BATCH_SIZE = 10\n",
"VALIDATION_STEPS = 1\n",
"CHECKPOINT_STEPS = VALIDATION_STEPS * 4\n",
"PATIENCE = 4\n",
"CURRENT_EPOCH = 0\n",
"\n",
"SOS_TOKEN = TOKENANO.encode(\"<SOS>\")[0]\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"SUBJ_TOKEN = TOKENANO.encode(\"<SUBJ>\")[0]\n",
"REL_TOKEN = TOKENANO.encode(\"<PRED>\")[0]\n",
"OBJ_TOKEN = TOKENANO.encode(\"<OBJ>\")[0]\n",
"\n",
"SPECIAL_TOKENS: set[int] = set(TOKENANO.encode(\"\".join(BPE.default_special_tokens())))\n",
"ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])\n",
"FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS\n",
"\n",
"\n",
"# Spanned_Masker\n",
"MASKER = Transformer.SpannedMasker(\n",
" TOKEN_SPACE_SIZE,\n",
" FORBIDDEN_TOKENS\n",
")\n",
"\n",
"TRAIN_BATCHER = Batch.Batcher(\n",
" TRAIN_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"VALIDATION_BATCHER = Batch.Batcher(\n",
" VALIDATION_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"TEST_BATCHER = Batch.Batcher(\n",
" TEST_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"\n",
"\n",
"# Model\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS\n",
")\n",
"_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(\n",
" NANOSOCRATES,\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE\n",
")\n",
"\n",
"\n",
"# Training constants\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())\n",
"decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())\n",
"\n",
"nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"encoder_only_scheduler = Transformer.WarmupLR(encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"decoder_only_scheduler = Transformer.WarmupLR(decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"\n",
"current_epoch = CURRENT_EPOCH\n",
"patience = 0\n",
"\n",
"\n",
"average_loss_validation = {\n",
" \"txt\": float(\"inf\"),\n",
" \"encoder_only\": float(\"inf\"),\n",
" \"decoder_only\": float(\"inf\")\n",
"}\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
" text_batch_losses = []\n",
" encoder_batch_losses = []\n",
" decoder_batch_losses = []\n",
"\n",
" for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):\n",
"\n",
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
"\n",
" enc_x = torch.tensor(src_x)\n",
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
" tgt = torch.tensor(tgt_y)\n",
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
"\n",
" # Task 1 and Task 2\n",
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" nano_optim.zero_grad()\n",
"\n",
"\n",
"\n",
" pred_logits = NANOSOCRATES((\n",
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" loss.backward()\n",
" nano_optim.step()\n",
"\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
" continue\n",
"\n",
"\n",
" # Pretrain first\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
" continue\n",
"\n",
"\n",
" # Task 3\n",
" if tasktype == Batch.TaskType.MASKING:\n",
"\n",
" encoder_only_optim.zero_grad()\n",
"\n",
" pred_logits = ENCODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
"\n",
" loss.backward()\n",
" encoder_only_optim.step()\n",
"\n",
" encoder_batch_losses.append(\n",
" loss.item()\n",
" )\n",
"\n",
" continue\n",
"\n",
"\n",
" # Task 4\n",
" if tasktype == Batch.TaskType.COMPLETATION:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" decoder_only_optim.zero_grad()\n",
"\n",
" pred_logits = DECODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" loss.backward()\n",
" decoder_only_optim.step()\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" decoder_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
"\n",
" continue\n",
"\n",
"\n",
" nano_scheduler.step()\n",
" encoder_only_scheduler.step()\n",
" decoder_only_scheduler.step()\n",
"\n",
" current_epoch += 1\n",
"\n",
" if current_epoch % VALIDATION_STEPS == 0:\n",
"\n",
" txt_avg_batch_losses = []\n",
" enc_avg_batch_losses = []\n",
" dec_avg_batch_losses = []\n",
"\n",
" for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):\n",
"\n",
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
"\n",
" enc_x = torch.tensor(src_x)\n",
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
" tgt = torch.tensor(tgt_y)\n",
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
"\n",
" # Task 1 and Task 2\n",
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
"\n",
"\n",
" pred_logits = NANOSOCRATES((\n",
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
" txt_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
"\n",
" continue\n",
"\n",
"\n",
" # Pretrain first\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
" continue\n",
"\n",
"\n",
" # Task 3\n",
" if tasktype == Batch.TaskType.MASKING:\n",
"\n",
" pred_logits = ENCODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
"\n",
" enc_avg_batch_losses.append(\n",
" loss.item()\n",
" )\n",
"\n",
" continue\n",
"\n",
"\n",
" # Task 4\n",
" if tasktype == Batch.TaskType.COMPLETATION:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" pred_logits = DECODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" dec_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
"\n",
" continue\n",
"\n",
" txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)\n",
" enc_avg_loss = float(\"inf\")\n",
" dec_avg_loss = float(\"inf\")\n",
"\n",
" if current_epoch >= PRETRAIN_EPOCHS:\n",
" enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)\n",
" dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)\n",
"\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
"\n",
" if txt_avg_loss < average_loss_validation[\"txt\"]:\n",
" average_loss_validation[\"txt\"] = txt_avg_loss\n",
" else:\n",
" patience += 1\n",
" else:\n",
"\n",
" counter = 0\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"txt\"]:\n",
" counter += 1\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"encoder_only\"]:\n",
" counter += 1\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"decoder_only\"]:\n",
" counter += 1\n",
"\n",
" if counter > 1:\n",
" patience += 1\n",
"\n",
" txt_min_train_losses = text_batch_losses[:][0]\n",
" txt_avg_train_losses = text_batch_losses[:][1]\n",
" txt_max_train_losses = text_batch_losses[:][2]\n",
"\n",
" txt_min_loss = min(txt_min_train_losses)\n",
" txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)\n",
" txt_max_loss = max(txt_max_train_losses)\n",
" txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)\n",
" txt_avg_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)\n",
"\n",
" enc_avg_train_loss = float(\"inf\")\n",
"\n",
" dec_min_loss = float(\"inf\")\n",
" dec_avg_min_loss = float(\"inf\")\n",
" dec_max_loss = float(\"inf\")\n",
" dec_avg_max_loss = float(\"inf\")\n",
" dec_avg_loss = float(\"inf\")\n",
"\n",
" if current_epoch >= PRETRAIN_EPOCHS:\n",
" enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)\n",
"\n",
" dec_min_train_losses = decoder_batch_losses[:][0]\n",
" dec_avg_train_losses = decoder_batch_losses[:][1]\n",
" dec_max_train_losses = decoder_batch_losses[:][2]\n",
"\n",
" dec_min_loss = min(dec_min_train_losses)\n",
" dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)\n",
" dec_max_loss = max(dec_max_train_losses)\n",
" dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)\n",
" dec_avg_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)\n",
"\n",
"\n",
" SEPARATOR = \"===========================================================================================\"\n",
" DEBUG_TEXT = \"\".join([\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"EPOCH {current_epoch}\"\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"Train Losses:\\n\"\n",
" f\"\\tMin Losses:\\n\"\n",
" f\"\\t\\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\\n\"\n",
" f\"\\t\\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\\n\"\n",
" f\"\\tMax Losses:\\n\"\n",
" f\"\\t\\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\\n\"\n",
" f\"\\t\\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\\n\"\n",
" f\"\\tAvg Losses:\\n\"\n",
" f\"\\t\\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\\n\"\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"Validation Losses:\\n\"\n",
" f\"\\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\"\n",
" f\"{SEPARATOR}\\n\",\n",
" ])\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" # Warn about patience\n",
" if patience == PATIENCE:\n",
" print(\n",
" \"Model is likely overfitting, so let's stop here\"\n",
" )\n",
"\n",
" # SAVE MODEL\n",
" if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:\n",
" print(f\"Saving model at {CHECKPOINT_PATH.as_posix()}\")\n",
" torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,433 @@
import random
import sys
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# Get paths
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
CHECKPOINT_PATH = Path("Assets/Dataset/Tmp/NanoSocrates.zip")
# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
PRETRAIN_EPOCHS = int(10)
WARMUP_EPOCHS = int(4e3)
MINI_BATCH_SIZE = 100
VALIDATION_STEPS = 5
CHECKPOINT_STEPS = VALIDATION_STEPS * 4
PATIENCE = 4
CURRENT_EPOCH = 0
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
# Model
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
# Training constants
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
encoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)
encoder_only_scheduler = Transformer.WarmupLR(
encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
)
decoder_only_scheduler = Transformer.WarmupLR(
decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
)
current_epoch = CURRENT_EPOCH
patience = 0
average_loss_validation = {
"txt": float("inf"),
"encoder_only": float("inf"),
"decoder_only": float("inf"),
}
while current_epoch < MAX_EPOCHS:
NANOSOCRATES.train()
ENCODER_ONLY.train()
DECODER_ONLY.train()
text_batch_losses = []
encoder_batch_losses = []
decoder_batch_losses = []
batch_counter = 0
print(f"EPOCH {current_epoch} STARTING")
for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
batch_counter += 1
src_x, tgt_y, pad_x, pad_y, tasktype = batch
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x_pad = dec_x.eq(PAD_TOKEN)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
print(f"\tBATCH {batch_counter} Starting")
# Task 1 and Task 2
if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:
print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
nano_optim.zero_grad()
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt[:, token_idx])
loss.backward()
nano_optim.step()
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
MIN_BATCH_LOSS = min(BATCH_LOSS)
MAX_BATCH_LOSS = max(BATCH_LOSS)
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])
continue
# Pretrain first
if current_epoch < PRETRAIN_EPOCHS:
continue
# Task 3
if tasktype == Batch.TaskType.MASKING:
print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
encoder_only_optim.zero_grad()
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
print(torch.max(tgt))
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
loss.backward()
encoder_only_optim.step()
encoder_batch_losses.append(loss.item())
continue
# Task 4
if tasktype == Batch.TaskType.COMPLETATION:
print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
decoder_only_optim.zero_grad()
pred_logits = DECODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = decoder_ce(pred_logits, tgt[:, token_idx])
loss.backward()
decoder_only_optim.step()
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
MIN_BATCH_LOSS = min(BATCH_LOSS)
MAX_BATCH_LOSS = max(BATCH_LOSS)
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
decoder_batch_losses.append(
[MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS]
)
continue
nano_scheduler.step()
encoder_only_scheduler.step()
decoder_only_scheduler.step()
current_epoch += 1
if current_epoch % VALIDATION_STEPS == 0:
NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
txt_avg_batch_losses = []
enc_avg_batch_losses = []
dec_avg_batch_losses = []
for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
src_x, tgt_y, pad_x, pad_y, tasktype = batch
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x_pad = dec_x.eq(PAD_TOKEN)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
# Task 1 and Task 2
if (
tasktype == Batch.TaskType.RDF2TXT
or tasktype == Batch.TaskType.TEXT2RDF
):
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt[:, token_idx])
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
txt_avg_batch_losses.append(AVG_BATCH_LOSS)
continue
# Pretrain first
if current_epoch < PRETRAIN_EPOCHS:
continue
# Task 3
if tasktype == Batch.TaskType.MASKING:
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
enc_avg_batch_losses.append(loss.item())
continue
# Task 4
if tasktype == Batch.TaskType.COMPLETATION:
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
pred_logits = DECODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = decoder_ce(pred_logits, tgt[:, token_idx])
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
dec_avg_batch_losses.append(AVG_BATCH_LOSS)
continue
txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)
enc_avg_loss = float("inf")
dec_avg_loss = float("inf")
if current_epoch >= PRETRAIN_EPOCHS:
enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)
dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)
if current_epoch < PRETRAIN_EPOCHS:
if txt_avg_loss < average_loss_validation["txt"]:
average_loss_validation["txt"] = txt_avg_loss
else:
patience += 1
else:
counter = 0
if txt_avg_loss > average_loss_validation["txt"]:
counter += 1
if txt_avg_loss > average_loss_validation["encoder_only"]:
counter += 1
if txt_avg_loss > average_loss_validation["decoder_only"]:
counter += 1
if counter > 1:
patience += 1
txt_min_train_losses = [row[0] for row in text_batch_losses]
txt_avg_train_losses = [row[1] for row in text_batch_losses]
txt_max_train_losses = [row[2] for row in text_batch_losses]
txt_min_loss = min(txt_min_train_losses)
txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)
txt_max_loss = max(txt_max_train_losses)
txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)
txt_avg_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)
enc_avg_train_loss = float("inf")
dec_min_loss = float("inf")
dec_avg_min_loss = float("inf")
dec_max_loss = float("inf")
dec_avg_max_loss = float("inf")
dec_avg_loss = float("inf")
if current_epoch >= PRETRAIN_EPOCHS:
enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)
dec_min_train_losses = [row[0] for row in decoder_batch_losses]
dec_avg_train_losses = [row[1] for row in decoder_batch_losses]
dec_max_train_losses = [row[2] for row in decoder_batch_losses]
dec_min_loss = min(dec_min_train_losses)
dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)
dec_max_loss = max(dec_max_train_losses)
dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)
dec_avg_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)
SEPARATOR = "================================================================================================================"
DEBUG_TEXT = "".join(
[
f"{SEPARATOR}\n",
f"EPOCH {current_epoch}\n",
f"{SEPARATOR}\n",
f"Train Losses:\n",
f"\tMin Losses:\n",
f"\t\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\n",
f"\t\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\n",
f"\tMax Losses:\n",
f"\t\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\n",
f"\t\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\n",
f"\tAvg Losses:\n",
f"\t\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\n",
f"{SEPARATOR}\n",
f"Validation Losses:\n",
f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\n",
f"{SEPARATOR}\n",
]
)
print(DEBUG_TEXT)
# Warn about patience
if patience == PATIENCE:
print("Model is likely overfitting, so let's stop here")
# SAVE MODEL
if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)

177
Playgrounds/prova.py Normal file
View File

@@ -0,0 +1,177 @@
import random
import time
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# set a default device
# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)
TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []
for index, row in TOY_DATASET.iterrows():
RDFs: str = row["RDFs"]
Abstract: str = row["Abstract"]
input_tokens = TOKENANO.encode(RDFs)
output_tokens = TOKENANO.encode(Abstract)[1:]
decoder_default_tokens = TOKENANO.encode("<SOS>")
input_tokens, padding = Transformer.normalize_sequence(
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
output_tokens, _ = Transformer.normalize_sequence(
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
decoder_default_tokens, _ = Transformer.normalize_sequence(
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
)
TOY_BATCH_INPUT_LIST.append(input_tokens)
TOY_BATCH_PADDING_LIST.append(padding)
TOY_BATCH_TARGET_LIST.append(output_tokens)
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
output_tokens = TOKENANO.encode(RDFs)
input_tokens = TOKENANO.encode(Abstract)[1:]
decoder_default_tokens = TOKENANO.encode("<SOS>")
input_tokens, padding = Transformer.normalize_sequence(
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
output_tokens, _ = Transformer.normalize_sequence(
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
decoder_default_tokens, _ = Transformer.normalize_sequence(
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
)
TOY_BATCH_INPUT_LIST.append(input_tokens)
TOY_BATCH_PADDING_LIST.append(padding)
TOY_BATCH_TARGET_LIST.append(output_tokens)
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
# Training loop
LOSS_HISTORY = []
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)
last_loss = 0
current_epoch = 0
while current_epoch < MAX_EPOCHS:
optimizer.zero_grad()
encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])
decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])
src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)
# Transform target into logits
target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])
last_loss = 0
last_prediction: torch.Tensor
LOSS_HISTORY = []
start = time.time_ns()
for i in range(0, SENTENCE_LENGTH):
optimizer.zero_grad()
tgt_padding = decoder_list.eq(PAD_TOKEN)
logits: torch.Tensor = NANOSOCRATES(
(encoder_list, src_padding, decoder_list, tgt_padding)
)
prob = torch.softmax(logits, 2)
most_probable_tokens = torch.argmax(prob, 2)
last_prediction = most_probable_tokens
logits = logits[:, i, :]
# logits = logits.permute(0, 2, 1)
loss: torch.Tensor = cross_entropy(logits, target_logits[:, i])
LOSS_HISTORY.append(loss.item())
# loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])
# loss : torch.Tensor = cross_entropy(logits, target_logits)
last_loss = loss
loss.backward()
optimizer.step()
scheduler.step()
if i < SENTENCE_LENGTH - 1:
decoder_list[:, i + 1] = target_logits[:, i]
current_epoch += 1
end = time.time_ns()
if current_epoch % 1 == 0:
MIN_LOSS = min(LOSS_HISTORY)
MAX_LOSS = max(LOSS_HISTORY)
AVERAGE_LOSS = sum(LOSS_HISTORY)/len(LOSS_HISTORY)
print(f"EPOCH {current_epoch}\n\tTime: {(end-start)/1E9}s\n\tLoss: {last_loss}")
print(f"\tMin Loss: {MIN_LOSS}\tAvg Loss: {AVERAGE_LOSS}\tMax Loss: {MAX_LOSS}\n")
# for encoded_sentence, expected_sentence in zip(
# Transformer.tensor2token(last_prediction[:, :], END_TOKEN), # type: ignore
# Transformer.tensor2token(target_logits[:, :], END_TOKEN),
# ):
# decoded_sentence = TOKENANO.decode(encoded_sentence)
# decoded_target = TOKENANO.decode(expected_sentence)
# print(
# f"\tACTUAL:\n\t\t{decoded_sentence}\n\tEXPECTED:\n\t\t{decoded_target}\n"
# )

View File

@@ -189,7 +189,7 @@ class NanoSocratesBPE(Encoder):
token_stack.appendleft(right_token) token_stack.appendleft(right_token)
token_stack.appendleft(left_token) token_stack.appendleft(left_token)
return UTF_8_STRING_ARR.decode("utf-8") return UTF_8_STRING_ARR.decode("utf-8", errors="ignore")
def __token_decode(self, token_id: int) -> tuple[int, int]: def __token_decode(self, token_id: int) -> tuple[int, int]:

View File

@@ -31,7 +31,7 @@ class TokeNanoCore:
def vocabulary_size(self): def vocabulary_size(self):
BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
return BPE_VOC_SIZE + SPECIAL_VOC_SIZE return BPE_VOC_SIZE + SPECIAL_VOC_SIZE + 1
def encode(self, corpus: str) -> list[int]: def encode(self, corpus: str) -> list[int]:
output: list[int] = [] output: list[int] = []

View File

@@ -1,104 +1,243 @@
import random import random
from typing import Generator import sys
from typing import Any, Generator
import pandas as pd import pandas as pd
from pathlib import Path
from ..Enums import TaskType
import Project_Model.Libs.BPE as BPE import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker # from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from TokenCompletation import TokenCompletationTransformer from Project_Model.Libs.Transformer import (
from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken SpannedMasker,
truncate_rdf_list,
normalize_sequence,
)
from Project_Model.Libs.BPE import SpecialToken
class Batcher: class Batcher:
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None: def __init__(
# ABSTRACT, TRIPLE self,
# tasks: dataset_path: Path,
# rdf2text: X: TRIPLE, Y: ABSTRACT max_length: int,
# text2rdf: X: ABSTRACT, X:TRIPLE tokenizer: BPE.TokeNanoCore,
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam) masker: SpannedMasker,
seed: int = 0,
debug = False
) -> None:
# ABSTRACT, TRIPLE
# tasks:
# rdf2text: X: TRIPLE, Y: ABSTRACT
# text2rdf: X: ABSTRACT, X:TRIPLE
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET # completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
# it will truncate
# it will instantiate spanmaskter and truncator
self._dataset_path = dataset_path self._dataset_path = dataset_path
self._batch_size = batch_size
self._tokenizer = tokenizer self._tokenizer = tokenizer
self._masker = masker self._masker = masker
self.__max_length = max_length
self._seed = seed
# self._token_completation = TokenCompletationTransformer(sotl,eos)
self._completation_task_token_truncator = truncate_rdf_list
self.__debug = debug
sotl = self._tokenizer.encode(SpecialToken.START_TRIPLE_LIST.value) def batch(self, batch_size) -> Generator[
eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value) tuple[
self._token_completation = TokenCompletationTransformer(sotl,eos) list[list[int]],
list[list[int]],
list[list[int]],
list[list[int]],
TaskType
],
Any,
Any,
]:
"""
Yields: X,Y,padding_X
"""
RNG = random.Random(self._seed)
self._masker.reseed(self._seed)
for batch in pd.read_csv(self._dataset_path, chunksize=batch_size):
def get_batch(self)-> Generator[pd.DataFrame]:
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
tokenized_batch = pd.DataFrame() tokenized_batch = pd.DataFrame()
tokenized_batch[["Abstract","RDFs"]] = ( # encode
batch[["Abstract","RDFs"]] tokenized_batch[["Abstract", "RDFs"]] = batch[["Abstract", "RDFs"]].map(
.map(lambda t: self._tokenizer.encode(t)) lambda t: self._tokenizer.encode(t)
) )
rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
mask_batch = self.__masking_trasformation(tokenized_batch)
completation_batch = self.__token_completation_task(tokenized_batch)
output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True) X, Y, padding_X, padding_Y = self.__rdf2txt_transformation(tokenized_batch)
output = output.sample(frac=1).reset_index(drop=True) yield X, Y, padding_X, padding_Y, TaskType.RDF2TXT
yield output (
X,
Y,
padding_X,
padding_Y,
) = self.__txt2rdf_transformation(tokenized_batch)
yield X, Y, padding_X, padding_Y, TaskType.TEXT2RDF
(
X,
Y,
padding_X,
padding_Y,
) = self.__masking_trasformation(tokenized_batch)
yield X, Y, padding_X, padding_Y, TaskType.MASKING
(
X,
Y,
padding_X,
padding_Y,
) = self.__token_completation_task(
tokenized_batch, RNG.randint(0, sys.maxsize)
)
yield X, Y, padding_X, padding_Y, TaskType.COMPLETATION
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0): # output = pd.concat([rdf2txt_batch,txt2rdf_batch,completation_batch],ignore_index=True)
# WIP # output = output.sample(frac=1).reset_index(drop=True)
# self.decode_debug(output)
# yield output
def __random_subset_rdfs(self, batch: pd.DataFrame, seed=0):
# WIP
rng = random.Random(seed) rng = random.Random(seed)
def to_list(x): def to_list(x):
return x.split(SpecialToken.START_TRIPLE.value)[1:] return x.split(SpecialToken.START_TRIPLE.value)[1:]
batch["RDFs"] = batch["RDFs"].map( batch["RDFs"] = batch["RDFs"].map(to_list)
to_list
) def decode_debug(self, batch: pd.DataFrame):
decoded = pd.DataFrame()
decoded[["X", "Y"]] = batch[["X", "Y"]].map(lambda t: self._tokenizer.decode(t))
print(decoded)
def __normalization(
self, X: list[list[int]], Y: list[list[int]]
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
out_X = []
padding_X = []
out_Y = []
padding_Y = []
for x in X:
out_x, padding_x = normalize_sequence(
x, self.__max_length, pad_token, end_token, True
)
out_X.append(out_x)
padding_X.append(padding_x)
for y in Y:
out_y, padding_y = normalize_sequence(
y, self.__max_length, pad_token, end_token, True
)
out_Y.append(out_y)
padding_Y.append(padding_y)
return out_X, out_Y, padding_X, padding_Y
def __rdf2txt_transformation(self, batch: pd.DataFrame): def __rdf2txt_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"}) X: list[list[int]]
return batch[["X", "Y"]] task_token = self._tokenizer.encode(SpecialToken.RDF_TO_TEXT.value)
out = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]
out["X"] = [task_token + x for x in out["X"]]
return self.__normalization(out["X"].to_list(), out["Y"].to_list())
def __txt2rdf_transformation(self, batch: pd.DataFrame): def __txt2rdf_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"}) task_token = self._tokenizer.encode(SpecialToken.TEXT_TO_RDF.value)
return batch[["X", "Y"]] out = batch.rename(columns={"Abstract": "X", "RDFs": "Y"})[["X", "Y"]]
out["X"] = [task_token + x for x in out["X"]]
return self.__normalization(out["X"].to_list(), out["Y"].to_list())
def __masking_trasformation(self, batch: pd.DataFrame): def __masking_trasformation(self, batch: pd.DataFrame):
# mask_sequence: List[int] -> Tuple[List[int], List[int]] X = []
xy_tuples = batch["RDFs"].apply(self._masker.mask_sequence) # Series of (X, Y) Y = []
for rdf in batch["RDFs"]:
x, y = self._masker.mask_sequence(rdf[:self.__max_length])
X.append(x)
Y.append(y)
return self.__normalization(X, Y)
output = batch.copy() def __token_completation_task(self, batch: pd.DataFrame, minibatch_seed: int):
# Expand into two columns preserving the original index continue_triple_token = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index) 0
return output[["X", "Y"]] ]
eot = self._tokenizer.encode(SpecialToken.END_TRIPLE.value)[0]
X = []
def __token_completation_task(self, batch: pd.DataFrame): Y = []
xy_tuples = batch["RDFs"].apply(self._token_completation.get_completation_tuple) for rdf in batch["RDFs"]:
output = batch.copy() # here first truncate to max_lenght
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index) rdf = rdf[: self.__max_length] # truncator that uses "eot" so no problem
return output[["X", "Y"]] x, y = self._completation_task_token_truncator(
rdf, 0.5, continue_triple_token, eot, minibatch_seed
)
X.append(x)
Y.append(y)
return self.__token_cmpletation_task_special_normalization(X, Y)
def __token_cmpletation_task_special_normalization(self, X: list[list[int]], Y: list[list[int]]
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
def continue_rdf_padding(sequence: list[int], pad_token: int):
for i, x in enumerate(sequence):
if x == pad_token:
i = i+1 # continueRDF will be excluded by the mask
# fill the tail with True and stop
return [False] * i + [True] * (len(sequence) - i)
return [False] * len(sequence) # no pad token found
""" pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv" end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json" continue_rdf = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[0]
out_X = []
padding_X = []
out_Y = []
padding_Y = []
from pathlib import Path for x in X:
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path)) out_x, _ = normalize_sequence(
SPECIAL_LIST = BPE.default_special_tokens() x, self.__max_length, pad_token, end_token, True
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST) )
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST))) out_X.append(out_x)
# padding_X.append(padding_x)
special_padding = continue_rdf_padding(out_x,continue_rdf)
padding_X.append(special_padding)
MASKER = SpannedMasker(TOKENANO.vocabulary_size,SPECIAL_TOKENS) for y in Y:
out_y, padding_y = normalize_sequence(
y, self.__max_length, pad_token, end_token, True
)
out_Y.append(out_y)
# special padding
# special_padding = continue_rdf_padding(out_y,continue_rdf)
# padding_Y.append(special_padding)
padding_Y.append(padding_y)
prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969." return out_X, out_Y, padding_X, padding_Y
print(TOKENANO.encode(prova))
batcher = Batcher(DATASET_PATH,8,TOKENANO,MASKER) if __name__ == "__main__":
for batch in batcher.get_batch():
print(batch) DATASET_PATH = Path("Assets/Dataset/Tmp/rdf_text.csv")
""" VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
from pathlib import Path
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
SPECIAL_LIST = BPE.default_special_tokens()
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))
MASKER = SpannedMasker(TOKENANO.vocabulary_size, SPECIAL_TOKENS)
prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
print(TOKENANO.encode(prova))
batcher = Batcher(DATASET_PATH,256, TOKENANO, MASKER)
for batch in batcher.batch(8):
print(batch)

View File

@@ -0,0 +1,2 @@
from .Batcher import Batcher
from .TokenCompletation import TokenCompletationTransformer

View File

@@ -0,0 +1,5 @@
from .TaskType import TaskType
__all__ = [
"TaskType"
]

View File

@@ -0,0 +1,5 @@
from .Classes import *
from .Enums import *
from . import Classes
from . import Enums

View File

@@ -0,0 +1,70 @@
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
class Evaluator():
def __init__(self) -> None:
# txt based evaluator
self.__rouge = evaluate.load("rouge")
self.__rouge_types = ["rougeLsum", "rouge1", "rouge2"] #rougeLsum will work bad because it expect that each sentence are divided with /n
self._bleu = evaluate.load("bleu")
self._meteor = evaluate.load("meteor")
# token based evaluator
self.__acc_m = evaluate.load("accuracy")
self.__prec_m = evaluate.load("precision")
self.__rec_m = evaluate.load("recall")
self.__f1_m = evaluate.load("f1")
def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]):
results = self.__rouge.compute(
predictions=preds, references=refs,
rouge_types=self.__rouge_types,
use_stemmer=True,
use_aggregator=True #F1
)
return {k: float(results[k]) for k in self.__rouge_types}
def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
# sacreBLEU via evaluate; expects references as list-of-lists
# each prediction can be evaluated against a list of references, hence [[ref]]
results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
return float(results["bleu"]) # (native sacreBLEU scale)
def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
# as bleu
res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
return float(res["meteor"])
def __my_accuracy(self,preds: list[list[int]], refs: list[list[int]]):
# it is done on token sequence not single token
total = len(preds)
correct = 0
for p, r in zip(preds, refs):
correct += int(p == r)
return correct / total
def __accuracy(self, preds, refs):
return accuracy_score(preds,refs)
def __clean_batch_by_pad(self, preds: list[list[int]], refs: list[list[int]]):
output_preds = []
output_refs = []
#TODO
pad_token: int = 7000 # percolate
for pred, ref in zip(preds,refs):
try:
i = ref.index(pad_token) # first time pad token appears
except ValueError:
i = len(ref)
output_preds.append(pred[:i])
output_refs.append(ref[:i])
return output_preds,output_refs
def __precision_recall(self, preds: list[list[int]], refs: list[list[int]]):
#TODO
p, r, f1, _ = precision_recall_fscore_support(
preds, refs, average="binary", zero_division=0
) #### watch later
return {"precision": float(p), "recall": float(r), "f1": float(f1)}

View File

@@ -1,41 +0,0 @@
import numpy as np
# custom LR from attention is all you need
class Custom_lr():
def __init__(self, d_model: int, warmup_step:int) -> None:
self.__d_model = d_model
self.__warmup_step = warmup_step
self.__epoch = 0
def step(self) -> int:
self.__epoch += 1
return (self.__d_model ** -0.5) * min(self.__epoch ** -0.5,
self.__epoch * (self.__warmup_step ** -1.5))
# OTHER LR
# Learning rate schedules (matching visualization parameters)
def step_lr(epoch, lr):
# StepLR: step_size=20, gamma=0.5 (from visualization)
return lr * 0.5 if epoch % 20 == 0 and epoch > 0 else lr
def exp_lr(epoch, lr):
# ExponentialLR: gamma=0.95 (from visualization)
return lr * 0.95
def cosine_lr(epoch, lr):
# CosineAnnealingLR: lr_min=0.001, lr_max=0.1, max_epochs=100 (from visualization)
lr_min, lr_max = 0.001, 0.1
max_epochs = 100
return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch * np.pi / max_epochs))
def cyclical_lr(epoch, lr):
# CyclicalLR: base_lr=0.001, max_lr=0.1, step_size=20 (from visualization)
base_lr = 0.001
max_lr = 0.1
step_size = 20
cycle = np.floor(1 + epoch / (2 * step_size))
x = np.abs(epoch / step_size - 2 * cycle + 1)
return base_lr + (max_lr - base_lr) * max(0, (1 - x))

View File

@@ -0,0 +1,43 @@
import torch
class LogitsCollector:
def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
self.__pad_token = pad_token # used to skip PAD
self.__end_token = end_token # used to stop at END
self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str
self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]
def reset(self) -> None:
self.__steps.clear() # clear history
def add(self, logits_step: torch.Tensor) -> None:
if logits_step.dim() == 3: # handle [B,1,V]
logits_step = logits_step[:, -1, :] # -> [B,V]
self.__steps.append(logits_step.detach()) # store raw logits (detached)
def tokens(self) -> list[list[int]]:
if not self.__steps:
return []
stack = torch.stack(self.__steps, dim=0) # [T,B,V]
probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]
ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]
out: list[list[int]] = []
for row in ids.tolist():
seq: list[int] = []
for tok in row:
# if tok == self.__end_token: # stop on END
# break
if tok == self.__pad_token: # skip PAD
continue
seq.append(tok)
out.append(seq)
return out
def print_decoded(self) -> None:
for i, seq in enumerate(self.tokens()):
try:
# text = text + self.__end_token
text = self.__tokenizer.decode(seq) # decode tokens to string
except Exception:
text = str(seq) # fallback to ids
print(f"[{i}] {text}") # simple print

View File

@@ -0,0 +1,20 @@
import os
from pathlib import Path
class Log:
def __init__(self, path):
self.path = path
header = ["epoch","avg_txt","avg_enc","avg_dec","txt_loss","masking_loss","prediction_loss"]
if Path(path).is_file():
return
with open(self.path, "w", encoding="utf-8", newline="") as f:
f.write(",".join(header) + "\n")
def write(self, loss: list[float]):
line = ",".join(str(float(x)) for x in loss) + "\n"
with open(self.path, "a", encoding="utf-8", newline="") as f:
f.write(line)
f.flush()
os.fsync(f.fileno()) # extra durability per write # suggested against sudden crashes since it will be done

View File

@@ -14,6 +14,6 @@ class DeToken(torch.nn.Module):
x = self.__linear(x) x = self.__linear(x)
# 2) Go to logits # 2) Go to logits
x = torch.softmax(x, 2) # x = torch.softmax(x, 2)
return x return x

View File

@@ -1,8 +1,9 @@
from typing import Optional
import torch import torch
import torch.nn as nn import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
from ..Utils.attention_mask import get_causal_attention_mask from ..Utils.attention_mask import get_causal_attention_mask, get_prefix_causal_mask_from_padding_mask
# B, L(T), E_D # B, L(T), E_D
@@ -15,8 +16,10 @@ class Decoder(nn.Module):
feed_forward_hidden_layer_dimension: int, feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int, number_of_attention_heads: int,
) -> None: ) -> None:
self.__attention_heads = number_of_attention_heads
super().__init__() super().__init__()
self.__masked_attention = MultiHeadAttention( self.__masked_attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1 embedding_dimension, number_of_attention_heads, dropout=0.1
) )
@@ -41,62 +44,72 @@ class Decoder(nn.Module):
torch.Tensor, torch.Tensor,
torch.Tensor, torch.Tensor,
torch.Tensor, torch.Tensor,
torch.Tensor torch.Tensor,
torch.Tensor,
Optional[bool]
] ]
): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x ): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
# WARNING: args is needed to have sequential # WARNING: args is needed to have sequential
x, k_x, v_x, padding_mask = args if len(args) < 6:
args = args + (False)
x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only = args
# build of attention mask # build of attention mask
attention_mask = get_causal_attention_mask(x.size(1)) # TODO: create a prefix causal mask if needed
if decoder_only:
attention_mask = get_prefix_causal_mask_from_padding_mask(x.size(1),src_padding_mask,self.__attention_heads) # the correct is tgt however ...
else:
attention_mask = get_causal_attention_mask(x.size(1))
# 1) Masked Attention # 1) Masked Attention
MASKED_ATTENTION = self.__masked_attention( MASKED_ATTENTION = self.__masked_attention(
x, x, x, key_padding_mask=padding_mask, attention_mask=attention_mask x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=attention_mask
) )
# 2) Dropout # 2) Dropout
# DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION) DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
# del MASKED_ATTENTION del MASKED_ATTENTION
# 3) Residual Connection # 3) Residual Connection
x = x + MASKED_ATTENTION x = x + DROPPED_MASKED_ATTENTION
del MASKED_ATTENTION del DROPPED_MASKED_ATTENTION
# 4) Layer Normalization # 4) Layer Normalization
x = self.__layer_norm_1(x) x = self.__layer_norm_1(x)
# 5) Encoderdecoder (cross) attention
CROSS_ATTENTION = self.__cross_attention(
x, k_x, v_x, key_padding_mask=padding_mask
)
# 6) Dropout if not decoder_only:
# DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION) # 5) Encoderdecoder (cross) attention
# del CROSS_ATTENTION CROSS_ATTENTION = self.__cross_attention(
x, k_x, v_x, key_padding_mask=src_padding_mask
)
# 7) Residual Connection # 6) Dropout
x = x + CROSS_ATTENTION DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
del CROSS_ATTENTION del CROSS_ATTENTION
# 8) Layer Normalization # 7) Residual Connection
x = self.__layer_norm_2(x) x = x + DROPPED_CROSS_ATTENTION
del DROPPED_CROSS_ATTENTION
# 8) Layer Normalization
x = self.__layer_norm_2(x)
# 9) Position-wise feed-forward # 9) Position-wise feed-forward
FEED_FORWARD = self.__feed_forward_network(x) FEED_FORWARD = self.__feed_forward_network(x)
# 10) Dropout # 10) Dropout
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD) DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
# del FEED_FORWARD del FEED_FORWARD
# 11) Residual Connection # 11) Residual Connection
x = x + FEED_FORWARD x = x + DROPPED_FEED_FORWARD
del FEED_FORWARD del DROPPED_FEED_FORWARD
# 12) Layer Normalization # 12) Layer Normalization
x = self.__layer_norm_3(x) x = self.__layer_norm_3(x)
return (x, k_x, v_x, padding_mask) return (x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only)
# use eval to disable dropout ecc # use eval to disable dropout ecc

View File

@@ -43,12 +43,12 @@ class Encoder(
ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask) ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)
# 2) Dropout # 2) Dropout
# DROPPED_ATTENTION = self.__dropout(ATTENTION) DROPPED_ATTENTION = self.__dropout(ATTENTION)
# del ATTENTION del ATTENTION
# 3) Residual Connection # 3) Residual Connection
x = x + ATTENTION x = x + DROPPED_ATTENTION
del ATTENTION del DROPPED_ATTENTION
# 4) Layer Normalization # 4) Layer Normalization
x = self.__layer_norm_1(x) x = self.__layer_norm_1(x)
@@ -57,12 +57,12 @@ class Encoder(
FEED_FORWARD = self.__feed_forward(x) FEED_FORWARD = self.__feed_forward(x)
# 6) Dropout # 6) Dropout
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD) DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
# del FEED_FORWARD del FEED_FORWARD
# 7) Residual Connection # 7) Residual Connection
x = x + FEED_FORWARD x = x + DROPPED_FEED_FORWARD
del FEED_FORWARD del DROPPED_FEED_FORWARD
# 8) Layer Normalization # 8) Layer Normalization
x = self.__layer_norm_2(x) x = self.__layer_norm_2(x)

View File

@@ -10,7 +10,7 @@ class SpannedMasker:
max_vocabulary: int, max_vocabulary: int,
forbidden_tokens: set[int], forbidden_tokens: set[int],
change_token_probability: float = 0.15, change_token_probability: float = 0.15,
average_span: int = 1, average_span: int = 2,
seed: int = random.randint(0, sys.maxsize), seed: int = random.randint(0, sys.maxsize),
) -> None: ) -> None:
@@ -25,6 +25,11 @@ class SpannedMasker:
self.__forbidden_tokens = forbidden_tokens self.__forbidden_tokens = forbidden_tokens
def reseed(self, seed:int):
self.__rng = random.Random(seed)
def mask_sequence( def mask_sequence(
self, self,
token_sequence: list[int], token_sequence: list[int],
@@ -98,7 +103,7 @@ class SpannedMasker:
if self.__is_illegal_token(INNER_TOKEN, forbidden_tokens): if self.__is_illegal_token(INNER_TOKEN, forbidden_tokens):
continue continue
MASK[mask_index] = True MASK[mask_index] = True
mask_index += 1 mask_index += 1

View File

@@ -0,0 +1,47 @@
from typing import override
import torch
# custom LR from attention is all you need
class WarmupLR(torch.optim.lr_scheduler.LRScheduler):
def __init__(
self,
optimizer: torch.optim.Optimizer,
warmup_steps: int,
embedding_size: int,
warming_multiplier: float = -1.5,
decaying_multiplier: float = -0.5,
multiplicative_factor: float = 1.0,
last_epoch: int = -1,
) -> None:
self.__warmup_steps = warmup_steps
self.__embedding_size = embedding_size
self.__warming_multiplier = warming_multiplier
self.__decaying_multiplier = decaying_multiplier
self.__multiplicative_factor = multiplicative_factor
super().__init__(optimizer, last_epoch)
def __scale_at(self, step: int) -> float:
step = max(step, 1)
return (
self.__multiplicative_factor
* (self.__embedding_size**self.__decaying_multiplier)
* min(
step**self.__decaying_multiplier,
step * (self.__warmup_steps**self.__warming_multiplier),
)
)
@override
def get_lr(self) -> list[float]:
torch.optim.lr_scheduler._warn_get_lr_called_within_step(self)
step = max(self.last_epoch, 1)
scale = self.__scale_at(step)
return [base_lr * scale for base_lr in self.base_lrs]
def _get_closed_form_lr(self):
step = max(self.last_epoch, 1)
scale = self.__scale_at(step)
return [base_lr * scale for base_lr in self.base_lrs]

View File

@@ -5,6 +5,7 @@ from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention from .TorchMultiHeadAttention import TorchMultiHeadAttention
from .SpannedMasker import SpannedMasker from .SpannedMasker import SpannedMasker
from .DeToken import DeToken from .DeToken import DeToken
from .WarmupLR import WarmupLR
__all__ = [ __all__ = [
"Decoder", "Decoder",
@@ -12,5 +13,6 @@ __all__ = [
"FeedForwardNetwork", "FeedForwardNetwork",
"TorchMultiHeadAttention", "TorchMultiHeadAttention",
"SpannedMasker", "SpannedMasker",
"DeToken" "DeToken",
"WarmupLR"
] ]

View File

@@ -0,0 +1,33 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import DeToken
class NanoSocraDecoder(torch.nn.Module):
def __init__(
self,
decoder_embedder: Embedder.NanoSocratesEmbedder,
decoder_layers: torch.nn.Sequential,
detokener: DeToken
) -> None:
super().__init__()
self.__decoder_embedder = decoder_embedder
self.__decoder = decoder_layers
self.__detokener = detokener
def forward(self, args: tuple[torch.Tensor,torch.Tensor, torch.Tensor]):
decoder_embedder_input, prefix_mask, tgt_padding = args
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_tensor, decoder_tensor, decoder_tensor, prefix_mask, tgt_padding, True)
)
logits: torch.Tensor = self.__detokener(decoder_output)
return logits

View File

@@ -0,0 +1,29 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import DeToken
class NanoSocratEncoder(torch.nn.Module):
def __init__(
self,
encoder_embedder: Embedder.NanoSocratesEmbedder,
encoder_layers: torch.nn.Sequential,
detokener: DeToken
) -> None:
super().__init__()
self.__encoder_embedder = encoder_embedder
self.__encoder = encoder_layers
self.__detokener = detokener
def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
encoder_embedder_input, src_padding = args
encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
logits: torch.Tensor = self.__detokener(encoder_output)
return logits

View File

@@ -0,0 +1,219 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import Encoder, Decoder, DeToken
from ..Utils import get_decoder_input
from Project_Model.Libs.Batch import TaskType
class NanoSocratesCore(torch.nn.Module):
def __init__(
self,
vocabulary_size: int,
sentence_max_length: int,
sos: int,
pad: int,
eos: int,
continuerdf: int,
latent_space: int = 256,
feed_forward_multiplier: int = 4,
attention_heads: int = 4,
layer_number: int = 2,
) -> None:
super().__init__()
self.__sos = sos
self.__pad = pad
self.__eos = eos
self.__continuerdf = continuerdf
self.__sentence_len = sentence_max_length
feed_forward_latent_space = latent_space * feed_forward_multiplier
self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
vocabulary_size, latent_space
)
self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
vocabulary_size, latent_space
)
TMP_ENCODERS = [
Encoder(latent_space, feed_forward_latent_space, attention_heads)
] * layer_number
TMP_DECODERS = [
Decoder(latent_space, feed_forward_latent_space, attention_heads)
] * layer_number
self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
self.__detokener = DeToken(latent_space, vocabulary_size)
self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
)
logits: torch.Tensor = self.__detokener(decoder_output)
return logits
def inference(self, input: tuple[torch.Tensor, torch.Tensor], task_type: TaskType) -> torch.Tensor:
if task_type == TaskType.MASKING:
return self.__masking(input)
if task_type == TaskType.COMPLETATION:
return self.__continue_rdf(input)
return self.__text_generation(input)
def __text_generation(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
x, padding = args
encoder_tensor = self.__encoder_embedder(x)
BATCH: int
if len(x.shape) > 2:
BATCH, SEQ_LEN, _ = x.shape
else:
_, SEQ_LEN = x.shape
BATCH = 1
encoder_output, _ = self.__encoder((encoder_tensor, padding))
decoder_in = get_decoder_input(BATCH, self.__sos, self.__pad, SEQ_LEN)
decoder_in_pad_mask = decoder_in.eq(self.__pad)
continue_generating = True
token_idx = 0
while continue_generating:
decoder_in_x = self.__decoder_embedder(decoder_in)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_in_x, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
)
logits: torch.Tensor = self.__detokener(decoder_output)
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits, 2)
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
if token_idx == self.__sentence_len - 1:
continue_generating = False
continue
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
continue_generating = False
continue
token_idx += 1
return decoder_in
def __masking(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
x, padding = args
encoder_tensor = self.__encoder_embedder(x)
x, _ = self.__encoder((encoder_tensor, padding))
logits: torch.Tensor = self.__encoder_detokener(x)
del x
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits, 2)
return tokens
def __continue_rdf(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
decoder_in, _ = args
decoder_in_prefix_mask = decoder_in.eq(self.__pad)
decoder_in_pad_mask = decoder_in.eq(self.__pad)
continue_generating = True
token_idx: int= int((decoder_in[0] == self.__continuerdf).nonzero()[0].item()) + 1
while continue_generating:
decoder_x = self.__decoder_embedder(decoder_in)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_x, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, True)
)
logits: torch.Tensor = self.__detokener(decoder_output)
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits, 2)
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
if token_idx == self.__sentence_len - 1:
continue_generating = False
continue
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
continue_generating = False
continue
token_idx += 1
return decoder_in
def take_pieces(self):
return (
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
(self.__decoder_embedder, self.__decoder, self.__detokener)
)
def load_pieces(
self,
encoder_embedder: Embedder.NanoSocratesEmbedder,
decoder_embedder: Embedder.NanoSocratesEmbedder,
encoder: torch.nn.Sequential,
decoder: torch.nn.Sequential,
encoder_detokener: DeToken,
decoder_detokener: DeToken
):
self.__encoder_embedder = encoder_embedder
self.__decoder_embedder = decoder_embedder
self.__encoder = encoder
self.__decoder = decoder
self.__encoder_detokener = encoder_detokener
self.__detokener = decoder_detokener

View File

@@ -36,20 +36,28 @@ class TrainingModel(torch.nn.Module):
self.__decoder = torch.nn.Sequential(*TMP_DECODERS) self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
self.__detokener = DeToken(latent_space, vocabulary_size) self.__detokener = DeToken(latent_space, vocabulary_size)
self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]): def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
encoder_embedder_input, padding_tensor, decoder_embedder_input = args encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
encoder_tensor = self.__encoder_embedder(encoder_embedder_input) encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
decoder_tensor = self.__decoder_embedder(decoder_embedder_input) decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
encoder_output, _ = self.__encoder((encoder_tensor, padding_tensor)) encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
decoder_output, _, _, _ = self.__decoder( decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_tensor, encoder_tensor, encoder_tensor, None) (decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
) )
logits: torch.Tensor = self.__detokener(decoder_output) logits: torch.Tensor = self.__detokener(decoder_output)
return logits return logits
def take_pieces(self):
return (
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
(self.__decoder_embedder, self.__decoder, self.__detokener)
)

View File

@@ -1,5 +1,11 @@
from .TrainingModel import TrainingModel from .TrainingModel import TrainingModel
from .NanoSocratEncoder import NanoSocratEncoder
from .NanoSocraDecoder import NanoSocraDecoder
from .NanoSocrates import NanoSocratesCore
__all__ = [ __all__ = [
"TrainingModel" "TrainingModel",
"NanoSocratEncoder",
"NanoSocraDecoder",
"NanoSocratesCore"
] ]

View File

@@ -3,6 +3,9 @@ from .task_type import TaskType
from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask
from .inference_masking import inference_masking from .inference_masking import inference_masking
from .truncate_rdf_list import truncate_rdf_list from .truncate_rdf_list import truncate_rdf_list
from .decode_out import tensor2token
from .decoder_input import get_decoder_input
__all__ = [ __all__ = [
"TaskType", "TaskType",
@@ -13,5 +16,7 @@ __all__ = [
"create_padding_mask", "create_padding_mask",
"normalize_sequence", "normalize_sequence",
"inference_masking", "inference_masking",
"truncate_rdf_list" "truncate_rdf_list",
"tensor2token",
"get_decoder_input"
] ]

View File

@@ -8,4 +8,23 @@ def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
def get_causal_attention_mask_batched(seq_len: int, batch_size: int ) -> torch.Tensor: def get_causal_attention_mask_batched(seq_len: int, batch_size: int ) -> torch.Tensor:
base_mask = get_causal_attention_mask(seq_len) base_mask = get_causal_attention_mask(seq_len)
return base_mask.unsqueeze(0).expand(batch_size, -1, -1) # add another dimension at the beginning, big as batch_size return base_mask.unsqueeze(0).expand(batch_size, -1, -1) # add another dimension at the beginning, big as batch_size
# the result is that z,x,y where x,y are repeated along z # the result is that z,x,y where x,y are repeated along z
def get_causal_attention_mask_with_prefix(seq_len, prefix):
mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
mask[:,:prefix] = False
return mask
def get_prefix_causal_mask_from_padding_mask(seq_len:int, prefix_mask, att_heads:int=1):
expanded_padding_mask = prefix_mask.unsqueeze(-1).repeat(1, 1, seq_len) # B,T,T
expanded_padding_mask = expanded_padding_mask.permute(0,2,1) # B,T,T
mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1) # T,T
tri_batched = mask.unsqueeze(0) # 1,T,T will broadcast over B
prefix_causal_mask = expanded_padding_mask & tri_batched
prefix_causal_mask = prefix_causal_mask.repeat_interleave(att_heads, dim=0) # B*H,T,T
return prefix_causal_mask
#def get_prefix_causal_mask():
# continue_rdf =

View File

@@ -0,0 +1,27 @@
from typing import Generator
import torch
def tensor2token(tensor: torch.Tensor, end_token: int) -> Generator[list[int]]:
if len(tensor.shape) < 1 or len(tensor.shape) > 2:
raise ValueError("Shape is not correct")
if len(tensor.shape) == 1:
token_list: list[int] = tensor.tolist()
token_list.append(end_token)
yield token_list
return
batch_len: int
batch_len, _ = tensor.shape
for i in range(batch_len):
smaller_tensor = tensor[i, :]
token_list: list[int] = smaller_tensor.tolist()
token_list.append(end_token)
yield token_list

View File

@@ -0,0 +1,14 @@
import torch
from ..Utils import normalize_sequence
# from Project_Model.Libs.Embedder import NanoSocratesEmbedder as Embedder
def get_decoder_input(batch_size, sos_token,pad_token, seq_len):
single_decoder_input, _ = normalize_sequence([sos_token],seq_len,pad_token, end_token=0, add_ending=False)
tensor_decoder_input = torch.tensor(single_decoder_input[:])
# embedded_decoder_intput = embedder(tensor_decoder_input)
batch_decoder_input = tensor_decoder_input.unsqueeze(0).repeat(batch_size, 1)
return batch_decoder_input

View File

@@ -1,17 +1,20 @@
def truncate_sequence( def truncate_sequence(
sequence: list[int], truncate_at: int, end_token: int sequence: list[int], truncate_at: int, end_token: int, add_ending: bool
) -> list[int]: ) -> list[int]:
if len(sequence) < truncate_at - 1: if len(sequence) < truncate_at - 1:
sequence.append(end_token) if add_ending:
sequence.append(end_token)
return sequence return sequence
if len(sequence) < truncate_at: if len(sequence) < truncate_at:
sequence[-1] = end_token if add_ending:
sequence[-1] = end_token
return sequence return sequence
TRUNCATED_SEQUENCE = sequence[:truncate_at] TRUNCATED_SEQUENCE = sequence[:truncate_at]
TRUNCATED_SEQUENCE[-1] = end_token if add_ending:
TRUNCATED_SEQUENCE[-1] = end_token
return TRUNCATED_SEQUENCE return TRUNCATED_SEQUENCE
@@ -48,8 +51,9 @@ def normalize_sequence(
max_length: int, max_length: int,
pad_token: int, pad_token: int,
end_token: int, end_token: int,
add_ending: bool = True
) -> tuple[list[int], list[bool]]: ) -> tuple[list[int], list[bool]]:
new_sequence = truncate_sequence(sequence, max_length, end_token) new_sequence = truncate_sequence(sequence, max_length, end_token, add_ending)
new_sequence = pad_sequence(new_sequence, max_length, pad_token) new_sequence = pad_sequence(new_sequence, max_length, pad_token)
PADDING_MASK = create_padding_mask(new_sequence, pad_token) PADDING_MASK = create_padding_mask(new_sequence, pad_token)

View File

@@ -1,6 +1,7 @@
from enum import Enum, auto from enum import Enum, auto
class TaskType(Enum): class TaskType(Enum):
TEXT2RDF = auto()
RDF2TEXT = auto() RDF2TEXT = auto()
MASK = auto() MASK = auto()
COMPLETATION = auto() COMPLETATION = auto()

View File

@@ -27,7 +27,6 @@ def truncate_rdf_list(
END_OF_TRIPLES.append(i + 1) END_OF_TRIPLES.append(i + 1)
TRIPLES_TOKENS: list[int] = [] TRIPLES_TOKENS: list[int] = []
TARGET_TRIPLES: list[int] = []
start_of_triple = 0 start_of_triple = 0
exit_loop = False exit_loop = False
@@ -56,10 +55,10 @@ def truncate_rdf_list(
EOT = END_OF_TRIPLES.popleft() EOT = END_OF_TRIPLES.popleft()
TRIPLE = sequence[start_of_triple:EOT] TRIPLE = sequence[start_of_triple:EOT]
TARGET_TRIPLES.extend(TRIPLE) TRIPLES_TOKENS.extend(TRIPLE)
start_of_triple = EOT start_of_triple = EOT
return (TRIPLES_TOKENS, TARGET_TRIPLES) return (TRIPLES_TOKENS, TRIPLES_TOKENS)

View File

@@ -1,7 +1,9 @@
from .Classes import * from .Classes import *
from .Enums import *
from .Utils import * from .Utils import *
from .Models import * from .Models import *
from . import Classes from . import Classes
from . import Enums
from . import Utils from . import Utils
from . import Models from . import Models

View File

@@ -0,0 +1,6 @@
from enum import Enum, auto
class ModelType(Enum):
ENCODER_ONLY = auto()
DECODER_ONLY = auto()

View File

@@ -0,0 +1,14 @@
from .model_utils import decompose_nano_socrates, create_standalone_model, train2inference
from .ModelType import ModelType
from .decode_batch import decode_batch
from .metrics import precision, recall, accuracy, f1, meteor, bleu, rouge, average, rdf2txt, txt2rdf, rdf_completion_1, rdf_completion_2, remove_padding, balance_paddings
__all__ = [
"ModelType",
"decompose_nano_socrates",
"create_standalone_model",
"decode_batch",
"train2inference",
"precision", "recall", "accuracy", "f1", "meteor", "bleu", "rouge", "average",
"rdf2txt", "txt2rdf", "rdf_completion_1", "rdf_completion_2", "remove_padding", "balance_paddings"
]

View File

@@ -0,0 +1,16 @@
import torch
import Project_Model.Libs.BPE as BPE
def decode_batch(batch: torch.Tensor, tokenizer: BPE.TokeNanoCore ,uknonw_token: int) -> list[str]:
strings = []
BATCH, _ = batch.shape
for i in range(0, BATCH):
tokens: list[int] = batch.tolist()[i]
tokens = list(map(lambda x: uknonw_token if x > tokenizer.vocabulary_size else x, tokens))
strings.append(tokenizer.decode(tokens))
return strings

View File

@@ -0,0 +1,100 @@
import evaluate as eval
BLEU = eval.load("bleu")
ROUGE = eval.load("rouge")
METEOR = eval.load("meteor")
def precision(ref: list[int], pred: list[int]):
metric = eval.load("precision")
return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)
def recall(ref: list[int], pred: list[int]):
metric = eval.load("recall")
return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)
def accuracy(ref: list[int], pred: list[int]):
metric = eval.load("accuracy")
return metric.compute(predictions=pred, references=ref)
def meteor(ref: list[str], pred: list[str]):
metric = METEOR
return metric.compute(predictions=pred, references=ref)
def bleu(ref: list[str], pred: list[str]):
metric = BLEU
return metric.compute(predictions=pred, references=ref)
def rouge(ref: list[str], pred: list[str]):
metric = ROUGE
return metric.compute(predictions=pred, references=ref)
def f1(precision: float, recall: float):
divisor = max((precision + recall), 1E-5)
return (2 * recall * precision) / divisor
def average(array: list[float]):
return sum(array) / len(array)
def rdf2txt(ref: list[str], pred: list[str]):
b_m = bleu(ref, pred)
r_m = rouge(ref, pred)
m_m = meteor(ref, pred)
return (b_m, r_m, m_m)
def txt2rdf(ref: list[int], pred: list[int]):
p_m = precision(ref, pred)
r_m = recall(ref, pred)
return (p_m, r_m)
def rdf_completion_1(ref: list[int], pred: list[int]):
a_m = accuracy(ref, pred)
return a_m
def rdf_completion_2(ref: list[int], pred: list[int]):
p_m = precision(ref, pred)
r_m = recall(ref, pred)
return (p_m, r_m)
def remove_padding(seq: list[int], pad_token: int, end_token: int):
clean_seq = list(filter(lambda x: x != pad_token, seq))
if clean_seq[-1] == end_token:
return clean_seq
clean_seq.append(
end_token
)
return clean_seq
def balance_paddings(seq_1: list[int], seq_2: list[int], pad_token: int):
SEQ_1_LEN = len(seq_1)
SEQ_2_LEN = len(seq_2)
if SEQ_1_LEN > SEQ_2_LEN:
PAD = [pad_token] * (SEQ_1_LEN - SEQ_2_LEN)
seq_2.extend(PAD)
if SEQ_2_LEN > SEQ_1_LEN:
seq_2 = seq_2[:SEQ_1_LEN]
return (seq_1, seq_2)

View File

@@ -0,0 +1,72 @@
import torch
from Project_Model.Libs.Embedder import NanoSocratesEmbedder
from Project_Model.Libs.Transformer import TrainingModel,NanoSocratesCore, NanoSocraDecoder, NanoSocratEncoder, DeToken, Encoder, Decoder
from .ModelType import ModelType
def decompose_nano_socrates(
model: TrainingModel | NanoSocratesCore , vocabulary_size: int, embedding_size: int
) -> tuple[TrainingModel | NanoSocratesCore, NanoSocratEncoder, NanoSocraDecoder]:
encoder_pieces, decoder_pieces = model.take_pieces()
encoder_embedder, encoder, encoder_detokener = encoder_pieces
decoder_embedder, decoder, decoder_detokener = decoder_pieces
return (
model,
NanoSocratEncoder(encoder_embedder, encoder, encoder_detokener),
NanoSocraDecoder(decoder_embedder, decoder, decoder_detokener),
)
def train2inference(
train_model: TrainingModel,
inference_model: NanoSocratesCore
) -> NanoSocratesCore:
encoder_pieces, decoder_pieces = train_model.take_pieces()
enc_emb, encoder, enc_det = encoder_pieces
dec_emb, decoder, dec_det = decoder_pieces
inference_model.load_pieces(
enc_emb,
dec_emb,
encoder,
decoder,
enc_det,
dec_det
)
return inference_model
def create_standalone_model(
model_type: ModelType,
vocabulary_size: int,
latent_space: int = 256,
feed_forward_multiplier: int = 4,
attention_heads: int = 4,
layer_number: int = 2,
) -> NanoSocratEncoder | NanoSocraDecoder:
feed_forward_latent_space = latent_space * feed_forward_multiplier
embedder = NanoSocratesEmbedder(vocabulary_size, latent_space)
detokener = DeToken(latent_space, vocabulary_size)
if model_type == ModelType.ENCODER_ONLY:
TMP_ENCODERS = [
Encoder(latent_space, feed_forward_latent_space, attention_heads)
] * layer_number
encoder = torch.nn.Sequential(*TMP_ENCODERS)
return NanoSocratEncoder(embedder, encoder, detokener)
TMP_DECODERS = [
Decoder(latent_space, feed_forward_latent_space, attention_heads)
] * layer_number
decoder = torch.nn.Sequential(*TMP_DECODERS)
return NanoSocraDecoder(embedder, decoder, detokener)

View File

@@ -2,3 +2,4 @@ from . import BPE
from . import Embedder from . import Embedder
from . import Transformer from . import Transformer
from . import TorchShims from . import TorchShims
from . import TransformerUtils

File diff suppressed because it is too large Load Diff

Binary file not shown.

Binary file not shown.