88 Commits

Author SHA1 Message Date
784c1fdc9e Merge pull request 'dev.train' (#8) from dev.train into dev
Reviewed-on: #8
2025-10-17 22:20:14 +02:00
Christian Risi
b79521995c Last fixes 2025-10-17 22:17:24 +02:00
GassiGiuseppe
cf3e35121b Merge branch 'dev.etl' into dev 2025-10-17 22:15:21 +02:00
Christian Risi
540b78204c Added epochs 2025-10-17 17:06:42 +02:00
GassiGiuseppe
86a063591e Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-17 16:38:12 +02:00
GassiGiuseppe
f33d4f1db6 Added a loss_saver file to save the losses 2025-10-17 16:37:44 +02:00
Christian Risi
fe62b1edd5 Fixed evaluation 2025-10-16 20:05:35 +02:00
Christian Risi
892f91aad7 Fixes for evaluation 2025-10-16 19:20:23 +02:00
Christian Risi
9ff117f437 Adding best model 2025-10-16 19:20:09 +02:00
Christian Risi
83693f1d4e Fixed a patience bug and added label smoothing 2025-10-14 11:03:15 +02:00
Christian Risi
0b256001fe Changed position to reflect other datasets 2025-10-14 10:51:11 +02:00
Christian Risi
7585f556f8 Added more logging 2025-10-14 10:41:28 +02:00
Christian Risi
4968d79403 Fixed a masking problem 2025-10-14 10:34:14 +02:00
GassiGiuseppe
80fd7fd600 evaluator WIP 2025-10-12 22:59:07 +02:00
GassiGiuseppe
972a73758d added holdout for curated dataset 2025-10-12 19:06:09 +02:00
GassiGiuseppe
b38a011105 added curated dataset (8000 entries) 2025-10-12 19:01:28 +02:00
GassiGiuseppe
2bdcd78622 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-12 18:20:04 +02:00
GassiGiuseppe
7dedbc481b evaluator WIP 2025-10-12 18:18:20 +02:00
Christian Risi
76345f8d4f Fixed a visual bug 2025-10-12 16:42:59 +02:00
GassiGiuseppe
2ccec9efb8 typo 2025-10-12 16:41:06 +02:00
GassiGiuseppe
e2231eb3b9 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-12 16:36:09 +02:00
GassiGiuseppe
144f8724d6 Updated the batcher to fix a bug in the task 4 construction 2025-10-12 16:35:42 +02:00
Christian Risi
07130ff489 Fixed several bugs for task 4 2025-10-12 16:30:30 +02:00
Christian Risi
e0f8a36aa5 Added support for fast resuming 2025-10-12 13:53:07 +02:00
Christian Risi
37a2501a79 Added a way to load checkpoints 2025-10-12 12:28:24 +02:00
Christian Risi
4ca1d0a189 Activated Dropout to avoid overfitting 2025-10-12 12:28:06 +02:00
GassiGiuseppe
856c693650 Added possibility to whitelist relationships 2025-10-12 12:26:26 +02:00
Christian Risi
f463f699cf Fixed a bug over task 4 2025-10-12 12:22:38 +02:00
Christian Risi
ab3d68bc13 fixed patience not quitting 2025-10-12 01:41:34 +02:00
Christian Risi
79438e3d30 Fixed Patience system 2025-10-12 01:22:06 +02:00
Christian Risi
f98f5a2611 Fixed misprint in task 3 2025-10-12 01:16:09 +02:00
Christian Risi
4281f8724b Fixed Validation loss 2025-10-12 00:57:24 +02:00
Christian Risi
71d602e36e Fixed a memory bug 2025-10-12 00:47:20 +02:00
Christian Risi
46ee6055ec Added Colab default values 2025-10-12 00:15:54 +02:00
Christian Risi
e579e1c88b fixed verbosity 2025-10-12 00:15:15 +02:00
Christian Risi
f51ada866f Added verbosity level 2025-10-12 00:13:03 +02:00
Christian Risi
acd978cbc5 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-12 00:05:36 +02:00
Christian Risi
56fbadd55e Fixed training 2025-10-12 00:05:30 +02:00
GassiGiuseppe
14f1c574e7 typo batch size 2025-10-11 22:11:53 +02:00
Christian Risi
d8e65bfb8a Fixed a bug about mismatched batch sizes 2025-10-11 22:09:46 +02:00
Christian Risi
bcc2fe7368 Fixed bugs and added visibility 2025-10-11 21:49:29 +02:00
Christian Risi
160b7dbfc0 V0.0.1 Athene 2025-10-11 19:35:43 +02:00
GassiGiuseppe
49946727d8 updated decoder_input to work without embedder 2025-10-11 16:53:36 +02:00
GassiGiuseppe
e9d30b3cea add divide method to create hold out dataset 2025-10-11 16:49:36 +02:00
GassiGiuseppe
1649cd7768 added decoder_input method to build the batch tensor fed as input to the decoder 2025-10-11 16:18:43 +02:00
GassiGiuseppe
443f54fffd WIP decoder with prefix mask 2025-10-11 15:31:43 +02:00
GassiGiuseppe
ff721107b9 typo 2025-10-11 15:26:58 +02:00
GassiGiuseppe
f1886e5be1 added builder for prefix mask 2025-10-11 15:19:09 +02:00
GassiGiuseppe
5e3878ea17 Merge branch 'dev' into dev.train 2025-10-11 11:51:58 +02:00
GassiGiuseppe
79d3fb9ff8 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-10-11 11:51:19 +02:00
GassiGiuseppe
586f021276 Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train 2025-10-11 11:28:35 +02:00
GassiGiuseppe
82462078f8 WIP for the new prefix mask 2025-10-11 11:28:15 +02:00
Christian Risi
625f79f7c3 Fixed imports 2025-10-11 11:18:44 +02:00
GassiGiuseppe
3446870291 typo 2025-10-10 22:27:01 +02:00
GassiGiuseppe
e76dbeb9a7 typo 2025-10-10 22:26:06 +02:00
GassiGiuseppe
96610612fe Batcher added 2025-10-10 20:10:08 +02:00
Christian Risi
92ae40013d Added a way to detach models and create them standalone 2025-10-10 18:43:20 +02:00
Christian Risi
15f203cad5 Added BPE 16k-token vocabulary 2025-10-10 18:43:02 +02:00
Christian Risi
31c8541dfb Co-authored-by: GassiGiuseppe <GassiGiuseppe@users.noreply.github.com> 2025-10-10 16:28:09 +02:00
Christian Risi
bed9718f27 Added BPE small vocabulary 2025-10-10 11:40:39 +02:00
GassiGiuseppe
93865bee8a typo 2025-10-09 22:26:17 +02:00
Christian Risi
1c0ddb8753 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 22:23:36 +02:00
Christian Risi
51399f9dc9 commit of toy dataset with whole batch 2025-10-09 22:22:42 +02:00
GassiGiuseppe
d1ba4ae026 last update for Colab
(we are going to run it on an A100, yay)
2025-10-09 21:57:05 +02:00
Christian Risi
db0090981c Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 21:53:45 +02:00
Christian Risi
e1c5649d67 updated to overfit over toy dataset 2025-10-09 21:53:42 +02:00
GassiGiuseppe
0bca241662 update environment yaml 2025-10-09 20:53:45 +02:00
GassiGiuseppe
005d7af6a0 lil update of requirements 2025-10-09 20:30:06 +02:00
GassiGiuseppe
9068db550e Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 19:44:46 +02:00
GassiGiuseppe
d8f81e1a47 may God have mercy upon us 2025-10-09 19:43:50 +02:00
Christian Risi
a67df9724e Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 18:14:33 +02:00
Christian Risi
c5fd57d854 Updated train playground 2025-10-09 18:14:29 +02:00
GassiGiuseppe
ee253c39f4 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-09 13:31:37 +02:00
GassiGiuseppe
2036b4015f added logistic collector 2025-10-09 13:31:16 +02:00
Christian Risi
aac7675b30 Pipeline fix and added a util to decode 2025-10-09 13:24:48 +02:00
GassiGiuseppe
d2fdeb18a2 bla bla doctor 2025-10-09 12:41:47 +02:00
Christian Risi
f3b83eda3d Rework 2025-10-09 11:37:46 +02:00
Christian Risi
0158db2dce Fixed a bug where I took encoder embeddings rather than encoder output 2025-10-09 11:37:21 +02:00
Christian Risi
ba592c3480 Disabled Softmax 2025-10-09 11:36:56 +02:00
Christian Risi
1f9c30b531 Added Custom Learning Rate 2025-10-09 11:36:40 +02:00
GassiGiuseppe
ee12f53f12 Added EOS token 2025-10-07 22:47:59 +02:00
GassiGiuseppe
a04f4c7cb7 changes to shorten the dataset 2025-10-07 15:49:25 +02:00
GassiGiuseppe
a93e61b8c1 Update ETL 2025-10-07 00:54:00 +02:00
GassiGiuseppe
0373460105 Movie filters updated 2025-10-06 10:57:50 +02:00
GassiGiuseppe
7307916891 update sql_endpoint to work with the new pipeline 2025-10-05 14:58:03 +02:00
GassiGiuseppe
acb43fc899 new faster pipeline 2025-10-05 14:57:45 +02:00
GassiGiuseppe
255d801a80 updated the mask in rdf_mask_task;
however, since the model will build the mask itself, it is deprecated
2025-10-05 14:56:33 +02:00
GassiGiuseppe
2bd24ec278 Created legacy folder for the old pipeline.
This pipeline still works but is slower than the new one;
some of its methods can be reused later.
2025-10-05 14:54:32 +02:00
75 changed files with 4208 additions and 6223 deletions

Binary files changed (tracked with Git LFS; for most of them the diff shows only the LFS pointer, or "Binary file not shown."). All pointers carry version https://git-lfs.github.com/spec/v1; the recoverable metadata:

oid sha256:6e0a193f90f2b0efc5185b0db9555178b172268b3eab289225b894ac1493493f, size 2471083
oid sha256:dd309865b60df86f63f76341858e382a8423297ec63eb6f525ccd28b62caf486, size 2494589
oid sha256:2949f2e9c6ae2b4784e04405dd7f5a3ec2eb65537b421fdc6751e9d5a19af41d, size 19527224
oid sha256:bc28507d806df96d6c953fbba1999f62a55e26025001de5135892069df05b9bc, size 22021103
oid sha256:176f13b63859c4dc0ca42b94d875aa82b74ad1cd88a186c439ef5444f45ed715, size 24455751
oid sha256:206f83b88b442f617575985ac88f4241071fa1b7d66b5935405178051511a369, size 1344466
oid sha256:6914b6b1f8f06f8cf73b96b9c27bf556f1ee93256f435b7da0be0df2af093d05, size 1334675
oid sha256:41e92da8af52ca1c83334ebea7312c63d37fdeacde425ba91b78f44a56e4fb88, size 10568092

Assets/Model/curated/log_loss.csv (LFS, new file): oid sha256:203b6cb364cf95cbb6cc0ebbff9e8b80e80dda73ff210ad91edeedf6024f6ab1, size 2876
Assets/Model/small/bpe-small.json (LFS, new file): binary content not shown
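For context, a Git LFS pointer is the small text stub that gets committed in place of the binary; on checkout, Git LFS resolves the oid against its object store and downloads the real file. Written out in full, the first entry above corresponds to a pointer file of exactly this form:

version https://git-lfs.github.com/spec/v1
oid sha256:6e0a193f90f2b0efc5185b0db9555178b172268b3eab289225b894ac1493493f
size 2471083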

Playgrounds/doctor.ipynb (new file, 193 lines)

@@ -0,0 +1,193 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ddfb4457",
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE.",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 126\u001b[39m\n\u001b[32m 124\u001b[39m \u001b[38;5;66;03m# sanity guard (helps debug vocab mismatches fast)\u001b[39;00m\n\u001b[32m 125\u001b[39m max_seen = tgt[:, :Tp].max().item()\n\u001b[32m--> \u001b[39m\u001b[32m126\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m max_seen < V \u001b[38;5;129;01mor\u001b[39;00m (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n\u001b[32m 127\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtarget id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmax_seen\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m >= V (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mV\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m). Fix TOKEN_SPACE_SIZE.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 129\u001b[39m \u001b[38;5;66;03m# CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\u001b[39;00m\n\u001b[32m 130\u001b[39m loss_t = cross_entropy(\n\u001b[32m 131\u001b[39m logits_btV.reshape(-\u001b[32m1\u001b[39m, V), \u001b[38;5;66;03m# [B*(t+1), V]\u001b[39;00m\n\u001b[32m 132\u001b[39m tgt[:, :Tp].reshape(-\u001b[32m1\u001b[39m) \u001b[38;5;66;03m# [B*(t+1)]\u001b[39;00m\n\u001b[32m 133\u001b[39m )\n",
"\u001b[31mAssertionError\u001b[39m: target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE."
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"from Project_Model.Libs.Training.learning_rade_shedulers import CustomLR\n",
"from Project_Model.Libs.Training.logistic_collector import LogitsCollector # external collector\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"# Constants (TEMP size; will be corrected after dataset scan below)\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e4)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
" output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" decoder_default_tokens = Transformer.pad_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
" ) # pad with PAD up to SENTENCE_LENGTH\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# fix V to cover ALL ids (including specials) # <- important\n",
"max_enc_id = max(max(row) for row in TOY_BATCH_INPUT_LIST) if TOY_BATCH_INPUT_LIST else 0\n",
"max_tgt_id = max(max(row) for row in TOY_BATCH_TARGET_LIST) if TOY_BATCH_TARGET_LIST else 0\n",
"TOKEN_SPACE_SIZE = max(TOKEN_SPACE_SIZE, max(PAD_TOKEN, END_TOKEN, max_enc_id, max_tgt_id) + 1)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS,\n",
")\n",
"\n",
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
"\n",
"NANOSOCRATES.train()\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters(), lr=1.0) # base lr works as factor\n",
"scheduler = CustomLR(optimizer, EMBEDDED_SIZE, warmup_steps=4000, factor=1.0) # step each optimizer step\n",
"\n",
"current_epoch = 0\n",
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
" # simple fixed mini-batch from the top; later you can shuffle/slice\n",
" enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
" pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
" tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
"\n",
" # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
" dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
"\n",
" total_loss = 0.0\n",
" collector.reset() # start fresh for this epoch\n",
"\n",
" T = tgt.size(1) # sequence length\n",
" for t in range(T):\n",
" # skip all-PAD steps to avoid CE divide-by-zero late in the sequence\n",
" if (tgt[:, t] == PAD_TOKEN).all(): # all PAD at this timestep\n",
" break\n",
"\n",
" optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
"\n",
" prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
" dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
"\n",
" # now decoder returns all steps up to t -> [B, t+1, V]\n",
" logits_btV: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # full logits for learning\n",
" collector.add(logits_btV) # collector will take the last step\n",
"\n",
" Tp = logits_btV.size(1) # t+1\n",
" V = logits_btV.size(-1) # vocab size\n",
"\n",
" # sanity guard (helps debug vocab mismatches fast)\n",
" max_seen = tgt[:, :Tp].max().item()\n",
" assert max_seen < V or (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n",
" f\"target id {max_seen} >= V ({V}). Fix TOKEN_SPACE_SIZE.\"\n",
"\n",
" # CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\n",
" loss_t = cross_entropy(\n",
" logits_btV.reshape(-1, V), # [B*(t+1), V]\n",
" tgt[:, :Tp].reshape(-1) # [B*(t+1)]\n",
" )\n",
"\n",
" loss_t.backward() # backprop for this step\n",
" optimizer.step() # update params\n",
" scheduler.step() # Noam/warmup: step per optimizer step\n",
"\n",
" total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
"\n",
" # teacher forcing: reveal the correct token for next position\n",
" if t < T - 1:\n",
" dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
"\n",
" current_epoch += 1\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
" collector.print_decoded() # print decoded predictions for the batch\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
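A note on the scheduler used in the notebook above: CustomLR (from Project_Model.Libs.Training.learning_rade_shedulers) is constructed with the model width and warmup_steps=4000, stepped once per optimizer step, and the AdamW base lr of 1.0 is described in the inline comment as acting as a factor. Those are the ingredients of the standard Transformer "Noam" schedule; assuming CustomLR implements it (its source is not part of this diff), the learning rate at step $s$ would be

$$\mathrm{lr}(s) = \text{factor} \cdot d_{\text{model}}^{-0.5} \cdot \min\left(s^{-0.5},\; s \cdot \text{warmup\_steps}^{-1.5}\right)$$

which rises linearly over the first 4000 steps and then decays as $s^{-0.5}$.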


Playgrounds/evaluation.py (new file, 263 lines)

@@ -0,0 +1,263 @@
import random
import torch
from pathlib import Path
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
# Get paths
MODEL_DIR = "Assets/Model/curated"
# MODEL_DIR= "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
# TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
MODEL_PATH = Path(f"{MODEL_DIR}/NanoSocrates.zip")
# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<EOS>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
CONTINUTE_TOKEN = TOKENANO.encode("<CONTINUERDF>")[0]
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
    VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER, debug=True)
# Model
NANOSOCRATES_TRAIN = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)
NANOSOCRATES = Transformer.NanoSocratesCore(
    TOKEN_SPACE_SIZE,
    SENTENCE_LENGTH,
    SOS_TOKEN,
    PAD_TOKEN,
    END_TOKEN,
    CONTINUTE_TOKEN,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)
if MODEL_PATH.is_file():
    nanosocrates_dict = torch.load(MODEL_PATH, weights_only=True, map_location=DEVICE)
    NANOSOCRATES_TRAIN.load_state_dict(nanosocrates_dict)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
    NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
NANOSOCRATES = TUtils.train2inference(
    NANOSOCRATES_TRAIN,
    NANOSOCRATES,
)
NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
NANOSOCRATES_TRAIN.eval()
task_1_metrics = []
task_2_metrics = []
task_3_metrics = []
task_4_metrics = []
example_num = 0
with torch.no_grad():
    for example in TEST_BATCHER.batch(1):
        print(f"DOING Example: {example_num}")
        src_x, tgt_y, pad_x, pad_y, tasktype = example
        enc_x = torch.tensor(src_x)
        ACTUAL_BATCH_SIZE, _ = enc_x.shape
        enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
        tgt = torch.tensor(tgt_y)
        tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
        dec_x = Transformer.get_decoder_input(
            ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
        )
        dec_x[:, 1:] = tgt[:, :-1]
        dec_x_pad = dec_x.eq(PAD_TOKEN)
        out: torch.Tensor = NANOSOCRATES.inference((enc_x, enc_x_pad), tasktype)
        tokens: list[int] = out.tolist()[0]
        tokens.append(END_TOKEN)
        tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, tokens))
        out_string = TOKENANO.decode(tokens)
        exp_tokens: list[int] = tgt_y[0]
        exp_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, exp_tokens))
        exp_string = TOKENANO.decode(exp_tokens)
        enc_tokens: list[int] = src_x[0]
        enc_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, enc_tokens))
        enc_string = TOKENANO.decode(enc_tokens)
        print(f"PROMPT:\n{enc_string}")
        print(f"EXPECTED:\n{exp_string}")
        print(f"ACTUAL:\n{out_string}")
        if tasktype == Batch.TaskType.RDF2TXT:
            example_num += 1
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref_str = TOKENANO.decode(ref)
            pred_str = TOKENANO.decode(pred)
            bleu, rouge, meteor = TUtils.rdf2txt([ref_str], [pred_str])
            task_1_metrics.append(
                [bleu["bleu"], rouge["rougeL"], meteor["meteor"]]  # type: ignore
            )
        if tasktype == Batch.TaskType.TEXT2RDF:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens[1:], PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
            precision, recall = TUtils.txt2rdf(ref, pred)
            task_2_metrics.append(
                [precision["precision"], recall["recall"]]  # type: ignore
            )
        if tasktype == Batch.TaskType.MASKING:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
            accuracy = TUtils.accuracy(ref, pred)
            task_3_metrics.append(accuracy["accuracy"])  # type: ignore
        if tasktype == Batch.TaskType.COMPLETATION:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)
            precision, recall = TUtils.txt2rdf(ref, pred)
            task_4_metrics.append(
                [precision["precision"], recall["recall"]]  # type: ignore
            )
bleus = [row[0] for row in task_1_metrics]
rouges = [row[1] for row in task_1_metrics]
meteors = [row[2] for row in task_1_metrics]
prec_1 = [row[0] for row in task_2_metrics]
rec_1 = [row[1] for row in task_2_metrics]
acc = task_3_metrics
prec_2 = [row[0] for row in task_4_metrics]
rec_2 = [row[1] for row in task_4_metrics]
BLEU = TUtils.average(bleus)
ROUGE = TUtils.average(rouges)
METEOR = TUtils.average(meteors)
PREC_1 = TUtils.average(prec_1)
REC_1 = TUtils.average(rec_1)
F1_1 = TUtils.f1(PREC_1, REC_1)
ACC = TUtils.average(acc)
PREC_2 = TUtils.average(prec_2)
REC_2 = TUtils.average(rec_2)
F1_2 = TUtils.f1(PREC_2, REC_2)
SEPARATOR = "**************************************************************************"
OUTPUT = "".join([
    f"{SEPARATOR}\n",
    "*\tRDF2TXT:\n",
    f"*\t\tBLEU: {BLEU} - ROUGE: {ROUGE} - METEOR: {METEOR}\n",
    f"{SEPARATOR}\n",
    "*\tTXT2RDF:\n",
    f"*\t\tPRECISION: {PREC_1} - RECALL: {REC_1} - F1: {F1_1}\n",
    f"{SEPARATOR}\n",
    "*\tRDF Completion 1:\n",
    f"*\t\tACCURACY: {ACC}\n",
    f"{SEPARATOR}\n",
    "*\tRDF Completion 2:\n",
    f"*\t\tPRECISION: {PREC_2} - RECALL: {REC_2} - F1: {F1_2}\n",
    f"{SEPARATOR}\n",
])
print(OUTPUT)
print("\nDEBUG")
print(task_1_metrics)
print(task_2_metrics)
print(task_3_metrics)
print(task_4_metrics)
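TUtils.average and TUtils.f1 are not shown in this diff. A minimal sketch consistent with how they are called above (assumed helper implementations, not the project's actual code):

def average(values: list[float]) -> float:
    # Arithmetic mean of per-example scores; 0.0 when a task produced no examples.
    return sum(values) / len(values) if values else 0.0

def f1(precision: float, recall: float) -> float:
    # Harmonic mean of precision and recall, guarded against division by zero.
    if precision + recall == 0.0:
        return 0.0
    return 2.0 * precision * recall / (precision + recall)

With these definitions, F1_1 above is the harmonic mean of the averaged TXT2RDF precision and recall, i.e. it is computed after averaging rather than per example.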


@@ -0,0 +1,472 @@
import random
import sys
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
from Project_Model.Libs.Training.loss_saver import Log
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# Get paths
CHECKPOINT_DIR = "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
CHECKPOINT_PATH = Path(f"{CHECKPOINT_DIR}/NanoSocrates.zip")
NANO_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/nano_optim.zip")
ENC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/enc_optim.zip")
DEC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/dec_optim.zip")
LAST_EPOCH_PATH = Path(f"{CHECKPOINT_DIR}/last_epoch.txt")
# log saver:
loss_saver = Log(f"{CHECKPOINT_DIR}/log_loss.csv")
# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = int(300)
PRETRAIN_EPOCHS = int(20)
WARMUP_EPOCHS = int(30)
MINI_BATCH_SIZE = 20
VALIDATION_STEPS = 10
CHECKPOINT_STEPS = VALIDATION_STEPS
PATIENCE = 4
CURRENT_EPOCH = -1 if not LAST_EPOCH_PATH.is_file() else int(LAST_EPOCH_PATH.read_text())
VERBOSE = False
LEARNING_RATE = 0.05
LABEL_SMOOTHING = 0.01
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
    VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
# Model
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)
if CHECKPOINT_PATH.is_file():
    nanosocrates_dict = torch.load(CHECKPOINT_PATH, weights_only=True)
    NANOSOCRATES.load_state_dict(nanosocrates_dict)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
    NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
# Training constants
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
encoder_ce = torch.nn.CrossEntropyLoss(label_smoothing=LABEL_SMOOTHING)
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters(), LEARNING_RATE)
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters(), LEARNING_RATE)
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters(), LEARNING_RATE)
if NANO_OPTIM_PATH.is_file():
    optim_dict = torch.load(NANO_OPTIM_PATH)
    nano_optim.load_state_dict(optim_dict)
if ENC_OPTIM_PATH.is_file():
    optim_dict = torch.load(ENC_OPTIM_PATH)
    encoder_only_optim.load_state_dict(optim_dict)
if DEC_OPTIM_PATH.is_file():
    optim_dict = torch.load(DEC_OPTIM_PATH)
    decoder_only_optim.load_state_dict(optim_dict)
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH)
encoder_only_scheduler = Transformer.WarmupLR(
    encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
)
decoder_only_scheduler = Transformer.WarmupLR(
    decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
)
current_epoch = CURRENT_EPOCH + 2
patience = 0
average_loss_validation = {
    "txt": float("inf"),
    "encoder_only": float("inf"),
    "decoder_only": float("inf"),
}
while current_epoch < MAX_EPOCHS:
    NANOSOCRATES.train()
    ENCODER_ONLY.train()
    DECODER_ONLY.train()
    text_batch_losses = []
    encoder_batch_losses = []
    decoder_batch_losses = []
    batch_counter = 0
    if VERBOSE:
        print(f"EPOCH {current_epoch} STARTING")
    for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
        batch_counter += 1
        src_x, tgt_y, pad_x, pad_y, tasktype = batch
        enc_x = torch.tensor(src_x)
        ACTUAL_BATCH_SIZE, _ = enc_x.shape
        enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
        tgt = torch.tensor(tgt_y)
        tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
        dec_x = Transformer.get_decoder_input(
            ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
        )
        dec_x[:, 1:] = tgt[:, :-1]
        dec_x_pad = dec_x.eq(PAD_TOKEN)
        if VERBOSE:
            for s in TUtils.decode_batch(enc_x, TOKENANO, MASK_TOKEN):
                print("Input")
                print(s)
            for s in TUtils.decode_batch(enc_x_pad, TOKENANO, MASK_TOKEN):
                print("Encoder Padding mask")
                print(s)
            for s in TUtils.decode_batch(tgt, TOKENANO, MASK_TOKEN):
                print("Desired Output")
                print(s)
            a_dx = dec_x[:, :]
            a_dx[:, -1] = END_TOKEN
            for s in TUtils.decode_batch(a_dx, TOKENANO, MASK_TOKEN):
                print("Decoder Input")
                print(s)
        if VERBOSE:
            print(f"\tBATCH {batch_counter} Starting")
        # Task 1 and Task 2
        if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:
            if VERBOSE:
                print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
            nano_optim.zero_grad()
            pred_logits: torch.Tensor = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
            pred_logits = pred_logits.permute(0, 2, 1)
            loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt)
            loss.backward()
            nano_optim.step()
            text_batch_losses.append(loss)
            continue
        # Pretrain first
        if current_epoch < PRETRAIN_EPOCHS:
            continue
        # Task 3
        if tasktype == Batch.TaskType.MASKING:
            if VERBOSE:
                print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
            encoder_only_optim.zero_grad()
            pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
            pred_logits = pred_logits.permute(0, 2, 1)
            # print(torch.max(tgt))
            loss: torch.Tensor = encoder_ce(pred_logits, tgt)
            loss.backward()
            encoder_only_optim.step()
            exp_tokens: list[int] = tgt_y[0]
            exp_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, exp_tokens))
            exp_string = TOKENANO.decode(exp_tokens)
            enc_tokens: list[int] = src_x[0]
            enc_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, enc_tokens))
            enc_string = TOKENANO.decode(enc_tokens)
            print(f"PROMPT:\n{enc_string}")
            print(f"EXPECTED:\n{exp_string}")
            encoder_batch_losses.append(loss.item())
            continue
        # Task 4
        if tasktype == Batch.TaskType.COMPLETATION:
            if VERBOSE:
                print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
            decoder_only_optim.zero_grad()
            pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
            pred_logits = pred_logits.permute(0, 2, 1)
            loss: torch.Tensor = decoder_ce(pred_logits, tgt)
            loss.backward()
            decoder_only_optim.step()
            decoder_batch_losses.append(loss)
            continue
    nano_scheduler.step()
    encoder_only_scheduler.step()
    decoder_only_scheduler.step()
    current_epoch += 1
    if current_epoch % VALIDATION_STEPS == 0:
        NANOSOCRATES.eval()
        ENCODER_ONLY.eval()
        DECODER_ONLY.eval()
        with torch.no_grad():
            txt_avg_batch_losses = []
            enc_avg_batch_losses = []
            dec_avg_batch_losses = []
            for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
                src_x, tgt_y, pad_x, pad_y, tasktype = batch
                enc_x = torch.tensor(src_x)
                ACTUAL_BATCH_SIZE, _ = enc_x.shape
                enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
                tgt = torch.tensor(tgt_y)
                tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
                dec_x = Transformer.get_decoder_input(
                    ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
                )
                dec_x[:, 1:] = tgt[:, :-1]
                dec_x_pad = dec_x.eq(PAD_TOKEN)
                # Task 1 and Task 2
                if (
                    tasktype == Batch.TaskType.RDF2TXT
                    or tasktype == Batch.TaskType.TEXT2RDF
                ):
                    pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)
                    loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt)
                    txt_avg_batch_losses.append(loss)
                    continue
                # Pretrain first
                if current_epoch <= PRETRAIN_EPOCHS:
                    continue
                # Task 3
                if tasktype == Batch.TaskType.MASKING:
                    pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)
                    loss: torch.Tensor = encoder_ce(pred_logits, tgt)
                    enc_avg_batch_losses.append(loss.item())
                    continue
                # Task 4
                if tasktype == Batch.TaskType.COMPLETATION:
                    pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
                    pred_logits = pred_logits.permute(0, 2, 1)
                    loss: torch.Tensor = decoder_ce(pred_logits, tgt)
                    dec_avg_batch_losses.append(loss)
                    continue
        txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)
        enc_avg_loss = float("inf")
        dec_avg_loss = float("inf")
        if current_epoch > PRETRAIN_EPOCHS:
            enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)
            dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)
        if current_epoch < PRETRAIN_EPOCHS:
            if txt_avg_loss < average_loss_validation["txt"]:
                average_loss_validation["txt"] = txt_avg_loss
            else:
                patience += 1
                if VERBOSE:
                    print(f"losing a patience, current irritation: {patience}")
        else:
            counter = 0
            if txt_avg_loss > average_loss_validation["txt"]:
                if VERBOSE:
                    print("txt average is higher than lowest")
                counter += 1
            else:
                average_loss_validation["txt"] = txt_avg_loss
            if enc_avg_loss > average_loss_validation["encoder_only"]:
                if VERBOSE:
                    print("masking average is higher than lowest")
                counter += 1
            else:
                average_loss_validation["encoder_only"] = enc_avg_loss
            if dec_avg_loss > average_loss_validation["decoder_only"]:
                if VERBOSE:
                    print("decoding only average is higher than lowest")
                counter += 1
            else:
                average_loss_validation["decoder_only"] = dec_avg_loss
            if counter > 1:
                patience += 1
                if VERBOSE:
                    print(f"losing a patience, current irritation: {patience}")
            if counter == 0:
                patience = max(0, patience - 1)
                if VERBOSE:
                    print(f"all good, gaining a patience, current irritation: {patience}")
        txt_train_avg_loss = sum(text_batch_losses) / len(text_batch_losses)
        enc_avg_train_loss = float("inf")
        dec_avg_train_loss = float("inf")
        if current_epoch > PRETRAIN_EPOCHS:
            try:
                enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)
                dec_avg_train_loss = sum(decoder_batch_losses) / len(decoder_batch_losses)
            except ZeroDivisionError:
                pass
        # write to the loss log
        loss_saver.write([current_epoch, txt_train_avg_loss, enc_avg_train_loss, dec_avg_train_loss, txt_avg_loss, enc_avg_loss, dec_avg_loss])
        SEPARATOR = "================================================================================================================"
        DEBUG_TEXT = "".join(
            [
                f"{SEPARATOR}\n",
                f"EPOCH {current_epoch}\n",
                f"{SEPARATOR}\n",
                "Train Losses:\n",
                "\tAvg Losses:\n",
                f"\t\tavg_txt: {txt_train_avg_loss} - avg_enc: {enc_avg_train_loss} - avg_dec: {dec_avg_train_loss}\n",
                f"{SEPARATOR}\n",
                "Validation Losses:\n",
                f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction_loss: {dec_avg_loss}\n",
                f"{SEPARATOR}\n",
            ]
        )
        print(DEBUG_TEXT)
        # Warn about patience
        if patience == PATIENCE:
            print("Model is likely overfitting, so let's stop here")
        # SAVE MODEL
        if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
            print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
            torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
            torch.save(nano_optim.state_dict(), NANO_OPTIM_PATH)
            torch.save(encoder_only_optim.state_dict(), ENC_OPTIM_PATH)
            torch.save(decoder_only_optim.state_dict(), DEC_OPTIM_PATH)
            with open(LAST_EPOCH_PATH, "w", encoding="utf-8") as last_epoch_file:
                last_epoch_file.write(f"{current_epoch}")
        if patience == PATIENCE:
            exit(0)
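Transformer.WarmupLR itself is not part of this diff. A minimal sketch of a Noam-style warmup scheduler matching the call signature used above (optimizer, warmup steps, model width, last_epoch); this is an assumed implementation, not the project's actual class:

import torch

class WarmupLRSketch(torch.optim.lr_scheduler._LRScheduler):
    """Noam-style schedule: linear warmup, then inverse-square-root decay."""

    def __init__(self, optimizer, warmup_steps: int, d_model: int, last_epoch: int = -1):
        self.warmup_steps = warmup_steps
        self.d_model = d_model
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        step = max(1, self._step_count)  # the base class increments this on every step()
        scale = (self.d_model ** -0.5) * min(step ** -0.5, step * self.warmup_steps ** -1.5)
        # The optimizer's base lr (LEARNING_RATE above) acts as a multiplicative factor.
        return [base_lr * scale for base_lr in self.base_lrs]

Note that this script steps its schedulers once per epoch, so with WARMUP_EPOCHS = 30 the warmup here is measured in epochs rather than in optimizer steps.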


@@ -11,13 +11,7 @@
"output_type": "stream",
"text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n",
"252.87s - name 'tensor' is not defined\n",
"Traceback (most recent call last):\n",
" File \"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\debugpy\\_vendored\\pydevd\\_pydevd_bundle\\pydevd_vars.py\", line 636, in change_attr_expression\n",
" value = eval(expression, frame.f_globals, frame.f_locals)\n",
" File \"<string>\", line 1, in <module>\n",
"NameError: name 'tensor' is not defined\n"
" return func(*args, **kwargs)\n"
]
},
{
@@ -25,15 +19,9 @@
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel."
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel. \n",
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
@@ -68,9 +56,9 @@
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"ATTENTION_HEADS = 8\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"NUMBER_OF_BLOCKS = 4\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"\n",
@@ -105,7 +93,26 @@
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
" output_tokens = TOKENANO.encode(RDFs)\n",
" input_tokens = TOKENANO.encode(Abstract)[1:]\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
@@ -124,7 +131,7 @@
")\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
"scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)\n",
"last_loss = 0\n",
"current_epoch = 0\n",
"\n",
@@ -132,32 +139,44 @@
"\n",
" optimizer.zero_grad()\n",
"\n",
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
" encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])\n",
" decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])\n",
" src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)\n",
"\n",
" # Transform target into logits\n",
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]])\n",
" target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])\n",
"\n",
" last_loss = 0\n",
" last_prediction: torch.Tensor\n",
"\n",
" for i in range(0, SENTENCE_LENGTH):\n",
"\n",
" optimizer.zero_grad()\n",
" tgt_padding = decoder_list.eq(PAD_TOKEN)\n",
"\n",
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
" logits: torch.Tensor = NANOSOCRATES((encoder_list, src_padding, decoder_list, tgt_padding))\n",
" prob = torch.softmax(logits, 2)\n",
"\n",
" most_probable_tokens = torch.argmax(logits, 2)\n",
" most_probable_tokens = torch.argmax(prob, 2)\n",
" last_prediction = most_probable_tokens\n",
"\n",
" logits = logits[:,i,:]\n",
" logits = logits[:,:i,:]\n",
" logits = logits.permute(0, 2, 1)\n",
"\n",
" loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])\n",
" # loss : torch.Tensor = cross_entropy(logits, target_logits)\n",
"\n",
" loss = cross_entropy(logits, target_logits[:,i])\n",
" last_loss = loss\n",
" loss.backward()\n",
" optimizer.step()\n",
" scheduler.step()\n",
"\n",
" if i < SENTENCE_LENGTH - 1:\n",
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
" decoder_list[:,i+1] = target_logits[:,i]\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" current_epoch += 1\n",
@@ -165,6 +184,14 @@
" if current_epoch % 1 == 0:\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
"\n",
" for encoded_sentence, expected_sentence in zip(\n",
" Transformer.tensor2token(last_prediction[:,:], END_TOKEN), # type: ignore\n",
" Transformer.tensor2token(target_logits[:,:], END_TOKEN)\n",
" ):\n",
" decoded_sentence = TOKENANO.decode(encoded_sentence)\n",
" decoded_target = TOKENANO.decode(expected_sentence)\n",
" print(f\"\\tACTUAL:\\n\\t\\t{decoded_sentence}\\n\\tEXPECTED:\\n\\t\\t{decoded_target}\\n\")\n",
"\n",
"\n",
"\n",
"\n",


@@ -0,0 +1,509 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "adbef43f",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n",
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\lr_scheduler.py:192: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate\n",
" warnings.warn(\n"
]
},
{
"ename": "IndexError",
"evalue": "list index out of range",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mIndexError\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 383\u001b[39m\n\u001b[32m 381\u001b[39m txt_min_train_losses = text_batch_losses[:][\u001b[32m0\u001b[39m]\n\u001b[32m 382\u001b[39m txt_avg_train_losses = text_batch_losses[:][\u001b[32m1\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m383\u001b[39m txt_max_train_losses = \u001b[43mtext_batch_losses\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 385\u001b[39m txt_min_loss = \u001b[38;5;28mmin\u001b[39m(txt_min_train_losses)\n\u001b[32m 386\u001b[39m txt_avg_min_loss = \u001b[38;5;28msum\u001b[39m(txt_min_train_losses) / \u001b[38;5;28mlen\u001b[39m(txt_min_train_losses)\n",
"\u001b[31mIndexError\u001b[39m: list index out of range"
]
}
],
"source": [
"import random\n",
"import sys\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TransformerUtils as TUtils\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"import Project_Model.Libs.Batch as Batch\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"\n",
"# set a default device\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"\n",
"# Get paths\n",
"VOCABULARY_PATH = Path(\"Assets/Model/small/bpe-small-16.json\")\n",
"TRAIN_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"VALIDATION_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TEST_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"CHECKPOINT_PATH = Path(\"Assets/Dataset/Tmp/NanoSocrates.zip\")\n",
"\n",
"\n",
"# BPE Init\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"MASK_EXTRA_SPACE = 25\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 8\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 4\n",
"MAX_EPOCHS = int(1e3)\n",
"PRETRAIN_EPOCHS = int(2)\n",
"WARMUP_EPOCHS = int(4e3)\n",
"MINI_BATCH_SIZE = 10\n",
"VALIDATION_STEPS = 1\n",
"CHECKPOINT_STEPS = VALIDATION_STEPS * 4\n",
"PATIENCE = 4\n",
"CURRENT_EPOCH = 0\n",
"\n",
"SOS_TOKEN = TOKENANO.encode(\"<SOS>\")[0]\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"SUBJ_TOKEN = TOKENANO.encode(\"<SUBJ>\")[0]\n",
"REL_TOKEN = TOKENANO.encode(\"<PRED>\")[0]\n",
"OBJ_TOKEN = TOKENANO.encode(\"<OBJ>\")[0]\n",
"\n",
"SPECIAL_TOKENS: set[int] = set(TOKENANO.encode(\"\".join(BPE.default_special_tokens())))\n",
"ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])\n",
"FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS\n",
"\n",
"\n",
"# Spanned_Masker\n",
"MASKER = Transformer.SpannedMasker(\n",
" TOKEN_SPACE_SIZE,\n",
" FORBIDDEN_TOKENS\n",
")\n",
"\n",
"TRAIN_BATCHER = Batch.Batcher(\n",
" TRAIN_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"VALIDATION_BATCHER = Batch.Batcher(\n",
" VALIDATION_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"TEST_BATCHER = Batch.Batcher(\n",
" TEST_DATASET_PATH,\n",
" SENTENCE_LENGTH,\n",
" TOKENANO,\n",
" MASKER\n",
")\n",
"\n",
"\n",
"# Model\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS\n",
")\n",
"_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(\n",
" NANOSOCRATES,\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE\n",
")\n",
"\n",
"\n",
"# Training constants\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())\n",
"decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())\n",
"\n",
"nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"encoder_only_scheduler = Transformer.WarmupLR(encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"decoder_only_scheduler = Transformer.WarmupLR(decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
"\n",
"current_epoch = CURRENT_EPOCH\n",
"patience = 0\n",
"\n",
"\n",
"average_loss_validation = {\n",
" \"txt\": float(\"inf\"),\n",
" \"encoder_only\": float(\"inf\"),\n",
" \"decoder_only\": float(\"inf\")\n",
"}\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
" text_batch_losses = []\n",
" encoder_batch_losses = []\n",
" decoder_batch_losses = []\n",
"\n",
" for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):\n",
"\n",
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
"\n",
" enc_x = torch.tensor(src_x)\n",
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
" tgt = torch.tensor(tgt_y)\n",
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
"\n",
" # Task 1 and Task 2\n",
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" nano_optim.zero_grad()\n",
"\n",
"\n",
"\n",
" pred_logits = NANOSOCRATES((\n",
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" loss.backward()\n",
" nano_optim.step()\n",
"\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
" continue\n",
"\n",
"\n",
" # Pretrain first\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
" continue\n",
"\n",
"\n",
" # Task 3\n",
" if tasktype == Batch.TaskType.MASKING:\n",
"\n",
" encoder_only_optim.zero_grad()\n",
"\n",
" pred_logits = ENCODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
"\n",
" loss.backward()\n",
" encoder_only_optim.step()\n",
"\n",
" encoder_batch_losses.append(\n",
" loss.item()\n",
" )\n",
"\n",
" continue\n",
"\n",
"\n",
" # Task 4\n",
" if tasktype == Batch.TaskType.COMPLETATION:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" decoder_only_optim.zero_grad()\n",
"\n",
" pred_logits = DECODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" loss.backward()\n",
" decoder_only_optim.step()\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" decoder_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
"\n",
" continue\n",
"\n",
"\n",
" nano_scheduler.step()\n",
" encoder_only_scheduler.step()\n",
" decoder_only_scheduler.step()\n",
"\n",
" current_epoch += 1\n",
"\n",
" if current_epoch % VALIDATION_STEPS == 0:\n",
"\n",
" txt_avg_batch_losses = []\n",
" enc_avg_batch_losses = []\n",
" dec_avg_batch_losses = []\n",
"\n",
" for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):\n",
"\n",
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
"\n",
" enc_x = torch.tensor(src_x)\n",
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
" tgt = torch.tensor(tgt_y)\n",
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
"\n",
" # Task 1 and Task 2\n",
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
"\n",
"\n",
" pred_logits = NANOSOCRATES((\n",
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
" txt_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
"\n",
" continue\n",
"\n",
"\n",
" # Pretrain first\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
" continue\n",
"\n",
"\n",
" # Task 3\n",
" if tasktype == Batch.TaskType.MASKING:\n",
"\n",
" pred_logits = ENCODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
"\n",
" enc_avg_batch_losses.append(\n",
" loss.item()\n",
" )\n",
"\n",
" continue\n",
"\n",
"\n",
" # Task 4\n",
" if tasktype == Batch.TaskType.COMPLETATION:\n",
"\n",
" BATCH_LOSS = []\n",
"\n",
" for token_idx in range(0, SENTENCE_LENGTH):\n",
"\n",
" pred_logits = DECODER_ONLY((\n",
" enc_x, enc_x_pad\n",
" ))\n",
"\n",
" pred_logits = pred_logits[:, token_idx, :]\n",
"\n",
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
"\n",
" BATCH_LOSS.append(\n",
" loss.item()\n",
" )\n",
"\n",
" if token_idx < SENTENCE_LENGTH - 1:\n",
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
"\n",
"\n",
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
"\n",
" dec_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
"\n",
" continue\n",
"\n",
" txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)\n",
" enc_avg_loss = float(\"inf\")\n",
" dec_avg_loss = float(\"inf\")\n",
"\n",
" if current_epoch >= PRETRAIN_EPOCHS:\n",
" enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)\n",
" dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)\n",
"\n",
" if current_epoch < PRETRAIN_EPOCHS:\n",
"\n",
" if txt_avg_loss < average_loss_validation[\"txt\"]:\n",
" average_loss_validation[\"txt\"] = txt_avg_loss\n",
" else:\n",
" patience += 1\n",
" else:\n",
"\n",
" counter = 0\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"txt\"]:\n",
" counter += 1\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"encoder_only\"]:\n",
" counter += 1\n",
"\n",
" if txt_avg_loss > average_loss_validation[\"decoder_only\"]:\n",
" counter += 1\n",
"\n",
" if counter > 1:\n",
" patience += 1\n",
"\n",
" txt_min_train_losses = text_batch_losses[:][0]\n",
" txt_avg_train_losses = text_batch_losses[:][1]\n",
" txt_max_train_losses = text_batch_losses[:][2]\n",
"\n",
" txt_min_loss = min(txt_min_train_losses)\n",
" txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)\n",
" txt_max_loss = max(txt_max_train_losses)\n",
" txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)\n",
" txt_avg_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)\n",
"\n",
" enc_avg_train_loss = float(\"inf\")\n",
"\n",
" dec_min_loss = float(\"inf\")\n",
" dec_avg_min_loss = float(\"inf\")\n",
" dec_max_loss = float(\"inf\")\n",
" dec_avg_max_loss = float(\"inf\")\n",
" dec_avg_loss = float(\"inf\")\n",
"\n",
" if current_epoch >= PRETRAIN_EPOCHS:\n",
" enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)\n",
"\n",
" dec_min_train_losses = decoder_batch_losses[:][0]\n",
" dec_avg_train_losses = decoder_batch_losses[:][1]\n",
" dec_max_train_losses = decoder_batch_losses[:][2]\n",
"\n",
" dec_min_loss = min(dec_min_train_losses)\n",
" dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)\n",
" dec_max_loss = max(dec_max_train_losses)\n",
" dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)\n",
" dec_avg_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)\n",
"\n",
"\n",
" SEPARATOR = \"===========================================================================================\"\n",
" DEBUG_TEXT = \"\".join([\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"EPOCH {current_epoch}\"\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"Train Losses:\\n\"\n",
" f\"\\tMin Losses:\\n\"\n",
" f\"\\t\\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\\n\"\n",
" f\"\\t\\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\\n\"\n",
" f\"\\tMax Losses:\\n\"\n",
" f\"\\t\\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\\n\"\n",
" f\"\\t\\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\\n\"\n",
" f\"\\tAvg Losses:\\n\"\n",
" f\"\\t\\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\\n\"\n",
" f\"{SEPARATOR}\\n\",\n",
" f\"Validation Losses:\\n\"\n",
" f\"\\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\"\n",
" f\"{SEPARATOR}\\n\",\n",
" ])\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" # Warn about patience\n",
" if patience == PATIENCE:\n",
" print(\n",
" \"Model is likely overfitting, so let's stop here\"\n",
" )\n",
"\n",
" # SAVE MODEL\n",
" if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:\n",
" print(f\"Saving model at {CHECKPOINT_PATH.as_posix()}\")\n",
" torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}


@@ -0,0 +1,433 @@
import random
import sys
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# Get paths
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
CHECKPOINT_PATH = Path("Assets/Dataset/Tmp/NanoSocrates.zip")
# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
PRETRAIN_EPOCHS = int(10)
WARMUP_EPOCHS = int(4e3)
MINI_BATCH_SIZE = 100
VALIDATION_STEPS = 5
CHECKPOINT_STEPS = VALIDATION_STEPS * 4
PATIENCE = 4
CURRENT_EPOCH = 0
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS)
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
# Model
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)
# Training constants
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
encoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)
encoder_only_scheduler = Transformer.WarmupLR(
encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
)
decoder_only_scheduler = Transformer.WarmupLR(
decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
)
current_epoch = CURRENT_EPOCH
patience = 0
average_loss_validation = {
"txt": float("inf"),
"encoder_only": float("inf"),
"decoder_only": float("inf"),
}
while current_epoch < MAX_EPOCHS:
NANOSOCRATES.train()
ENCODER_ONLY.train()
DECODER_ONLY.train()
text_batch_losses = []
encoder_batch_losses = []
decoder_batch_losses = []
batch_counter = 0
print(f"EPOCH {current_epoch} STARTING")
for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
batch_counter += 1
src_x, tgt_y, pad_x, pad_y, tasktype = batch
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x_pad = dec_x.eq(PAD_TOKEN)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
print(f"\tBATCH {batch_counter} Starting")
# Task 1 and Task 2
if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:
print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
nano_optim.zero_grad()
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt[:, token_idx])
loss.backward()
nano_optim.step()
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
MIN_BATCH_LOSS = min(BATCH_LOSS)
MAX_BATCH_LOSS = max(BATCH_LOSS)
AVG_BATCH_LOSS = sum(BATCH_LOSS) / len(BATCH_LOSS)
text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])
continue
# Pretrain first
if current_epoch < PRETRAIN_EPOCHS:
continue
# Task 3
if tasktype == Batch.TaskType.MASKING:
print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
encoder_only_optim.zero_grad()
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
loss.backward()
encoder_only_optim.step()
encoder_batch_losses.append(loss.item())
continue
# Task 4
if tasktype == Batch.TaskType.COMPLETATION:
print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
decoder_only_optim.zero_grad()
pred_logits = DECODER_ONLY((enc_x, enc_x_pad, enc_x.eq(PAD_TOKEN)))  # (input, prefix_mask, pad_mask)
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = decoder_ce(pred_logits, tgt[:, token_idx])
loss.backward()
decoder_only_optim.step()
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
MIN_BATCH_LOSS = min(BATCH_LOSS)
MAX_BATCH_LOSS = max(BATCH_LOSS)
AVG_BATCH_LOSS = sum(BATCH_LOSS) / len(BATCH_LOSS)
decoder_batch_losses.append(
[MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS]
)
continue
nano_scheduler.step()
encoder_only_scheduler.step()
decoder_only_scheduler.step()
current_epoch += 1
if current_epoch % VALIDATION_STEPS == 0:
NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
txt_avg_batch_losses = []
enc_avg_batch_losses = []
dec_avg_batch_losses = []
for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
src_x, tgt_y, pad_x, pad_y, tasktype = batch
enc_x = torch.tensor(src_x)
ACTUAL_BATCH_SIZE, _ = enc_x.shape
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
dec_x = Transformer.get_decoder_input(
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
)
dec_x_pad = dec_x.eq(PAD_TOKEN)
tgt = torch.tensor(tgt_y)
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
# Task 1 and Task 2
if (
tasktype == Batch.TaskType.RDF2TXT
or tasktype == Batch.TaskType.TEXT2RDF
):
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt[:, token_idx])
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
AVG_BATCH_LOSS = sum(BATCH_LOSS) / len(BATCH_LOSS)
txt_avg_batch_losses.append(AVG_BATCH_LOSS)
continue
# Pretrain first
if current_epoch < PRETRAIN_EPOCHS:
continue
# Task 3
if tasktype == Batch.TaskType.MASKING:
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
pred_logits = pred_logits.permute(0, 2, 1)
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
enc_avg_batch_losses.append(loss.item())
continue
# Task 4
if tasktype == Batch.TaskType.COMPLETATION:
BATCH_LOSS = []
for token_idx in range(0, SENTENCE_LENGTH):
pred_logits = DECODER_ONLY((enc_x, enc_x_pad, enc_x.eq(PAD_TOKEN)))  # (input, prefix_mask, pad_mask)
pred_logits = pred_logits[:, token_idx, :]
loss: torch.Tensor = decoder_ce(pred_logits, tgt[:, token_idx])
BATCH_LOSS.append(loss.item())
if token_idx < SENTENCE_LENGTH - 1:
dec_x[:, token_idx + 1] = tgt[:, token_idx]
AVG_BATCH_LOSS = sum(BATCH_LOSS) / len(BATCH_LOSS)
dec_avg_batch_losses.append(AVG_BATCH_LOSS)
continue
txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)
enc_avg_loss = float("inf")
dec_avg_loss = float("inf")
if current_epoch >= PRETRAIN_EPOCHS:
enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)
dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)
if current_epoch < PRETRAIN_EPOCHS:
if txt_avg_loss < average_loss_validation["txt"]:
average_loss_validation["txt"] = txt_avg_loss
else:
patience += 1
else:
counter = 0
if txt_avg_loss > average_loss_validation["txt"]:
counter += 1
# compare each head against its own best, not the text loss
if enc_avg_loss > average_loss_validation["encoder_only"]:
counter += 1
if dec_avg_loss > average_loss_validation["decoder_only"]:
counter += 1
if counter > 1:
patience += 1
average_loss_validation["txt"] = min(average_loss_validation["txt"], txt_avg_loss)
average_loss_validation["encoder_only"] = min(average_loss_validation["encoder_only"], enc_avg_loss)
average_loss_validation["decoder_only"] = min(average_loss_validation["decoder_only"], dec_avg_loss)
txt_min_train_losses = [row[0] for row in text_batch_losses]
txt_avg_train_losses = [row[1] for row in text_batch_losses]
txt_max_train_losses = [row[2] for row in text_batch_losses]
txt_min_loss = min(txt_min_train_losses)
txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)
txt_max_loss = max(txt_max_train_losses)
txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)
txt_avg_train_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)
enc_avg_train_loss = float("inf")
dec_min_loss = float("inf")
dec_avg_min_loss = float("inf")
dec_max_loss = float("inf")
dec_avg_max_loss = float("inf")
dec_avg_loss = float("inf")
if current_epoch >= PRETRAIN_EPOCHS:
enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)
dec_min_train_losses = [row[0] for row in decoder_batch_losses]
dec_avg_train_losses = [row[1] for row in decoder_batch_losses]
dec_max_train_losses = [row[2] for row in decoder_batch_losses]
dec_min_loss = min(dec_min_train_losses)
dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)
dec_max_loss = max(dec_max_train_losses)
dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)
dec_avg_train_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)
SEPARATOR = "================================================================================================================"
DEBUG_TEXT = "".join(
[
f"{SEPARATOR}\n",
f"EPOCH {current_epoch}\n",
f"{SEPARATOR}\n",
f"Train Losses:\n",
f"\tMin Losses:\n",
f"\t\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\n",
f"\t\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\n",
f"\tMax Losses:\n",
f"\t\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\n",
f"\t\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\n",
f"\tAvg Losses:\n",
f"\t\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\n",
f"{SEPARATOR}\n",
f"Validation Losses:\n",
f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\n",
f"{SEPARATOR}\n",
]
)
print(DEBUG_TEXT)
# Warn about patience
if patience == PATIENCE:
print("Model is likely overfitting, so let's stop here")
# SAVE MODEL
if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
# stop, not just warn, once patience runs out
if patience == PATIENCE:
break

Playgrounds/prova.py Normal file

@@ -0,0 +1,177 @@
import random
import time
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# set a default device
# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)
TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []
for index, row in TOY_DATASET.iterrows():
RDFs: str = row["RDFs"]
Abstract: str = row["Abstract"]
input_tokens = TOKENANO.encode(RDFs)
output_tokens = TOKENANO.encode(Abstract)[1:]
decoder_default_tokens = TOKENANO.encode("<SOS>")
input_tokens, padding = Transformer.normalize_sequence(
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
output_tokens, _ = Transformer.normalize_sequence(
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
decoder_default_tokens, _ = Transformer.normalize_sequence(
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
)
TOY_BATCH_INPUT_LIST.append(input_tokens)
TOY_BATCH_PADDING_LIST.append(padding)
TOY_BATCH_TARGET_LIST.append(output_tokens)
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
output_tokens = TOKENANO.encode(RDFs)
input_tokens = TOKENANO.encode(Abstract)[1:]
decoder_default_tokens = TOKENANO.encode("<SOS>")
input_tokens, padding = Transformer.normalize_sequence(
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
output_tokens, _ = Transformer.normalize_sequence(
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
)
decoder_default_tokens, _ = Transformer.normalize_sequence(
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
)
TOY_BATCH_INPUT_LIST.append(input_tokens)
TOY_BATCH_PADDING_LIST.append(padding)
TOY_BATCH_TARGET_LIST.append(output_tokens)
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
# Training loop
LOSS_HISTORY = []
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)
last_loss = 0
current_epoch = 0
while current_epoch < MAX_EPOCHS:
optimizer.zero_grad()
encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])
decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])
src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)
# Transform target into logits
target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])
last_loss = 0
last_prediction: torch.Tensor
LOSS_HISTORY = []
start = time.time_ns()
for i in range(0, SENTENCE_LENGTH):
optimizer.zero_grad()
tgt_padding = decoder_list.eq(PAD_TOKEN)
logits: torch.Tensor = NANOSOCRATES(
(encoder_list, src_padding, decoder_list, tgt_padding)
)
prob = torch.softmax(logits, 2)
most_probable_tokens = torch.argmax(prob, 2)
last_prediction = most_probable_tokens
logits = logits[:, i, :]
# logits = logits.permute(0, 2, 1)
loss: torch.Tensor = cross_entropy(logits, target_logits[:, i])
LOSS_HISTORY.append(loss.item())
# loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])
# loss : torch.Tensor = cross_entropy(logits, target_logits)
last_loss = loss
loss.backward()
optimizer.step()
scheduler.step()
if i < SENTENCE_LENGTH - 1:
decoder_list[:, i + 1] = target_logits[:, i]
current_epoch += 1
end = time.time_ns()
if current_epoch % 1 == 0:
MIN_LOSS = min(LOSS_HISTORY)
MAX_LOSS = max(LOSS_HISTORY)
AVERAGE_LOSS = sum(LOSS_HISTORY)/len(LOSS_HISTORY)
print(f"EPOCH {current_epoch}\n\tTime: {(end-start)/1E9}s\n\tLoss: {last_loss}")
print(f"\tMin Loss: {MIN_LOSS}\tAvg Loss: {AVERAGE_LOSS}\tMax Loss: {MAX_LOSS}\n")
# for encoded_sentence, expected_sentence in zip(
# Transformer.tensor2token(last_prediction[:, :], END_TOKEN), # type: ignore
# Transformer.tensor2token(target_logits[:, :], END_TOKEN),
# ):
# decoded_sentence = TOKENANO.decode(encoded_sentence)
# decoded_target = TOKENANO.decode(expected_sentence)
# print(
# f"\tACTUAL:\n\t\t{decoded_sentence}\n\tEXPECTED:\n\t\t{decoded_target}\n"
# )


@@ -189,7 +189,7 @@ class NanoSocratesBPE(Encoder):
token_stack.appendleft(right_token)
token_stack.appendleft(left_token)
return UTF_8_STRING_ARR.decode("utf-8")
return UTF_8_STRING_ARR.decode("utf-8", errors="ignore")
def __token_decode(self, token_id: int) -> tuple[int, int]:


@@ -31,7 +31,7 @@ class TokeNanoCore:
def vocabulary_size(self):
BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
return BPE_VOC_SIZE + SPECIAL_VOC_SIZE + 1
def encode(self, corpus: str) -> list[int]:
output: list[int] = []


@@ -1,104 +1,243 @@
import random
from typing import Generator
import sys
from typing import Any, Generator
import pandas as pd
from pathlib import Path
from ..Enums import TaskType
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
from TokenCompletation import TokenCompletationTransformer
from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
# from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.Transformer import (
SpannedMasker,
truncate_rdf_list,
normalize_sequence,
)
from Project_Model.Libs.BPE import SpecialToken
class Batcher:
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
# ABSTRACT, TRIPLE
# tasks:
# rdf2text: X: TRIPLE, Y: ABSTRACT
# text2rdf: X: ABSTRACT, X:TRIPLE
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
def __init__(
self,
dataset_path: Path,
max_length: int,
tokenizer: BPE.TokeNanoCore,
masker: SpannedMasker,
seed: int = 0,
debug = False
) -> None:
# ABSTRACT, TRIPLE
# tasks:
# rdf2text: X: TRIPLE, Y: ABSTRACT
# text2rdf: X: ABSTRACT, Y: TRIPLE
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
# it will truncate
# it will instantiate the SpannedMasker and the truncator
self._dataset_path = dataset_path
self._batch_size = batch_size
self._tokenizer = tokenizer
self._masker = masker
self.__max_length = max_length
self._seed = seed
# self._token_completation = TokenCompletationTransformer(sotl,eos)
self._completation_task_token_truncator = truncate_rdf_list
self.__debug = debug
sotl = self._tokenizer.encode(SpecialToken.START_TRIPLE_LIST.value)
eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
self._token_completation = TokenCompletationTransformer(sotl,eos)
def batch(self, batch_size) -> Generator[
tuple[
list[list[int]],
list[list[int]],
list[list[int]],
list[list[int]],
TaskType
],
Any,
Any,
]:
"""
Yields: X, Y, padding_X, padding_Y, TaskType
"""
RNG = random.Random(self._seed)
self._masker.reseed(self._seed)
def get_batch(self)-> Generator[pd.DataFrame]:
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
for batch in pd.read_csv(self._dataset_path, chunksize=batch_size):
tokenized_batch = pd.DataFrame()
tokenized_batch[["Abstract","RDFs"]] = (
batch[["Abstract","RDFs"]]
.map(lambda t: self._tokenizer.encode(t))
# encode
tokenized_batch[["Abstract", "RDFs"]] = batch[["Abstract", "RDFs"]].map(
lambda t: self._tokenizer.encode(t)
)
rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
mask_batch = self.__masking_trasformation(tokenized_batch)
completation_batch = self.__token_completation_task(tokenized_batch)
output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True)
output = output.sample(frac=1).reset_index(drop=True)
yield output
X, Y, padding_X, padding_Y = self.__rdf2txt_transformation(tokenized_batch)
yield X, Y, padding_X, padding_Y, TaskType.RDF2TXT
(
X,
Y,
padding_X,
padding_Y,
) = self.__txt2rdf_transformation(tokenized_batch)
yield X, Y, padding_X, padding_Y, TaskType.TEXT2RDF
(
X,
Y,
padding_X,
padding_Y,
) = self.__masking_trasformation(tokenized_batch)
yield X, Y, padding_X, padding_Y, TaskType.MASKING
(
X,
Y,
padding_X,
padding_Y,
) = self.__token_completation_task(
tokenized_batch, RNG.randint(0, sys.maxsize)
)
yield X, Y, padding_X, padding_Y, TaskType.COMPLETATION
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
# WIP
# output = pd.concat([rdf2txt_batch,txt2rdf_batch,completation_batch],ignore_index=True)
# output = output.sample(frac=1).reset_index(drop=True)
# self.decode_debug(output)
# yield output
def __random_subset_rdfs(self, batch: pd.DataFrame, seed=0):
# WIP
rng = random.Random(seed)
def to_list(x):
return x.split(SpecialToken.START_TRIPLE.value)[1:]
batch["RDFs"] = batch["RDFs"].map(
to_list
)
batch["RDFs"] = batch["RDFs"].map(to_list)
def decode_debug(self, batch: pd.DataFrame):
decoded = pd.DataFrame()
decoded[["X", "Y"]] = batch[["X", "Y"]].map(lambda t: self._tokenizer.decode(t))
print(decoded)
def __normalization(
self, X: list[list[int]], Y: list[list[int]]
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
out_X = []
padding_X = []
out_Y = []
padding_Y = []
for x in X:
out_x, padding_x = normalize_sequence(
x, self.__max_length, pad_token, end_token, True
)
out_X.append(out_x)
padding_X.append(padding_x)
for y in Y:
out_y, padding_y = normalize_sequence(
y, self.__max_length, pad_token, end_token, True
)
out_Y.append(out_y)
padding_Y.append(padding_y)
return out_X, out_Y, padding_X, padding_Y
def __rdf2txt_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
return batch[["X", "Y"]]
X: list[list[int]]
task_token = self._tokenizer.encode(SpecialToken.RDF_TO_TEXT.value)
out = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]
out["X"] = [task_token + x for x in out["X"]]
return self.__normalization(out["X"].to_list(), out["Y"].to_list())
def __txt2rdf_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
return batch[["X", "Y"]]
task_token = self._tokenizer.encode(SpecialToken.TEXT_TO_RDF.value)
out = batch.rename(columns={"Abstract": "X", "RDFs": "Y"})[["X", "Y"]]
out["X"] = [task_token + x for x in out["X"]]
return self.__normalization(out["X"].to_list(), out["Y"].to_list())
def __masking_trasformation(self, batch: pd.DataFrame):
# mask_sequence: List[int] -> Tuple[List[int], List[int]]
xy_tuples = batch["RDFs"].apply(self._masker.mask_sequence) # Series of (X, Y)
X = []
Y = []
for rdf in batch["RDFs"]:
x, y = self._masker.mask_sequence(rdf[:self.__max_length])
X.append(x)
Y.append(y)
return self.__normalization(X, Y)
output = batch.copy()
# Expand into two columns preserving the original index
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
return output[["X", "Y"]]
def __token_completation_task(self, batch: pd.DataFrame):
xy_tuples = batch["RDFs"].apply(self._token_completation.get_completation_tuple)
output = batch.copy()
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
return output[["X", "Y"]]
def __token_completation_task(self, batch: pd.DataFrame, minibatch_seed: int):
continue_triple_token = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[
0
]
eot = self._tokenizer.encode(SpecialToken.END_TRIPLE.value)[0]
X = []
Y = []
for rdf in batch["RDFs"]:
# here first truncate to max_length
rdf = rdf[: self.__max_length] # truncator that uses "eot" so no problem
x, y = self._completation_task_token_truncator(
rdf, 0.5, continue_triple_token, eot, minibatch_seed
)
X.append(x)
Y.append(y)
return self.__token_cmpletation_task_special_normalization(X, Y)
def __token_cmpletation_task_special_normalization(self, X: list[list[int]], Y: list[list[int]]
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
def continue_rdf_padding(sequence: list[int], pad_token: int):
for i, x in enumerate(sequence):
if x == pad_token:
i = i+1 # continueRDF will be excluded by the mask
# fill the tail with True and stop
return [False] * i + [True] * (len(sequence) - i)
return [False] * len(sequence) # no pad token found
"""
DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
continue_rdf = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[0]
out_X = []
padding_X = []
out_Y = []
padding_Y = []
from pathlib import Path
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
SPECIAL_LIST = BPE.default_special_tokens()
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))
for x in X:
out_x, _ = normalize_sequence(
x, self.__max_length, pad_token, end_token, True
)
out_X.append(out_x)
# padding_X.append(padding_x)
special_padding = continue_rdf_padding(out_x,continue_rdf)
padding_X.append(special_padding)
MASKER = SpannedMasker(TOKENANO.vocabulary_size,SPECIAL_TOKENS)
for y in Y:
out_y, padding_y = normalize_sequence(
y, self.__max_length, pad_token, end_token, True
)
out_Y.append(out_y)
# special padding
# special_padding = continue_rdf_padding(out_y,continue_rdf)
# padding_Y.append(special_padding)
padding_Y.append(padding_y)
prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
print(TOKENANO.encode(prova))
batcher = Batcher(DATASET_PATH,8,TOKENANO,MASKER)
for batch in batcher.get_batch():
print(batch)
"""
return out_X, out_Y, padding_X, padding_Y
if __name__ == "__main__":
DATASET_PATH = Path("Assets/Dataset/Tmp/rdf_text.csv")
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
from pathlib import Path
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
SPECIAL_LIST = BPE.default_special_tokens()
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))
MASKER = SpannedMasker(TOKENANO.vocabulary_size, SPECIAL_TOKENS)
prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
print(TOKENANO.encode(prova))
batcher = Batcher(DATASET_PATH,256, TOKENANO, MASKER)
for batch in batcher.batch(8):
print(batch)


@@ -0,0 +1,2 @@
from .Batcher import Batcher
from .TokenCompletation import TokenCompletationTransformer


@@ -0,0 +1,5 @@
from .TaskType import TaskType
__all__ = [
"TaskType"
]


@@ -0,0 +1,5 @@
from .Classes import *
from .Enums import *
from . import Classes
from . import Enums


@@ -0,0 +1,70 @@
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
class Evaluator():
    def __init__(self) -> None:
        # txt based evaluator
        self.__rouge = evaluate.load("rouge")
        self.__rouge_types = ["rougeLsum", "rouge1", "rouge2"]  # rougeLsum expects sentences separated by "\n", so it may score poorly here
        self._bleu = evaluate.load("bleu")
        self._meteor = evaluate.load("meteor")
        # token based evaluator
        self.__acc_m = evaluate.load("accuracy")
        self.__prec_m = evaluate.load("precision")
        self.__rec_m = evaluate.load("recall")
        self.__f1_m = evaluate.load("f1")

    def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]):
        results = self.__rouge.compute(
            predictions=preds, references=refs,
            rouge_types=self.__rouge_types,
            use_stemmer=True,
            use_aggregator=True  # F1
        )
        return {k: float(results[k]) for k in self.__rouge_types}

    def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # sacreBLEU via evaluate; expects references as list-of-lists:
        # each prediction can be evaluated against a list of references, hence [[ref]]
        results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
        return float(results["bleu"])  # (native sacreBLEU scale)

    def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # as bleu
        res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
        return float(res["meteor"])

    def __my_accuracy(self, preds: list[list[int]], refs: list[list[int]]):
        # exact-match accuracy over whole token sequences, not single tokens
        total = len(preds)
        correct = 0
        for p, r in zip(preds, refs):
            correct += int(p == r)
        return correct / total

    def __accuracy(self, preds, refs):
        return accuracy_score(refs, preds)  # sklearn order: (y_true, y_pred)

    def __clean_batch_by_pad(self, preds: list[list[int]], refs: list[list[int]]):
        output_preds = []
        output_refs = []
        # TODO
        pad_token: int = 7000  # percolate
        for pred, ref in zip(preds, refs):
            try:
                i = ref.index(pad_token)  # first time pad token appears
            except ValueError:
                i = len(ref)
            output_preds.append(pred[:i])
            output_refs.append(ref[:i])
        return output_preds, output_refs

    def __precision_recall(self, preds: list[list[int]], refs: list[list[int]]):
        # TODO
        p, r, f1, _ = precision_recall_fscore_support(
            refs, preds, average="binary", zero_division=0
        )  # sklearn order: (y_true, y_pred) #### watch later
        return {"precision": float(p), "recall": float(r), "f1": float(f1)}


@@ -1,41 +0,0 @@
import numpy as np
# custom LR from attention is all you need
class Custom_lr():
def __init__(self, d_model: int, warmup_step:int) -> None:
self.__d_model = d_model
self.__warmup_step = warmup_step
self.__epoch = 0
def step(self) -> int:
self.__epoch += 1
return (self.__d_model ** -0.5) * min(self.__epoch ** -0.5,
self.__epoch * (self.__warmup_step ** -1.5))
# OTHER LR
# Learning rate schedules (matching visualization parameters)
def step_lr(epoch, lr):
# StepLR: step_size=20, gamma=0.5 (from visualization)
return lr * 0.5 if epoch % 20 == 0 and epoch > 0 else lr
def exp_lr(epoch, lr):
# ExponentialLR: gamma=0.95 (from visualization)
return lr * 0.95
def cosine_lr(epoch, lr):
# CosineAnnealingLR: lr_min=0.001, lr_max=0.1, max_epochs=100 (from visualization)
lr_min, lr_max = 0.001, 0.1
max_epochs = 100
return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch * np.pi / max_epochs))
def cyclical_lr(epoch, lr):
# CyclicalLR: base_lr=0.001, max_lr=0.1, step_size=20 (from visualization)
base_lr = 0.001
max_lr = 0.1
step_size = 20
cycle = np.floor(1 + epoch / (2 * step_size))
x = np.abs(epoch / step_size - 2 * cycle + 1)
return base_lr + (max_lr - base_lr) * max(0, (1 - x))


@@ -0,0 +1,43 @@
import torch
class LogitsCollector:
    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
        self.__pad_token = pad_token  # used to skip PAD
        self.__end_token = end_token  # used to stop at END
        self.__tokenizer = tokenizer  # exposes .decode(list[int]) -> str
        self.__steps: list[torch.Tensor] = []  # list of per-step logits [B,V]

    def reset(self) -> None:
        self.__steps.clear()  # clear history

    def add(self, logits_step: torch.Tensor) -> None:
        if logits_step.dim() == 3:  # handle [B,1,V]
            logits_step = logits_step[:, -1, :]  # -> [B,V]
        self.__steps.append(logits_step.detach())  # store raw logits (detached)

    def tokens(self) -> list[list[int]]:
        if not self.__steps:
            return []
        stack = torch.stack(self.__steps, dim=0)    # [T,B,V]
        probs = torch.softmax(stack, dim=-1)        # softmax over vocab -> [T,B,V]
        ids = probs.argmax(dim=-1).transpose(0, 1)  # greedy ids -> [B,T]
        out: list[list[int]] = []
        for row in ids.tolist():
            seq: list[int] = []
            for tok in row:
                # if tok == self.__end_token:  # stop on END
                #     break
                if tok == self.__pad_token:  # skip PAD
                    continue
                seq.append(tok)
            out.append(seq)
        return out

    def print_decoded(self) -> None:
        for i, seq in enumerate(self.tokens()):
            try:
                # text = text + self.__end_token
                text = self.__tokenizer.decode(seq)  # decode tokens to string
            except Exception:
                text = str(seq)  # fallback to ids
            print(f"[{i}] {text}")  # simple print


@@ -0,0 +1,20 @@
import os
from pathlib import Path
class Log:
    def __init__(self, path):
        self.path = path
        header = ["epoch", "avg_txt", "avg_enc", "avg_dec",
                  "txt_loss", "masking_loss", "prediction_loss"]
        if Path(path).is_file():
            return
        with open(self.path, "w", encoding="utf-8", newline="") as f:
            f.write(",".join(header) + "\n")

    def write(self, loss: list[float]):
        line = ",".join(str(float(x)) for x in loss) + "\n"
        with open(self.path, "a", encoding="utf-8", newline="") as f:
            f.write(line)
            f.flush()
            os.fsync(f.fileno())  # flush to disk on every write, so a sudden crash loses at most one row
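The row layout must match the header written on first construction; a usage sketch with made-up numbers (the path is illustrative):

log = Log("Assets/Dataset/Tmp/losses.csv")
# epoch, avg_txt, avg_enc, avg_dec, txt_loss, masking_loss, prediction_loss
log.write([5, 3.21, 2.87, 3.05, 3.40, 2.95, 3.12])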


@@ -14,6 +14,6 @@ class DeToken(torch.nn.Module):
x = self.__linear(x)
# 2) Go to logits
x = torch.softmax(x, 2)
# x = torch.softmax(x, 2)
return x


@@ -1,8 +1,9 @@
from typing import Optional
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
from ..Utils.attention_mask import get_causal_attention_mask
from ..Utils.attention_mask import get_causal_attention_mask, get_prefix_causal_mask_from_padding_mask
# B, L(T), E_D
@@ -15,8 +16,10 @@ class Decoder(nn.Module):
feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int,
) -> None:
self.__attention_heads = number_of_attention_heads
super().__init__()
self.__masked_attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
@@ -41,62 +44,72 @@ class Decoder(nn.Module):
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor
torch.Tensor,
torch.Tensor,
Optional[bool]
]
): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
# WARNING: args is needed to have sequential
x, k_x, v_x, padding_mask = args
if len(args) < 6:
args = args + (False,)  # one-element tuple; (False) would be a bare bool
x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only = args
# build of attention mask
attention_mask = get_causal_attention_mask(x.size(1))
# TODO: create a prefix causal mask if needed
if decoder_only:
attention_mask = get_prefix_causal_mask_from_padding_mask(x.size(1), src_padding_mask, self.__attention_heads)  # strictly the tgt mask would be correct here
else:
attention_mask = get_causal_attention_mask(x.size(1))
# 1) Masked Attention
MASKED_ATTENTION = self.__masked_attention(
x, x, x, key_padding_mask=padding_mask, attention_mask=attention_mask
x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=attention_mask
)
# 2) Dropout
# DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
# del MASKED_ATTENTION
DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
del MASKED_ATTENTION
# 3) Residual Connection
x = x + MASKED_ATTENTION
del MASKED_ATTENTION
x = x + DROPPED_MASKED_ATTENTION
del DROPPED_MASKED_ATTENTION
# 4) Layer Normalization
x = self.__layer_norm_1(x)
# 5) Encoderdecoder (cross) attention
CROSS_ATTENTION = self.__cross_attention(
x, k_x, v_x, key_padding_mask=padding_mask
)
# 6) Dropout
# DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
# del CROSS_ATTENTION
if not decoder_only:
# 5) Encoderdecoder (cross) attention
CROSS_ATTENTION = self.__cross_attention(
x, k_x, v_x, key_padding_mask=src_padding_mask
)
# 7) Residual Connection
x = x + CROSS_ATTENTION
del CROSS_ATTENTION
# 6) Dropout
DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
del CROSS_ATTENTION
# 8) Layer Normalization
x = self.__layer_norm_2(x)
# 7) Residual Connection
x = x + DROPPED_CROSS_ATTENTION
del DROPPED_CROSS_ATTENTION
# 8) Layer Normalization
x = self.__layer_norm_2(x)
# 9) Position-wise feed-forward
FEED_FORWARD = self.__feed_forward_network(x)
# 10) Dropout
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
# del FEED_FORWARD
DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
del FEED_FORWARD
# 11) Residual Connection
x = x + FEED_FORWARD
del FEED_FORWARD
x = x + DROPPED_FEED_FORWARD
del DROPPED_FEED_FORWARD
# 12) Layer Normalization
x = self.__layer_norm_3(x)
return (x, k_x, v_x, padding_mask)
return (x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only)
# use eval to disable dropout ecc


@@ -43,12 +43,12 @@ class Encoder(
ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)
# 2) Dropout
# DROPPED_ATTENTION = self.__dropout(ATTENTION)
# del ATTENTION
DROPPED_ATTENTION = self.__dropout(ATTENTION)
del ATTENTION
# 3) Residual Connection
x = x + ATTENTION
del ATTENTION
x = x + DROPPED_ATTENTION
del DROPPED_ATTENTION
# 4) Layer Normalization
x = self.__layer_norm_1(x)
@@ -57,12 +57,12 @@ class Encoder(
FEED_FORWARD = self.__feed_forward(x)
# 6) Dropout
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
# del FEED_FORWARD
DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
del FEED_FORWARD
# 7) Residual Connection
x = x + FEED_FORWARD
del FEED_FORWARD
x = x + DROPPED_FEED_FORWARD
del DROPPED_FEED_FORWARD
# 8) Layer Normalization
x = self.__layer_norm_2(x)


@@ -10,7 +10,7 @@ class SpannedMasker:
max_vocabulary: int,
forbidden_tokens: set[int],
change_token_probability: float = 0.15,
average_span: int = 1,
average_span: int = 2,
seed: int = random.randint(0, sys.maxsize),
) -> None:
@@ -25,6 +25,11 @@ class SpannedMasker:
self.__forbidden_tokens = forbidden_tokens
def reseed(self, seed:int):
self.__rng = random.Random(seed)
def mask_sequence(
self,
token_sequence: list[int],
@@ -98,7 +103,7 @@ class SpannedMasker:
if self.__is_illegal_token(INNER_TOKEN, forbidden_tokens):
continue
MASK[mask_index] = True
mask_index += 1


@@ -0,0 +1,47 @@
from typing import override
import torch
# custom LR from attention is all you need
class WarmupLR(torch.optim.lr_scheduler.LRScheduler):
    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        warmup_steps: int,
        embedding_size: int,
        warming_multiplier: float = -1.5,
        decaying_multiplier: float = -0.5,
        multiplicative_factor: float = 1.0,
        last_epoch: int = -1,
    ) -> None:
        self.__warmup_steps = warmup_steps
        self.__embedding_size = embedding_size
        self.__warming_multiplier = warming_multiplier
        self.__decaying_multiplier = decaying_multiplier
        self.__multiplicative_factor = multiplicative_factor
        super().__init__(optimizer, last_epoch)

    def __scale_at(self, step: int) -> float:
        step = max(step, 1)
        return (
            self.__multiplicative_factor
            * (self.__embedding_size**self.__decaying_multiplier)
            * min(
                step**self.__decaying_multiplier,
                step * (self.__warmup_steps**self.__warming_multiplier),
            )
        )

    @override
    def get_lr(self) -> list[float]:
        torch.optim.lr_scheduler._warn_get_lr_called_within_step(self)
        step = max(self.last_epoch, 1)
        scale = self.__scale_at(step)
        return [base_lr * scale for base_lr in self.base_lrs]

    def _get_closed_form_lr(self):
        step = max(self.last_epoch, 1)
        scale = self.__scale_at(step)
        return [base_lr * scale for base_lr in self.base_lrs]
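The scale reproduces the schedule from "Attention Is All You Need": lr(step) = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5), applied multiplicatively to each base learning rate. A quick sketch that prints the warm-up and decay phases (the dummy parameter and sizes are illustrative):

import torch

params = [torch.nn.Parameter(torch.zeros(1))]
optim = torch.optim.AdamW(params, lr=1.0)  # base_lr = 1.0 exposes the raw scale
sched = WarmupLR(optim, warmup_steps=4000, embedding_size=256)
for step in range(1, 16001):
    optim.step()
    sched.step()
    if step in (1, 1000, 4000, 8000, 16000):
        print(step, optim.param_groups[0]["lr"])  # rises until step 4000, then decays ~ step^-0.5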


@@ -5,6 +5,7 @@ from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention
from .SpannedMasker import SpannedMasker
from .DeToken import DeToken
from .WarmupLR import WarmupLR
__all__ = [
"Decoder",
@@ -12,5 +13,6 @@ __all__ = [
"FeedForwardNetwork",
"TorchMultiHeadAttention",
"SpannedMasker",
"DeToken"
"DeToken",
"WarmupLR"
]


@@ -0,0 +1,33 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import DeToken
class NanoSocraDecoder(torch.nn.Module):
    def __init__(
        self,
        decoder_embedder: Embedder.NanoSocratesEmbedder,
        decoder_layers: torch.nn.Sequential,
        detokener: DeToken
    ) -> None:
        super().__init__()
        self.__decoder_embedder = decoder_embedder
        self.__decoder = decoder_layers
        self.__detokener = detokener

    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
        decoder_embedder_input, prefix_mask, tgt_padding = args
        decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
        decoder_output, _, _, _, _, _ = self.__decoder(
            (decoder_tensor, decoder_tensor, decoder_tensor, prefix_mask, tgt_padding, True)
        )
        logits: torch.Tensor = self.__detokener(decoder_output)
        return logits


@@ -0,0 +1,29 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import DeToken
class NanoSocratEncoder(torch.nn.Module):
    def __init__(
        self,
        encoder_embedder: Embedder.NanoSocratesEmbedder,
        encoder_layers: torch.nn.Sequential,
        detokener: DeToken
    ) -> None:
        super().__init__()
        self.__encoder_embedder = encoder_embedder
        self.__encoder = encoder_layers
        self.__detokener = detokener

    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
        encoder_embedder_input, src_padding = args
        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
        encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
        logits: torch.Tensor = self.__detokener(encoder_output)
        return logits


@@ -0,0 +1,219 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import Encoder, Decoder, DeToken
from ..Utils import get_decoder_input
from Project_Model.Libs.Batch import TaskType
class NanoSocratesCore(torch.nn.Module):
def __init__(
self,
vocabulary_size: int,
sentence_max_length: int,
sos: int,
pad: int,
eos: int,
continuerdf: int,
latent_space: int = 256,
feed_forward_multiplier: int = 4,
attention_heads: int = 4,
layer_number: int = 2,
) -> None:
super().__init__()
self.__sos = sos
self.__pad = pad
self.__eos = eos
self.__continuerdf = continuerdf
self.__sentence_len = sentence_max_length
feed_forward_latent_space = latent_space * feed_forward_multiplier
self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
vocabulary_size, latent_space
)
self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
vocabulary_size, latent_space
)
# NB: [Encoder(...)] * layer_number would register the SAME layer object
# layer_number times (shared weights); build independent layers instead
TMP_ENCODERS = [Encoder(latent_space, feed_forward_latent_space, attention_heads) for _ in range(layer_number)]
TMP_DECODERS = [Decoder(latent_space, feed_forward_latent_space, attention_heads) for _ in range(layer_number)]
self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
self.__detokener = DeToken(latent_space, vocabulary_size)
self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
)
logits: torch.Tensor = self.__detokener(decoder_output)
return logits
def inference(self, input: tuple[torch.Tensor, torch.Tensor], task_type: TaskType) -> torch.Tensor:
if task_type == TaskType.MASKING:
return self.__masking(input)
if task_type == TaskType.COMPLETATION:
return self.__continue_rdf(input)
return self.__text_generation(input)
def __text_generation(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
x, padding = args
encoder_tensor = self.__encoder_embedder(x)
BATCH: int
if len(x.shape) > 2:
BATCH, SEQ_LEN, _ = x.shape
else:
_, SEQ_LEN = x.shape
BATCH = 1
encoder_output, _ = self.__encoder((encoder_tensor, padding))
decoder_in = get_decoder_input(BATCH, self.__sos, self.__pad, SEQ_LEN)
decoder_in_pad_mask = decoder_in.eq(self.__pad)
continue_generating = True
token_idx = 0
while continue_generating:
decoder_in_x = self.__decoder_embedder(decoder_in)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_in_x, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
)
logits: torch.Tensor = self.__detokener(decoder_output)
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits, 2)
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
if token_idx == self.__sentence_len - 1:
continue_generating = False
continue
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
continue_generating = False
continue
token_idx += 1
return decoder_in
def __masking(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
x, padding = args
encoder_tensor = self.__encoder_embedder(x)
x, _ = self.__encoder((encoder_tensor, padding))
logits: torch.Tensor = self.__encoder_detokener(x)
del x
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits, 2)
return tokens
def __continue_rdf(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
decoder_in, _ = args
decoder_in_prefix_mask = decoder_in.eq(self.__pad)
decoder_in_pad_mask = decoder_in.eq(self.__pad)
continue_generating = True
token_idx: int= int((decoder_in[0] == self.__continuerdf).nonzero()[0].item()) + 1
while continue_generating:
decoder_x = self.__decoder_embedder(decoder_in)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_x, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, True)
)
logits: torch.Tensor = self.__detokener(decoder_output)
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits, 2)
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
if token_idx == self.__sentence_len - 1:
continue_generating = False
continue
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
continue_generating = False
continue
token_idx += 1
return decoder_in
def take_pieces(self):
return (
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
(self.__decoder_embedder, self.__decoder, self.__detokener)
)
def load_pieces(
self,
encoder_embedder: Embedder.NanoSocratesEmbedder,
decoder_embedder: Embedder.NanoSocratesEmbedder,
encoder: torch.nn.Sequential,
decoder: torch.nn.Sequential,
encoder_detokener: DeToken,
decoder_detokener: DeToken
):
self.__encoder_embedder = encoder_embedder
self.__decoder_embedder = decoder_embedder
self.__encoder = encoder
self.__decoder = decoder
self.__encoder_detokener = encoder_detokener
self.__detokener = decoder_detokener
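A construction-and-inference sketch; every id and size below is a placeholder, not a project value:

import torch

model = NanoSocratesCore(
    vocabulary_size=8000, sentence_max_length=256,
    sos=1, pad=0, eos=2, continuerdf=3,
)
model.eval()
x = torch.randint(4, 8000, (1, 256))  # one tokenized input sequence
padding = x.eq(0)                     # True where PAD
with torch.no_grad():
    tokens = model.inference((x, padding), TaskType.RDF2TXT)  # greedy decode -> [1, 256]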


@@ -36,20 +36,28 @@ class TrainingModel(torch.nn.Module):
self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
self.__detokener = DeToken(latent_space, vocabulary_size)
self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
encoder_embedder_input, padding_tensor, decoder_embedder_input = args
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
encoder_output, _ = self.__encoder((encoder_tensor, padding_tensor))
encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
decoder_output, _, _, _ = self.__decoder(
(decoder_tensor, encoder_tensor, encoder_tensor, None)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
)
logits: torch.Tensor = self.__detokener(decoder_output)
return logits
def take_pieces(self):
return (
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
(self.__decoder_embedder, self.__decoder, self.__detokener)
)


@@ -1,5 +1,11 @@
from .TrainingModel import TrainingModel
from .NanoSocratEncoder import NanoSocratEncoder
from .NanoSocraDecoder import NanoSocraDecoder
from .NanoSocrates import NanoSocratesCore
__all__ = [
"TrainingModel"
"TrainingModel",
"NanoSocratEncoder",
"NanoSocraDecoder",
"NanoSocratesCore"
]


@@ -3,6 +3,9 @@ from .task_type import TaskType
from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask
from .inference_masking import inference_masking
from .truncate_rdf_list import truncate_rdf_list
from .decode_out import tensor2token
from .decoder_input import get_decoder_input
__all__ = [
"TaskType",
@@ -13,5 +16,7 @@ __all__ = [
"create_padding_mask",
"normalize_sequence",
"inference_masking",
"truncate_rdf_list"
"truncate_rdf_list",
"tensor2token",
"get_decoder_input"
]


@@ -8,4 +8,23 @@ def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
def get_causal_attention_mask_batched(seq_len: int, batch_size: int ) -> torch.Tensor:
base_mask = get_causal_attention_mask(seq_len)
return base_mask.unsqueeze(0).expand(batch_size, -1, -1) # add another dimension at the beginning, big as batch_size
# the result is z,x,y where x,y are repeated along z

def get_causal_attention_mask_with_prefix(seq_len, prefix):
    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
    mask[:, :prefix] = False
    return mask

def get_prefix_causal_mask_from_padding_mask(seq_len: int, prefix_mask, att_heads: int = 1):
    expanded_padding_mask = prefix_mask.unsqueeze(-1).repeat(1, 1, seq_len)  # B,T,T
    expanded_padding_mask = expanded_padding_mask.permute(0, 2, 1)           # B,T,T
    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)  # T,T
    tri_batched = mask.unsqueeze(0)  # 1,T,T will broadcast over B
    prefix_causal_mask = expanded_padding_mask & tri_batched
    prefix_causal_mask = prefix_causal_mask.repeat_interleave(att_heads, dim=0)  # B*H,T,T
    return prefix_causal_mask
#def get_prefix_causal_mask():
# continue_rdf =
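A toy check of the prefix-causal mask: columns whose padding-mask value is False (the prefix) stay visible from every position, while the tail is causal; True means "blocked":

import torch

prefix_mask = torch.tensor([[False, False, True, True]])  # first two tokens form the prefix
mask = get_prefix_causal_mask_from_padding_mask(4, prefix_mask, att_heads=1)
print(mask[0].int())
# tensor([[0, 0, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 0]])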


@@ -0,0 +1,27 @@
from typing import Generator
import torch
def tensor2token(tensor: torch.Tensor, end_token: int) -> Generator[list[int]]:
    if len(tensor.shape) < 1 or len(tensor.shape) > 2:
        raise ValueError("Shape is not correct")

    if len(tensor.shape) == 1:
        token_list: list[int] = tensor.tolist()
        token_list.append(end_token)
        yield token_list
        return

    batch_len: int
    batch_len, _ = tensor.shape

    for i in range(batch_len):
        smaller_tensor = tensor[i, :]
        token_list: list[int] = smaller_tensor.tolist()
        token_list.append(end_token)
        yield token_list
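Example: each batch row is yielded as a list and terminated with END (ids invented):

import torch

batch = torch.tensor([[5, 6], [7, 8]])
print(list(tensor2token(batch, end_token=2)))  # [[5, 6, 2], [7, 8, 2]]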


@@ -0,0 +1,14 @@
import torch
from ..Utils import normalize_sequence
# from Project_Model.Libs.Embedder import NanoSocratesEmbedder as Embedder
def get_decoder_input(batch_size, sos_token, pad_token, seq_len):
    single_decoder_input, _ = normalize_sequence(
        [sos_token], seq_len, pad_token, end_token=0, add_ending=False
    )
    tensor_decoder_input = torch.tensor(single_decoder_input[:])
    # embedded_decoder_intput = embedder(tensor_decoder_input)
    batch_decoder_input = tensor_decoder_input.unsqueeze(0).repeat(batch_size, 1)
    return batch_decoder_input
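Example: a batch of <SOS>-initialised decoder inputs (token ids invented):

dec_in = get_decoder_input(batch_size=2, sos_token=1, pad_token=0, seq_len=5)
print(dec_in)
# tensor([[1, 0, 0, 0, 0],
#         [1, 0, 0, 0, 0]])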


@@ -1,17 +1,20 @@
def truncate_sequence(
sequence: list[int], truncate_at: int, end_token: int
sequence: list[int], truncate_at: int, end_token: int, add_ending: bool
) -> list[int]:
if len(sequence) < truncate_at - 1:
sequence.append(end_token)
if add_ending:
sequence.append(end_token)
return sequence
if len(sequence) < truncate_at:
sequence[-1] = end_token
if add_ending:
sequence[-1] = end_token
return sequence
TRUNCATED_SEQUENCE = sequence[:truncate_at]
TRUNCATED_SEQUENCE[-1] = end_token
if add_ending:
TRUNCATED_SEQUENCE[-1] = end_token
return TRUNCATED_SEQUENCE
@@ -48,8 +51,9 @@ def normalize_sequence(
max_length: int,
pad_token: int,
end_token: int,
add_ending: bool = True
) -> tuple[list[int], list[bool]]:
new_sequence = truncate_sequence(sequence, max_length, end_token)
new_sequence = truncate_sequence(sequence, max_length, end_token, add_ending)
new_sequence = pad_sequence(new_sequence, max_length, pad_token)
PADDING_MASK = create_padding_mask(new_sequence, pad_token)
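With add_ending now optional, the same helper emits either a terminated target or an untouched decoder prefix. Illustrative ids, assuming the returned mask marks PAD positions as True (the convention used elsewhere in the repo):

print(normalize_sequence([5, 6, 7], 6, pad_token=0, end_token=2))
# ([5, 6, 7, 2, 0, 0], [False, False, False, False, True, True])
print(normalize_sequence([5, 6, 7], 6, pad_token=0, end_token=2, add_ending=False))
# ([5, 6, 7, 0, 0, 0], [False, False, False, True, True, True])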


@@ -1,6 +1,7 @@
from enum import Enum, auto
class TaskType(Enum):
TEXT2RDF = auto()
RDF2TEXT = auto()
MASK = auto()
COMPLETATION = auto()


@@ -27,7 +27,6 @@ def truncate_rdf_list(
END_OF_TRIPLES.append(i + 1)
TRIPLES_TOKENS: list[int] = []
TARGET_TRIPLES: list[int] = []
start_of_triple = 0
exit_loop = False
@@ -56,10 +55,10 @@ def truncate_rdf_list(
EOT = END_OF_TRIPLES.popleft()
TRIPLE = sequence[start_of_triple:EOT]
TARGET_TRIPLES.extend(TRIPLE)
TRIPLES_TOKENS.extend(TRIPLE)
start_of_triple = EOT
return (TRIPLES_TOKENS, TARGET_TRIPLES)
return (TRIPLES_TOKENS, TRIPLES_TOKENS)


@@ -1,7 +1,9 @@
from .Classes import *
from .Enums import *
from .Utils import *
from .Models import *
from . import Classes
from . import Enums
from . import Utils
from . import Models


@@ -0,0 +1,6 @@
from enum import Enum, auto
class ModelType(Enum):
    ENCODER_ONLY = auto()
    DECODER_ONLY = auto()


@@ -0,0 +1,14 @@
from .model_utils import decompose_nano_socrates, create_standalone_model, train2inference
from .ModelType import ModelType
from .decode_batch import decode_batch
from .metrics import precision, recall, accuracy, f1, meteor, bleu, rouge, average, rdf2txt, txt2rdf, rdf_completion_1, rdf_completion_2, remove_padding, balance_paddings
__all__ = [
"ModelType",
"decompose_nano_socrates",
"create_standalone_model",
"decode_batch",
"train2inference",
"precision", "recall", "accuracy", "f1", "meteor", "bleu", "rouge", "average",
"rdf2txt", "txt2rdf", "rdf_completion_1", "rdf_completion_2", "remove_padding", "balance_paddings"
]


@@ -0,0 +1,16 @@
import torch
import Project_Model.Libs.BPE as BPE
def decode_batch(batch: torch.Tensor, tokenizer: BPE.TokeNanoCore, unknown_token: int) -> list[str]:
    strings = []
    BATCH, _ = batch.shape
    for i in range(0, BATCH):
        tokens: list[int] = batch.tolist()[i]
        tokens = list(map(lambda x: unknown_token if x > tokenizer.vocabulary_size else x, tokens))
        strings.append(tokenizer.decode(tokens))
    return strings
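Sketch: ids above the tokenizer's range are squashed to unknown_token before decoding; the stub tokenizer stands in for TokeNanoCore:

import torch

class StubTokenizer:  # only the two members decode_batch touches
    vocabulary_size = 100
    def decode(self, ids: list[int]) -> str:
        return " ".join(map(str, ids))

batch = torch.tensor([[12, 7, 900], [3, 4, 5]])
print(decode_batch(batch, StubTokenizer(), unknown_token=0))  # ['12 7 0', '3 4 5']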


@@ -0,0 +1,100 @@
import evaluate  # deliberately not aliased to `eval`, which would shadow the builtin
# corpus-level metrics are loaded once at import time;
# the classification-style metrics below are loaded lazily on each call
BLEU = evaluate.load("bleu")
ROUGE = evaluate.load("rouge")
METEOR = evaluate.load("meteor")

def precision(ref: list[int], pred: list[int]):
    metric = evaluate.load("precision")
    return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)

def recall(ref: list[int], pred: list[int]):
    metric = evaluate.load("recall")
    return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)

def accuracy(ref: list[int], pred: list[int]):
    metric = evaluate.load("accuracy")
    return metric.compute(predictions=pred, references=ref)

def meteor(ref: list[str], pred: list[str]):
    return METEOR.compute(predictions=pred, references=ref)

def bleu(ref: list[str], pred: list[str]):
    return BLEU.compute(predictions=pred, references=ref)

def rouge(ref: list[str], pred: list[str]):
    return ROUGE.compute(predictions=pred, references=ref)

def f1(precision: float, recall: float):
    # clamp the divisor to avoid division by zero when both inputs are 0
    divisor = max((precision + recall), 1E-5)
    return (2 * recall * precision) / divisor

def average(array: list[float]):
    return sum(array) / len(array)

def rdf2txt(ref: list[str], pred: list[str]):
    b_m = bleu(ref, pred)
    r_m = rouge(ref, pred)
    m_m = meteor(ref, pred)
    return (b_m, r_m, m_m)

def txt2rdf(ref: list[int], pred: list[int]):
    p_m = precision(ref, pred)
    r_m = recall(ref, pred)
    return (p_m, r_m)

def rdf_completion_1(ref: list[int], pred: list[int]):
    a_m = accuracy(ref, pred)
    return a_m

def rdf_completion_2(ref: list[int], pred: list[int]):
    p_m = precision(ref, pred)
    r_m = recall(ref, pred)
    return (p_m, r_m)

def remove_padding(seq: list[int], pad_token: int, end_token: int):
    clean_seq = list(filter(lambda x: x != pad_token, seq))
    # guard against fully padded sequences before indexing the last element
    if clean_seq and clean_seq[-1] == end_token:
        return clean_seq
    clean_seq.append(end_token)
    return clean_seq

def balance_paddings(seq_1: list[int], seq_2: list[int], pad_token: int):
    SEQ_1_LEN = len(seq_1)
    SEQ_2_LEN = len(seq_2)
    if SEQ_1_LEN > SEQ_2_LEN:
        PAD = [pad_token] * (SEQ_1_LEN - SEQ_2_LEN)
        seq_2.extend(PAD)
    if SEQ_2_LEN > SEQ_1_LEN:
        seq_2 = seq_2[:SEQ_1_LEN]
    return (seq_1, seq_2)
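
Worked example of the f1 helper; the evaluate metrics above return dicts such as {"precision": 0.75}, so callers unpack the float first (the values here are invented):

def f1(precision: float, recall: float) -> float:
    divisor = max(precision + recall, 1e-5)
    return (2 * recall * precision) / divisor

p, r = 0.75, 0.60          # e.g. precision(...)["precision"], recall(...)["recall"]
print(round(f1(p, r), 4))  # 0.6667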

View File

@@ -0,0 +1,72 @@
import torch
from Project_Model.Libs.Embedder import NanoSocratesEmbedder
from Project_Model.Libs.Transformer import TrainingModel,NanoSocratesCore, NanoSocraDecoder, NanoSocratEncoder, DeToken, Encoder, Decoder
from .ModelType import ModelType
def decompose_nano_socrates(
    model: TrainingModel | NanoSocratesCore, vocabulary_size: int, embedding_size: int
) -> tuple[TrainingModel | NanoSocratesCore, NanoSocratEncoder, NanoSocraDecoder]:
    encoder_pieces, decoder_pieces = model.take_pieces()
    encoder_embedder, encoder, encoder_detokener = encoder_pieces
    decoder_embedder, decoder, decoder_detokener = decoder_pieces
    return (
        model,
        NanoSocratEncoder(encoder_embedder, encoder, encoder_detokener),
        NanoSocraDecoder(decoder_embedder, decoder, decoder_detokener),
    )

def train2inference(
    train_model: TrainingModel,
    inference_model: NanoSocratesCore
) -> NanoSocratesCore:
    encoder_pieces, decoder_pieces = train_model.take_pieces()
    enc_emb, encoder, enc_det = encoder_pieces
    dec_emb, decoder, dec_det = decoder_pieces
    inference_model.load_pieces(
        enc_emb,
        dec_emb,
        encoder,
        decoder,
        enc_det,
        dec_det
    )
    return inference_model

def create_standalone_model(
    model_type: ModelType,
    vocabulary_size: int,
    latent_space: int = 256,
    feed_forward_multiplier: int = 4,
    attention_heads: int = 4,
    layer_number: int = 2,
) -> NanoSocratEncoder | NanoSocraDecoder:
    feed_forward_latent_space = latent_space * feed_forward_multiplier
    embedder = NanoSocratesEmbedder(vocabulary_size, latent_space)
    detokener = DeToken(latent_space, vocabulary_size)
    if model_type == ModelType.ENCODER_ONLY:
        # build independent layers: multiplying a one-element list would reuse the
        # same module instance and silently share weights across every layer
        TMP_ENCODERS = [
            Encoder(latent_space, feed_forward_latent_space, attention_heads)
            for _ in range(layer_number)
        ]
        encoder = torch.nn.Sequential(*TMP_ENCODERS)
        return NanoSocratEncoder(embedder, encoder, detokener)
    TMP_DECODERS = [
        Decoder(latent_space, feed_forward_latent_space, attention_heads)
        for _ in range(layer_number)
    ]
    decoder = torch.nn.Sequential(*TMP_DECODERS)
    return NanoSocraDecoder(embedder, decoder, detokener)
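
Why the list comprehension matters: multiplying a one-element list of modules shares parameters, as this self-contained check shows:

import torch

layer = torch.nn.Linear(4, 4)
shared = torch.nn.Sequential(*([layer] * 2))                                   # same module twice
independent = torch.nn.Sequential(*[torch.nn.Linear(4, 4) for _ in range(2)])  # two modules

# parameters() deduplicates shared tensors: one weight matrix vs two
print(sum(p.numel() for p in shared.parameters()))       # 20
print(sum(p.numel() for p in independent.parameters()))  # 40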

View File

@@ -2,3 +2,4 @@ from . import BPE
 from . import Embedder
 from . import Transformer
 from . import TorchShims
+from . import TransformerUtils

File diff suppressed because it is too large

View File

@@ -1,7 +1,7 @@
 import pandas as pd
 # do not worry about circular dependencies, this class will never call something else
-from Scripts.DataCleaning.filter import PipelineApplier
+from Scripts.DataCleaning.legacy.filter import PipelineApplier
 class RDF_mask_task_dataset():
     """

View File

@@ -0,0 +1,29 @@
import pandas as pd
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    # 1) Read and shuffle rows with a fixed seed for reproducibility
    df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)
    # 2) Turn the three inputs into proportions relative to their sum
    total = train + val + test  # the percentages need not sum to 100; they are normalized here
    n = len(df)
    n_train = int(n * train / total)  # floor to keep indices integral
    n_val = int(n * val / total)
    # 3) Give the remainder to test to ensure every row is assigned
    #    (this naturally absorbs any rounding loss)
    train_df = df.iloc[:n_train].reset_index(drop=True)
    val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
    test_df = df.iloc[n_train + n_val:].reset_index(drop=True)
    return train_df, val_df, test_df
# usage:
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"
train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)
# note: to_csv also writes the DataFrame index as an extra column; pass index=False if that is unwanted
train_df.to_csv(TRAIN)
val_df.to_csv(EVALUATION)
test_df.to_csv(TEST)
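
A quick sanity check of the remainder-absorbing split (hypothetical row count):

n, train, val, test = 1003, 80, 10, 10
total = train + val + test
n_train = int(n * train / total)      # 802 (floored)
n_val = int(n * val / total)          # 100 (floored)
n_test = n - n_train - n_val          # 101 absorbs the rounding loss
print(n_train + n_val + n_test == n)  # True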

View File

@@ -0,0 +1,381 @@
# This file removes unwanted relationships from the pipeline according to different rules
# -----------------------------------------------------------------------------
# SQL-FIRST VERSION
# -----------------------------------------------------------------------------
# In the original (pandas) version this module:
# - stored frequency filters in DataFrames,
# - filtered/cleaned DataFrames in-memory,
# - added special tokens via string ops,
# - rebuilt one row per movie using groupby/aggregation.
#
# In this rewrite:
# - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
# - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
# composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
# - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
#
# Notes:
# - We keep the same CLASS and METHOD NAMES to preserve call sites.
# - Method comments/docstrings from your original file are carried over and updated
# to reflect Select-based behavior and return types.
# - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
# - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
# swap with an equivalent string-agg function.
# -----------------------------------------------------------------------------
from __future__ import annotations
from typing import Optional
from sqlalchemy import select, func, literal
from sqlalchemy.sql import Select
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
class PipelineApplier():
"""
SQL-first pipeline applier.
In the pandas version, frequency filters were stored as DataFrames (self.MOVIE_FILTER / self.REL_FILTER)
and every method worked with/returned pandas.DataFrame. In this SQLAlchemy rewrite:
- self.MOVIE_FILTER and self.REL_FILTER become *subselects* (Select objects) that yield a single
column each (MovieID or RelationshipURI). These subselects can be applied via `WHERE IN (subquery)`.
- Every method that previously returned a DataFrame now returns a *Select* that represents the same
logical transformation, but pushed into the database engine.
- Comments and docstrings are updated to reflect SQL semantics while preserving your original intent.
"""
def __init__(self):
# In the pandas version these were DataFrames storing allowed keys.
# Here they are Select objects (single-column subselects) or None.
# Expected column names:
# - self.MOVIE_FILTER: "MovieID"
# - self.REL_FILTER: "RelationshipURI"
self.MOVIE_FILTER: Optional[Select] = None
self.REL_FILTER: Optional[Select] = None
# -------------------------------------------------------------------------
# Relationship deletion
# -------------------------------------------------------------------------
def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
"""
Return a Select where rows having the given relationship URI are removed.
Original signature (pandas):
def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame
Updated behavior:
- RDF is a Select with columns: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
- We apply a WHERE clause: RelationshipURI != <uri>
- Returns a Select you can continue composing.
Args:
RDF (Select): a selectable representing the RDF joined view
uri (str): RelationshipURI to exclude
Returns:
Select: filtered selectable (no execution yet)
"""
sc = RDF.selected_columns
return RDF.where(sc.RelationshipURI != literal(uri))
# -------------------------------------------------------------------------
# Frequency filter: MOVIE
# -------------------------------------------------------------------------
def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
"""
You MUST call this before filtering by movie frequency [filter_by_frequency_movie_id()],
since this method creates that filter.
Original behavior:
- Input MOVIE_COUNT as DataFrame ["MovieID","Count"]
- Keep rows where Count in [min_treshold, max_treshold)
- Store the filtered keys in self.MOVIE_FILTER
Updated behavior (SQL):
- MOVIE_COUNT is a Select that yields ["MovieID","Count"].
- We build and store a *subselect* of allowed MovieID (single column) to be used by WHERE IN.
- No query is executed here; we only create a new Select.
Args:
MOVIE_COUNT (Select): yields columns MovieID, Count
min_treshold (int):
max_treshold (int):
"""
sc = MOVIE_COUNT.selected_columns
filtered = MOVIE_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
# Keep only the key column so it can be used in an IN (subquery)
self.MOVIE_FILTER = select(filtered.selected_columns.MovieID)
# -------------------------------------------------------------------------
# Frequency filter: RELATIONSHIP
# -------------------------------------------------------------------------
def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
"""
Original behavior:
- Input REL_COUNT as DataFrame ["RelationshipURI","Count"]
- Keep rows where Count in [min_treshold, max_treshold)
- Store the filtered keys in self.REL_FILTER
Updated behavior (SQL):
- REL_COUNT is a Select that yields ["RelationshipURI","Count"].
- We build and store a *subselect* of allowed RelationshipURI (single column) to be used by WHERE IN.
- No query is executed here; we only create a new Select.
Args:
REL_COUNT (Select): yields columns RelationshipURI, Count
min_treshold (int):
max_treshold (int):
"""
sc = REL_COUNT.selected_columns
filtered = REL_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
self.REL_FILTER = select(filtered.selected_columns.RelationshipURI)
# -------------------------------------------------------------------------
# Apply frequency filters
# -------------------------------------------------------------------------
def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
"""
Original behavior (pandas):
RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
Updated behavior (SQL):
- If self.MOVIE_FILTER is present, apply: WHERE MovieID IN ( <subselect> )
- Otherwise, return RDF unchanged.
Args:
RDF (Select): current dataset
Returns:
Select: filtered dataset (or unchanged if no filter exists)
"""
if self.MOVIE_FILTER is None:
return RDF
sc = RDF.selected_columns
return RDF.where(sc.MovieID.in_(self.MOVIE_FILTER))
def filter_by_frequency_relationship(self, RDF: Select) -> Select:
"""
Original behavior (pandas):
RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
Updated behavior (SQL):
- If self.REL_FILTER is present, apply: WHERE RelationshipURI IN ( <subselect> )
- Otherwise, return RDF unchanged.
Args:
RDF (Select): current dataset
Returns:
Select: filtered dataset (or unchanged if no filter exists)
"""
if self.REL_FILTER is None:
return RDF
sc = RDF.selected_columns
return RDF.where(sc.RelationshipURI.in_(self.REL_FILTER))
# -------------------------------------------------------------------------
# Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
# -------------------------------------------------------------------------
def rdf_add_special_token(self, RDF: Select) -> Select:
"""
Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI,
OBJ to ObjectURI, REL to RelationshipURI. Check
Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
It only adds the special token of the three elements of the RDF; no other special token.
Original behavior (pandas):
- String concatenation with columns in a DataFrame.
- Returned a new DataFrame.
Updated behavior (SQL):
- Build projected columns using SQL string concatenation.
- Return a new Select with the same output column names:
["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"].
Args:
RDF (Select): current dataset
Returns:
Select: projected dataset with tokenized SubjectURI/RelationshipURI/ObjectURI
"""
sc = RDF.selected_columns
subj_tok = literal(SpecialToken.SUBJECT.value) + sc.SubjectURI
rel_tok = literal(SpecialToken.RELATIONSHIP.value) + sc.RelationshipURI
obj_tok = literal(SpecialToken.OBJECT.value) + sc.ObjectURI
return RDF.with_only_columns(
sc.MovieID.label("MovieID"),
subj_tok.label("SubjectURI"),
rel_tok.label("RelationshipURI"),
obj_tok.label("ObjectURI"),
sc.Abstract.label("Abstract"),
)
# -------------------------------------------------------------------------
# NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
# -------------------------------------------------------------------------
def drop_na_from_dataset(self, RDF: Select) -> Select:
"""
Dataset has SubjectURI, RelationshipURI, ObjectURI. We want to drop rows
where any of these is empty or NULL.
Original behavior (pandas):
- Replace '' with NaN and dropna on the three columns.
Updated behavior (SQL):
- Apply WHERE clauses checking for NOT NULL and not empty string.
Args:
RDF (Select): current dataset
Returns:
Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
"""
sc = RDF.selected_columns
return RDF.where(
(sc.SubjectURI.is_not(None)) & (sc.SubjectURI != "") &
(sc.RelationshipURI.is_not(None)) & (sc.RelationshipURI != "") &
(sc.ObjectURI.is_not(None)) & (sc.ObjectURI != "")
)
# -------------------------------------------------------------------------
# Rebuild by movie (one row per movie)
# -------------------------------------------------------------------------
def rebuild_by_movie(self, RDF: Select) -> Select:
"""
To execute this method you must conceptually have iterated by movie_id,
because by design we want one row per movie at the end.
Original behavior (pandas):
- Build per-row "Triple" as SubjectURI + RelationshipURI + ObjectURI,
wrapped with START_TRIPLE/END_TRIPLE.
- Group by ["MovieID", "Abstract"] and join ("".join) all Triple strings into one.
- Prefix the whole list with START_TRIPLE_LIST and Abstract with ABSTRACT.
- Return DataFrame [["MovieID","Triple","Abstract"]].
Updated behavior (SQL):
- Build per-row Triple using SQL string concatenation and constants.
- Use GROUP_CONCAT (empty separator) to aggregate per-movie.
- Prefix with START_TRIPLE_LIST and ABSTRACT in SQL.
- Return a Select with columns: ["MovieID","Triple","Abstract"].
Args:
RDF (Select): current dataset with columns
MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
Returns:
Select: aggregated dataset with one row per movie
"""
sc = RDF.selected_columns
# Per-row triple with START/END_TRIPLE tokens
row_triple = (
literal(SpecialToken.START_TRIPLE.value) +
(sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
literal(SpecialToken.END_TRIPLE.value)
).label("Triple")
# Prefixed abstract
abstract_tok = (literal(SpecialToken.ABSTRACT.value) + sc.Abstract).label("Abstract")
# Subquery of per-row triples / abstracts
row_view = RDF.with_only_columns(
sc.MovieID.label("MovieID"),
row_triple,
abstract_tok,
).subquery()
# Concatenate all triples for each movie (SQLite syntax; adjust for other DBs)
triple_concat = (
literal(SpecialToken.START_TRIPLE_LIST.value) +
func.group_concat(row_view.c.Triple, literal(""))
).label("Triple")
return (
select(
row_view.c.MovieID.label("MovieID"),
triple_concat,
row_view.c.Abstract.label("Abstract"),
)
.group_by(row_view.c.MovieID, row_view.c.Abstract)
)
# -------------------------------------------------------------------------
# Build triple(s) projection
# -------------------------------------------------------------------------
@staticmethod
def build_triple(RDF: Select) -> Select:
"""
Obtains joined RDF triple in one element, together with START and END special tokens.
Original behavior (pandas):
- Returned a Series/DataFrame column "Triple" built from three string columns.
Updated behavior (SQL):
- Returns a Select with a single column "Triple" built in SQL.
Args:
RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
Select: a projection containing one column named "Triple"
"""
sc = RDF.selected_columns
triple = (
literal(SpecialToken.START_TRIPLE.value) +
(sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
literal(SpecialToken.END_TRIPLE.value)
).label("Triple")
return RDF.with_only_columns(triple)
@staticmethod
def build_incomplete_triple(RDF: Select) -> Select:
"""
Method helper used for the third task: "Predicting a masked component within an RDF triple".
Obtains joined RDF triple in one element, together with START and END special tokens.
The MISSING element will be replaced by the special token <MASK>.
Original behavior (pandas):
- Created a Series "Triple" using fallback values for missing columns.
Updated behavior (SQL):
- Uses COALESCE to replace NULLs with <MASK> directly in SQL.
- Returns a Select with a single column "Triple".
Args:
RDF (Select): 2 of the following columns present ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
Select: projection with column "Triple"
"""
sc = RDF.selected_columns
mask = literal(SpecialToken.MASK.value)
triple = (
literal(SpecialToken.START_TRIPLE.value) +
(func.coalesce(sc.SubjectURI, mask) +
func.coalesce(sc.RelationshipURI, mask) +
func.coalesce(sc.ObjectURI, mask)) +
literal(SpecialToken.END_TRIPLE.value)
).label("Triple")
return RDF.with_only_columns(triple)
@staticmethod
def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
"""
Currently not used.
Original intention:
Given two DataFrames (one incomplete RDF and another with just the missing component),
apply special tokens accordingly.
Updated note:
This stub remains for API parity. If needed in the future, it can be implemented
as a Select-building helper that merges/COALESCEs columns from different selects.
"""
return None
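
A self-contained sketch of the Select-composition pattern this class relies on, assuming SQLAlchemy 2.x; the table and rows below are invented for illustration:

from sqlalchemy import Column, Integer, String, create_engine, select
from sqlalchemy.orm import DeclarativeBase, Session

class Base(DeclarativeBase):
    pass

class RDF(Base):
    __tablename__ = "rdfs"
    id = Column(Integer, primary_key=True)
    MovieID = Column(Integer)
    RelationshipURI = Column(String)

engine = create_engine("sqlite://")
Base.metadata.create_all(engine)

with Session(engine) as session:
    session.add_all([RDF(MovieID=1, RelationshipURI="dbp-dbp:director"),
                     RDF(MovieID=2, RelationshipURI="dbp-dbp:image")])
    session.commit()
    view = select(RDF.MovieID, RDF.RelationshipURI)           # the "dataview"
    sc = view.selected_columns
    view = view.where(sc.RelationshipURI != "dbp-dbp:image")  # composes lazily, nothing runs yet
    print(session.execute(view).all())                        # [(1, 'dbp-dbp:director')]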

View File

@@ -0,0 +1,148 @@
# This file removes unwanted relationships from the pipeline according to different rules
import pandas as pd
import sqlite3 # kept for compatibility
import numpy as np
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class PipelineApplier:
def __init__(self):
# Fast internal caches for O(1) membership checks
self._MOVIE_FILTER_SET = set()
self._REL_FILTER_SET = set()
# ------------------------------
# Filters
# ------------------------------
def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
# Vectorized boolean mask
return RDF.loc[RDF["RelationshipURI"] != uri]
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
"""
You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
since this method creates that filter.
Args:
MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
"""
sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())
def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())
def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
# Set-backed isin is the fastest path
return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]
def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]
# ------------------------------
# Cleaning & preprocessing
# ------------------------------
def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""
Adds RDF special token to SubjectURI / RelationshipURI / ObjectURI.
Returns a new DataFrame (no inplace modification of the caller's object).
"""
subj = np.char.add(SpecialToken.SUBJECT.value, RDF["SubjectURI"].to_numpy(dtype=object))
rel = np.char.add(SpecialToken.RELATIONSHIP.value, RDF["RelationshipURI"].to_numpy(dtype=object))
obj = np.char.add(SpecialToken.OBJECT.value, RDF["ObjectURI"].to_numpy(dtype=object))
return RDF.assign(SubjectURI=subj, RelationshipURI=rel, ObjectURI=obj)
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""
Replace '' with NaN only on key columns, then drop rows missing any of them.
"""
cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
rdf = RDF.copy()
for c in cols:
m = rdf[c] == ""
if m.any():
rdf.loc[m, c] = np.nan
return rdf.dropna(subset=cols)
# ------------------------------
# Building triples
# ------------------------------
@staticmethod
def build_triple(RDF: pd.DataFrame):
"""
Obtains joined RDF triple in one element, together with START and END special token.
Returns:
pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
"""
start = SpecialToken.START_TRIPLE.value
end = SpecialToken.END_TRIPLE.value
subj = RDF["SubjectURI"].to_numpy(dtype=object)
rel = RDF["RelationshipURI"].to_numpy(dtype=object)
obj = RDF["ObjectURI"].to_numpy(dtype=object)
arr = np.char.add(np.char.add(np.char.add(start, subj),
np.char.add(rel, obj)),
end)
RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
return RDF["Triple"]
@staticmethod
def build_incomplete_triple(RDF: pd.DataFrame):
"""
Helper used for the third task: "Predicting a masked component within an RDF triple".
Accepts any subset of ["SubjectURI","RelationshipURI","ObjectURI"] (typically 2 of 3).
Missing components are replaced by <MASK>.
Returns:
pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
"""
start = SpecialToken.START_TRIPLE.value
end = SpecialToken.END_TRIPLE.value
maskv = SpecialToken.MASK.value
n = len(RDF.index)
subj = RDF["SubjectURI"].to_numpy(dtype=object) if "SubjectURI" in RDF else np.full(n, maskv, dtype=object)
rel = RDF["RelationshipURI"].to_numpy(dtype=object) if "RelationshipURI" in RDF else np.full(n, maskv, dtype=object)
obj = RDF["ObjectURI"].to_numpy(dtype=object) if "ObjectURI" in RDF else np.full(n, maskv, dtype=object)
arr = np.char.add(np.char.add(np.char.add(start, subj),
np.char.add(rel, obj)),
end)
RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
return RDF["Triple"]
def rebuild_by_movie(self, RDF: pd.DataFrame):
"""
Collapse triples + abstract into a single row per movie.
Returns: ["MovieID","Triple","Abstract"]
"""
# Build triples once (vectorized); method also sets RDF["Triple"]
triples = self.build_triple(RDF)
# Minimal frame for grouping (avoid carrying extra columns)
tmp = pd.DataFrame({
"MovieID": RDF["MovieID"].to_numpy(),
"Abstract": RDF["Abstract"].to_numpy(),
"Triple": triples.to_numpy(),
})
# Factorize high-cardinality keys to fast integer codes, group on codes,
# then map back to labels; sum concatenates strings for object dtype.
mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)
tmp["_mid"] = mid_codes
tmp["_abs"] = abs_codes
grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()
grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])
# Final tokens
grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"]
return grouped[["MovieID", "Triple", "Abstract"]]
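
Hedged usage sketch of the set-backed frequency filter above (toy numbers, assuming the class is importable):

import pandas as pd

applier = PipelineApplier()
rel_counts = pd.DataFrame({
    "RelationshipURI": ["dbp-dbp:director", "dbp-dbp:image"],
    "Count": [120, 7],
})
applier.generate_frequency_relationship_filter(rel_counts, min_threshold=50, max_threshold=1000)

rdf = pd.DataFrame({"MovieID": [1, 1],
                    "RelationshipURI": ["dbp-dbp:director", "dbp-dbp:image"]})
print(applier.filter_by_frequency_relationship(rdf))  # keeps only the director row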

View File

@@ -26,6 +26,7 @@ class PipelineApplier():
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
# def filter_movie_by_rel_uri_frequence()
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
"""

View File

@@ -1,6 +1,6 @@
 import re
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
-from Scripts.DataCleaning.filter import PipelineApplier
+from Scripts.DataCleaning.legacy.filter import PipelineApplier
 # tasks dataset builder
 from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
 from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
@@ -25,13 +25,16 @@ class Pipeline():
         MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
         REL_COUNT = self.sql_endpoint.get_relationship_count()
         self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
-        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
+        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069
         # prepare the filter on the relationshipURI you want to delete:
         relationship_uri_banned_list = [
             "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
             "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
             "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
-            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
+            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
+            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
+            "dbp-dbo:soundRecording"
+        ]
         self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

View File

@@ -0,0 +1,86 @@
# This file removes unwanted relationships from the pipeline according to different rules
import pandas as pd
import sqlite3
import numpy as np
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class PipelineApplier():
def __init__(self):
pass
def rdf_add_special_token(self, RDF: pd.DataFrame):
"""
Adds the RDF special token to each element of the tuple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
It only adds the special tokens of the three elements of the RDF, no other special token.
Args:
RDF (pd.DataFrame):
Returns:
pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
"""
# if a filter that ran earlier sliced the RDF and created a view, the copy below resolves the problem
# for more context: SettingWithCopyWarning
RDF = RDF.copy()
# at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
return RDF
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
RDF = RDF.replace('', np.nan)
# Drop rows where any of the key columns are NaN
RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
return RDF
def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
"""
Args:
RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
Returns:
pd.DataFrame: ["MovieID","Triple","Abstract"]
"""
# to execute this method you must have iterated by movie_id,
# because by design we want one row per movie at the end
# MovieID and abstract can be given as input for a more generic method
# first let's combine each row creating column triple as join of rdf
RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
# combine rows into one
# MovieID and Abstract map one-to-one
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
# add special token for: start of triple, end of triple and start of abstract
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]+SpecialToken.END_OF_SENTENCE.value
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
return RDF[["MovieID","Triple","Abstract"]]
@staticmethod
def build_triple(RDF: pd.DataFrame):
"""
Obtains the joined RDF triple in one element, together with the START and END special tokens
Args:
RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
Returns:
pd.DataFrame: RDF["Triple"] (just this column)
"""
# let's combine each row creating column triple as join of rdf
RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
# special token
RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
return RDF["Triple"]
def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
.str.replace(r"\r?\n+", ", ", regex=True) # newlines -> ", "
.str.replace(r"\*", "", regex=True)) # delete all asterisks
return RDF
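
Tiny demo of regex_on_objects' two rules on made-up values:

import pandas as pd

df = pd.DataFrame({"ObjectURI": ["line one\nline two", "star*red"]})
cleaned = (df["ObjectURI"].astype("string")
           .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
           .str.replace(r"\*", "", regex=True))        # drop asterisks
print(cleaned.tolist())  # ['line one, line two', 'starred']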

View File

@@ -0,0 +1,103 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class MovieFilter:
def __init__(self) -> None:
self.sql_endpoint = SqlEndpoint()
# first obtain all movie_id
movie_query = "SELECT MovieID FROM Movies"
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)
def frequency_filter(self, min_treshold:int, max_treshold:int):
movie_list_placeholder = ",".join(["?"] * len(self.MOVIE_FILTER))
filter_query = f"""
SELECT MovieID
FROM RDFs
WHERE MovieID IN ({movie_list_placeholder})
GROUP BY MovieID
HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
"""
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.MOVIE_FILTER["MovieID"].to_list()))
def get_movie_id(self):
return self.MOVIE_FILTER
def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
movie_list_placeholder = ",".join(["?"] * len(movie_ids))
filter_query = f"""
SELECT MovieID
FROM RDFs
JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
WHERE MovieID IN ({movie_list_placeholder})
GROUP BY MovieID
HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = '{parsed_rel_uri}' THEN 1 ELSE 0 END)
BETWEEN {min_treshold} AND {max_treshold};
"""
params = tuple(movie_ids)  # + (parsed_rel_uri, min_treshold, max_treshold) would bind these too; safer than the f-string interpolation above
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
def filter_by_director(self):
director_list = ['dbp-dbo:director','dbp-dbp:director']
movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
movie_list_placeholder = ",".join(["?"] * len(movie_ids))
filter_query = f"""
SELECT DISTINCT RDFs.MovieID
FROM RDFs
JOIN ParsedRelationships USING (RelationshipID)
WHERE RDFs.MovieID IN ({movie_list_placeholder})
AND ParsedRelationships.RelationshipURI IN {tuple(director_list)};
"""
params = tuple(movie_ids)
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
def filter_by_english_movies(self):
movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
movie_list_placeholder = ",".join(["?"] * len(movie_ids))
relationship = ["dbp-dbp:language"]
objects_list = ["English", "dbp-dbr:English_language"]
filter_query = f"""
SELECT DISTINCT RDFs.MovieID
FROM RDFs
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
WHERE RDFs.MovieID IN ({movie_list_placeholder})
AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
AND ParsedObjects.ObjectURI in {tuple(objects_list)};
"""
other_query = f"""
SELECT RDFs.MovieID
FROM RDFs
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
WHERE RDFs.MovieID IN ({movie_list_placeholder})
AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
GROUP BY RDFs.MovieID
HAVING
SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
AND
SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
"""
params = tuple(movie_ids)
# only `other_query` is executed (it also requires that no non-English language value remains);
# `filter_query` above is kept for reference
self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)
# movie_filter = MovieFilter()
# movie_filter.frequency_filter(5,10)

View File

@@ -0,0 +1,155 @@
from movie_filter import MovieFilter
from relationship_filter import RelationshipFilter
from rdf_filter import RdfFilter
from cleaner import PipelineApplier
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
import pandas as pd
RELATIONSHIP_FILTER_LIST = [
"dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
"dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
"w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
"dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
"dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
"dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
"dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
"dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
"dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
"dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
"dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
"w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
"dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
"dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
"dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
"dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa"
"dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
"dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
"dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
"dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
"dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
"dbp-dbp:website"
]
RELATIONSHIP_WHITE_LIST = [
"dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
]
"""
SELECT DISTINCT field3
FROM debug
"""
class Pipeline():
def __init__(self) -> None:
self._movie_filter = MovieFilter()
self._relationship_filter = RelationshipFilter()
self._rdf_filter = RdfFilter()
self._pipeline = PipelineApplier()
self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
self._movie_filter.frequency_filter(50,3000)
self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
def other_filter(self):
self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
self._movie_filter.filter_by_director()
self._movie_filter.filter_by_english_movies()
self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget
self._movie_filter.relation_filter("dbp-dbp:released",1,100) # to cut to 2000 :(
def _get_cleaned_movie_rows(self):
movie_ids = self._movie_filter.get_movie_id()
rel_ids = self._relationship_filter.get_relationship_id()
# rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)
for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
RDF = self._pipeline.drop_na_from_dataset(RDF)
RDF = self._pipeline.regex_on_objects(RDF)
RDF = self._pipeline.rdf_add_special_token(RDF)
if RDF.empty:
continue
yield RDF
def execute_task_bpe_corpus(self):
for RDF in self._get_cleaned_movie_rows():
RDF = self._pipeline.rebuild_by_movie(RDF)
RDF = RDF[["Triple","Abstract"]]
self.task_bpe_corpus.write_from_df(RDF)
self._end_file_handler()
def execute_tasks_rdf_text(self):
for RDF in self._get_cleaned_movie_rows():
RDF = self._pipeline.rebuild_by_movie(RDF)
self.task_rdf_text.write(RDF)
self._end_file_handler()
def execute_task_rdf_completation(self):
for RDF in self._get_cleaned_movie_rows():
RDF["Triple"] = self._pipeline.build_triple(RDF)
self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
self._end_file_handler()
def _end_file_handler(self):
self.task_bpe_corpus.close()
self.task_rdf_text.close()
self.task_rdf_completation.close()
def execute_all_task(self):
for RDF in self._get_cleaned_movie_rows():
completation_RDF = RDF.copy()
completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])
RDF = self._pipeline.rebuild_by_movie(RDF)
self.task_rdf_text.write(RDF)
self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])
self._end_file_handler()
def use_toy_dataset(self):
# CHOSEN MOVIES:
# The Dark Knight : 117248
# Inception : 147074
# The Avengers : 113621
# Cast Away : 1123
# The Departed : 117586
# American Psycho : 90177
# Avatar : 71587
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
# [106465,106466,106467,106468,106469,106470,106471,106472,106473]
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
def generate_csv_debug_file(self, debug_path:str):
debug_csv = Debug_csv(debug_path)
for RDF in self._get_cleaned_movie_rows():
debug_csv.write(RDF)
debug_csv.close()
pipe = Pipeline()
#pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")

View File

@@ -0,0 +1,32 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class RdfFilter:
def __init__(self) -> None:
self.sql_endpoint = SqlEndpoint()
# def delete_hypernym_when_movie(self):
# purl:linguistics/gold/hypernym
# is almost always "dbp-dbr:Movie" or "dbp-dbr:Film"
# banned triple
def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
relationship_placeholder = ",".join(["?"] * len(REL_ID))
param = tuple(REL_ID["RelationshipID"].to_list())
QUERY = f"""
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
WHERE MovieID = (?) AND RelationshipID IN ({relationship_placeholder});
"""
for movie_id in MOVIE_ID["MovieID"].to_list():
params = (movie_id,) + param
yield self.sql_endpoint.get_dataframe_from_query(QUERY, params=params)
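
Hedged driver sketch: the generator yields one per-movie DataFrame at a time, keeping memory flat (assumes the SQLite database behind SqlEndpoint is in place; the IDs are placeholders):

import pandas as pd

movie_ids = pd.DataFrame({"MovieID": [117248, 147074]})
rel_ids = pd.DataFrame({"RelationshipID": [3, 7, 42]})
rdf_filter = RdfFilter()
for frame in rdf_filter.yield_movie_abbreviated_rdfs(movie_ids, rel_ids):
    print(frame.shape)  # one DataFrame per movie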

View File

@@ -0,0 +1,54 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class RelationshipFilter:
def __init__(self) -> None:
self.sql_endpoint = SqlEndpoint()
# first obtain all relationship_id
relationship_query = "SELECT RelationshipID FROM Relationships"
self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)
def frequency_filter(self, min_treshold:int, max_treshold:int):
relationship_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
filter_query = f"""
SELECT RelationshipID
FROM RDFs
WHERE RelationshipID IN ({relationship_placeholder})
GROUP BY RelationshipID
HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
"""
self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()))
def get_relationship_id(self):
return self.RELATIONSHIP_FILTER
def get_relationship_id_from_white_list(self, relationship_list: list[str]):
ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
uri_placeholder = ",".join(["?"] * len(relationship_list))
filter_query = f"""
SELECT RelationshipID
FROM ParsedRelationships
WHERE RelationshipID IN ({ids_placeholder})
AND RelationshipURI IN ({uri_placeholder});
"""
params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list)
return self.sql_endpoint.get_dataframe_from_query(filter_query, params)
def delete_relationship_uri_by_list(self, filter_list: list[str]):
ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
uri_placeholder = ",".join(["?"] * len(filter_list))
filter_query = f"""
SELECT RelationshipID
FROM ParsedRelationships
WHERE RelationshipID IN ({ids_placeholder})
AND RelationshipURI NOT IN ({uri_placeholder});
"""
params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(filter_list)
self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

View File

@@ -133,6 +133,11 @@ class SqlEndpoint():
         GROUP BY RelationshipURI;
         """
         return pd.read_sql_query(QUERY, self.sql_engine)
+
+    def get_dataframe_from_query(self, query: str, params=None):
+        if params is None:
+            return pd.read_sql_query(query, self.sql_engine)
+        return pd.read_sql_query(query, self.sql_engine, params=params)
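
Hedged usage sketch of the new helper; `params` flows straight through to pandas.read_sql_query, so placeholders bind safely (the table and IDs echo the queries above):

endpoint = SqlEndpoint()
df = endpoint.get_dataframe_from_query(
    "SELECT MovieID FROM RDFs WHERE RelationshipID IN (?, ?)",
    params=(3, 7),
)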

Binary file not shown.

Binary file not shown.