Compare commits
107 Commits
dev.bpe
...
dev.embedd
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3446870291 | ||
|
|
e76dbeb9a7 | ||
|
|
96610612fe | ||
|
|
bed9718f27 | ||
|
|
93865bee8a | ||
|
|
1c0ddb8753 | ||
|
|
51399f9dc9 | ||
|
|
d1ba4ae026 | ||
|
|
db0090981c | ||
|
|
e1c5649d67 | ||
|
|
0bca241662 | ||
|
|
005d7af6a0 | ||
|
|
9068db550e | ||
|
|
d8f81e1a47 | ||
|
|
a67df9724e | ||
|
|
c5fd57d854 | ||
|
|
ee253c39f4 | ||
|
|
2036b4015f | ||
|
|
aac7675b30 | ||
|
|
d2fdeb18a2 | ||
|
|
f3b83eda3d | ||
|
|
0158db2dce | ||
|
|
ba592c3480 | ||
|
|
1f9c30b531 | ||
|
|
b805dc538e | ||
|
|
c2e13bc9c6 | ||
|
|
14c3914571 | ||
|
|
b9273b95e2 | ||
|
|
c263e2cf13 | ||
|
|
c9a50d50b7 | ||
|
|
9b0c57c238 | ||
|
|
24ea4d3ba4 | ||
|
|
e353c200d7 | ||
|
|
159266a603 | ||
|
|
7027414342 | ||
|
|
fc44929a7b | ||
|
|
0560bc439a | ||
|
|
8adacdb08c | ||
|
|
533347ee22 | ||
|
|
d1ff88da82 | ||
|
|
3f465991f0 | ||
|
|
96cbf4eabb | ||
|
|
f801afe0e4 | ||
|
|
b4ee8362a2 | ||
|
|
3021a51961 | ||
|
|
99b5198c9a | ||
|
|
b97282179d | ||
|
|
fdece42462 | ||
|
|
109ad9f36b | ||
|
|
fef933da9d | ||
|
|
c65f5e66fe | ||
|
|
f9545aca1d | ||
|
|
490edcfd53 | ||
|
|
9b5bb6d5f8 | ||
|
|
14b810c451 | ||
|
|
56d438f01a | ||
|
|
745424a978 | ||
|
|
e1549d4458 | ||
|
|
456ce724fe | ||
|
|
44307cd917 | ||
|
|
ffdb312d58 | ||
|
|
0007c38212 | ||
|
|
9c1043e0ba | ||
|
|
ee8e56798c | ||
|
|
1797571bb2 | ||
|
|
e93710af08 | ||
|
|
d3bba9b944 | ||
|
|
b1e7af0607 | ||
|
|
d3b1f7da91 | ||
|
|
c217f5dec9 | ||
|
|
49f0beb6ea | ||
|
|
05bb460999 | ||
|
|
948c3fd7ac | ||
|
|
87409fecd5 | ||
|
|
7e40a36701 | ||
|
|
d48815cca2 | ||
|
|
0f243eaac2 | ||
|
|
9c83d9fa71 | ||
|
|
a693cbb77e | ||
|
|
6f219f634f | ||
|
|
b303affd18 | ||
|
|
53c4decac7 | ||
|
|
c60da8ba82 | ||
|
|
3b5e6c099c | ||
|
|
ba3a718480 | ||
|
|
69fba7c3e9 | ||
|
|
76200d936d | ||
|
|
9b656e7918 | ||
|
|
9a797a0485 | ||
|
|
3b274ad807 | ||
|
|
8f5e2f2f0d | ||
|
|
da0bdf703b | ||
|
|
03cdca1f00 | ||
|
|
7188c8678a | ||
|
|
1eef25a697 | ||
|
|
e9165fb146 | ||
|
|
e8ff82c40a | ||
|
|
23d1eaf99e | ||
|
|
25a6ad1254 | ||
|
|
460d4f5188 | ||
|
|
c6ac6df2c2 | ||
|
|
15baba54ab | ||
|
|
87f24878f4 | ||
|
|
999141f886 | ||
|
|
8e095ebb7a | ||
|
|
64e355e80c | ||
|
|
397e29742a |
BIN
Assets/Dataset/1-hop/curated/corpus.txt
LFS
Normal file
BIN
Assets/Dataset/1-hop/curated/corpus.txt
LFS
Normal file
Binary file not shown.
BIN
Assets/Dataset/1-hop/small/corpus.txt
LFS
Normal file
BIN
Assets/Dataset/1-hop/small/corpus.txt
LFS
Normal file
Binary file not shown.
BIN
Assets/Dataset/1-hop/small/rdf_completation.csv
LFS
Normal file
BIN
Assets/Dataset/1-hop/small/rdf_completation.csv
LFS
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/small/rdf_text.csv
LFS
Normal file
BIN
Assets/Dataset/1-hop/small/rdf_text.csv
LFS
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/toy/corpus.txt
LFS
Normal file
BIN
Assets/Dataset/1-hop/toy/corpus.txt
LFS
Normal file
Binary file not shown.
BIN
Assets/Dataset/1-hop/toy/rdf_completation.csv
LFS
Normal file
BIN
Assets/Dataset/1-hop/toy/rdf_completation.csv
LFS
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/toy/rdf_mask.csv
LFS
Normal file
BIN
Assets/Dataset/1-hop/toy/rdf_mask.csv
LFS
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/toy/rdf_text.csv
LFS
Normal file
BIN
Assets/Dataset/1-hop/toy/rdf_text.csv
LFS
Normal file
Binary file not shown.
|
BIN
Assets/Model/small/bpe-small.json
LFS
Normal file
BIN
Assets/Model/small/bpe-small.json
LFS
Normal file
Binary file not shown.
193
Playgrounds/doctor.ipynb
Normal file
193
Playgrounds/doctor.ipynb
Normal file
@@ -0,0 +1,193 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "ddfb4457",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "AssertionError",
|
||||||
|
"evalue": "target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE.",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||||
|
"\u001b[31mAssertionError\u001b[39m Traceback (most recent call last)",
|
||||||
|
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 126\u001b[39m\n\u001b[32m 124\u001b[39m \u001b[38;5;66;03m# sanity guard (helps debug vocab mismatches fast)\u001b[39;00m\n\u001b[32m 125\u001b[39m max_seen = tgt[:, :Tp].max().item()\n\u001b[32m--> \u001b[39m\u001b[32m126\u001b[39m \u001b[38;5;28;01massert\u001b[39;00m max_seen < V \u001b[38;5;129;01mor\u001b[39;00m (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n\u001b[32m 127\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mtarget id \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmax_seen\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m >= V (\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mV\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m). Fix TOKEN_SPACE_SIZE.\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 129\u001b[39m \u001b[38;5;66;03m# CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\u001b[39;00m\n\u001b[32m 130\u001b[39m loss_t = cross_entropy(\n\u001b[32m 131\u001b[39m logits_btV.reshape(-\u001b[32m1\u001b[39m, V), \u001b[38;5;66;03m# [B*(t+1), V]\u001b[39;00m\n\u001b[32m 132\u001b[39m tgt[:, :Tp].reshape(-\u001b[32m1\u001b[39m) \u001b[38;5;66;03m# [B*(t+1)]\u001b[39;00m\n\u001b[32m 133\u001b[39m )\n",
|
||||||
|
"\u001b[31mAssertionError\u001b[39m: target id 3872 >= V (256). Fix TOKEN_SPACE_SIZE."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"import torch\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||||
|
"import Project_Model.Libs.BPE as BPE\n",
|
||||||
|
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||||
|
"import Project_Model.Libs.TorchShims as torch_shims\n",
|
||||||
|
"from Project_Model.Libs.Training.learning_rade_shedulers import CustomLR\n",
|
||||||
|
"from Project_Model.Libs.Training.logistic_collector import LogitsCollector # external collector\n",
|
||||||
|
"\n",
|
||||||
|
"# set a fixed seed\n",
|
||||||
|
"torch.manual_seed(0)\n",
|
||||||
|
"random.seed(0)\n",
|
||||||
|
"DEVICE = torch_shims.get_default_device()\n",
|
||||||
|
"torch.set_default_device(DEVICE)\n",
|
||||||
|
"\n",
|
||||||
|
"# BPE Init\n",
|
||||||
|
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||||
|
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||||
|
"\n",
|
||||||
|
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||||
|
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||||
|
"\n",
|
||||||
|
"# Constants (TEMP size; will be corrected after dataset scan below)\n",
|
||||||
|
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||||
|
"EMBEDDED_SIZE = 256\n",
|
||||||
|
"FEED_FORWARD_MULTIPLIER = 4\n",
|
||||||
|
"ATTENTION_HEADS = 4\n",
|
||||||
|
"SENTENCE_LENGTH = 256\n",
|
||||||
|
"NUMBER_OF_BLOCKS = 2\n",
|
||||||
|
"MAX_EPOCHS = int(1e4)\n",
|
||||||
|
"\n",
|
||||||
|
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||||
|
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||||
|
"\n",
|
||||||
|
"# Load CSV\n",
|
||||||
|
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
||||||
|
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
|
||||||
|
"\n",
|
||||||
|
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
|
||||||
|
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
|
||||||
|
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
|
||||||
|
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
|
||||||
|
"\n",
|
||||||
|
"for index, row in TOY_DATASET.iterrows():\n",
|
||||||
|
" RDFs: str = row[\"RDFs\"]\n",
|
||||||
|
" Abstract: str = row[\"Abstract\"]\n",
|
||||||
|
"\n",
|
||||||
|
" input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
|
||||||
|
" output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
|
||||||
|
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
|
||||||
|
"\n",
|
||||||
|
" input_tokens, padding = Transformer.normalize_sequence(\n",
|
||||||
|
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||||
|
" ) # pad/trim + end token\n",
|
||||||
|
" output_tokens, _ = Transformer.normalize_sequence(\n",
|
||||||
|
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||||
|
" ) # pad/trim + end token\n",
|
||||||
|
" decoder_default_tokens = Transformer.pad_sequence(\n",
|
||||||
|
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
|
||||||
|
" ) # pad with PAD up to SENTENCE_LENGTH\n",
|
||||||
|
"\n",
|
||||||
|
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
||||||
|
" TOY_BATCH_PADDING_LIST.append(padding)\n",
|
||||||
|
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
|
||||||
|
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
|
||||||
|
"\n",
|
||||||
|
"# fix V to cover ALL ids (including specials) # <- important\n",
|
||||||
|
"max_enc_id = max(max(row) for row in TOY_BATCH_INPUT_LIST) if TOY_BATCH_INPUT_LIST else 0\n",
|
||||||
|
"max_tgt_id = max(max(row) for row in TOY_BATCH_TARGET_LIST) if TOY_BATCH_TARGET_LIST else 0\n",
|
||||||
|
"TOKEN_SPACE_SIZE = max(TOKEN_SPACE_SIZE, max(PAD_TOKEN, END_TOKEN, max_enc_id, max_tgt_id) + 1)\n",
|
||||||
|
"\n",
|
||||||
|
"# Training loop\n",
|
||||||
|
"LOSS_HISTORY = []\n",
|
||||||
|
"NANOSOCRATES = Transformer.TrainingModel(\n",
|
||||||
|
" TOKEN_SPACE_SIZE,\n",
|
||||||
|
" EMBEDDED_SIZE,\n",
|
||||||
|
" FEED_FORWARD_MULTIPLIER,\n",
|
||||||
|
" ATTENTION_HEADS,\n",
|
||||||
|
" NUMBER_OF_BLOCKS,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
|
||||||
|
"\n",
|
||||||
|
"NANOSOCRATES.train()\n",
|
||||||
|
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
||||||
|
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters(), lr=1.0) # base lr works as factor\n",
|
||||||
|
"scheduler = CustomLR(optimizer, EMBEDDED_SIZE, warmup_steps=4000, factor=1.0) # step each optimizer step\n",
|
||||||
|
"\n",
|
||||||
|
"current_epoch = 0\n",
|
||||||
|
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
|
||||||
|
"\n",
|
||||||
|
"while current_epoch < MAX_EPOCHS:\n",
|
||||||
|
" # simple fixed mini-batch from the top; later you can shuffle/slice\n",
|
||||||
|
" enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
|
||||||
|
" pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
|
||||||
|
" tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
|
||||||
|
"\n",
|
||||||
|
" # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
|
||||||
|
" dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
|
||||||
|
"\n",
|
||||||
|
" total_loss = 0.0\n",
|
||||||
|
" collector.reset() # start fresh for this epoch\n",
|
||||||
|
"\n",
|
||||||
|
" T = tgt.size(1) # sequence length\n",
|
||||||
|
" for t in range(T):\n",
|
||||||
|
" # skip all-PAD steps to avoid CE divide-by-zero late in the sequence\n",
|
||||||
|
" if (tgt[:, t] == PAD_TOKEN).all(): # all PAD at this timestep\n",
|
||||||
|
" break\n",
|
||||||
|
"\n",
|
||||||
|
" optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
|
||||||
|
"\n",
|
||||||
|
" prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
|
||||||
|
" dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
|
||||||
|
"\n",
|
||||||
|
" # now decoder returns all steps up to t -> [B, t+1, V]\n",
|
||||||
|
" logits_btV: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # full logits for learning\n",
|
||||||
|
" collector.add(logits_btV) # collector will take the last step\n",
|
||||||
|
"\n",
|
||||||
|
" Tp = logits_btV.size(1) # t+1\n",
|
||||||
|
" V = logits_btV.size(-1) # vocab size\n",
|
||||||
|
"\n",
|
||||||
|
" # sanity guard (helps debug vocab mismatches fast)\n",
|
||||||
|
" max_seen = tgt[:, :Tp].max().item()\n",
|
||||||
|
" assert max_seen < V or (tgt[:, :Tp] == PAD_TOKEN).all(), \\\n",
|
||||||
|
" f\"target id {max_seen} >= V ({V}). Fix TOKEN_SPACE_SIZE.\"\n",
|
||||||
|
"\n",
|
||||||
|
" # CE over all tokens produced so far (0..t). PAD is ignored by ignore_index\n",
|
||||||
|
" loss_t = cross_entropy(\n",
|
||||||
|
" logits_btV.reshape(-1, V), # [B*(t+1), V]\n",
|
||||||
|
" tgt[:, :Tp].reshape(-1) # [B*(t+1)]\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" loss_t.backward() # backprop for this step\n",
|
||||||
|
" optimizer.step() # update params\n",
|
||||||
|
" scheduler.step() # Noam/warmup: step per optimizer step\n",
|
||||||
|
"\n",
|
||||||
|
" total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
|
||||||
|
"\n",
|
||||||
|
" # teacher forcing: reveal the correct token for next position\n",
|
||||||
|
" if t < T - 1:\n",
|
||||||
|
" dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
|
||||||
|
"\n",
|
||||||
|
" current_epoch += 1\n",
|
||||||
|
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
|
||||||
|
" collector.print_decoded() # print decoded predictions for the batch\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
182
Playgrounds/embedder.ipynb
Normal file
182
Playgrounds/embedder.ipynb
Normal file
File diff suppressed because one or more lines are too long
308
Playgrounds/encoder-decoder.ipynb
Normal file
308
Playgrounds/encoder-decoder.ipynb
Normal file
@@ -0,0 +1,308 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "7a311d4b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712]]\n",
|
||||||
|
"3\n",
|
||||||
|
"Embedder Tensor: torch.Size([3, 16, 256])\n",
|
||||||
|
"Values:\n",
|
||||||
|
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||||
|
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||||
|
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||||
|
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||||
|
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||||
|
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||||
|
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||||
|
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||||
|
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||||
|
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||||
|
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||||
|
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||||
|
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]]],\n",
|
||||||
|
" grad_fn=<AddBackward0>)\n",
|
||||||
|
"ENCODER Tensor: torch.Size([3, 1, 256])\n",
|
||||||
|
"Values:\n",
|
||||||
|
"tensor([[[ 8.0069e-01, 4.0532e-01, -1.8316e+00, -1.3902e+00, -1.1784e+00,\n",
|
||||||
|
" 1.3667e+00, -9.7890e-01, 6.0696e-01, -1.4899e+00, 5.5765e-01,\n",
|
||||||
|
" 4.5991e-02, 5.1214e-01, 3.1901e-01, 4.7577e-01, -2.9585e-01,\n",
|
||||||
|
" -1.0811e+00, -1.5281e+00, -6.3773e-01, -9.5954e-01, 1.8497e+00,\n",
|
||||||
|
" -1.1789e+00, -9.7387e-01, 1.1931e-01, -7.2703e-01, 5.3108e-01,\n",
|
||||||
|
" -6.4877e-01, -4.5188e-01, 1.5185e+00, -8.3408e-01, 3.2824e-01,\n",
|
||||||
|
" -1.8166e+00, 1.9548e+00, -5.2419e-01, -1.0693e+00, -1.8510e+00,\n",
|
||||||
|
" 1.5440e+00, -3.2370e-01, -1.3990e+00, -4.6940e-01, 6.5840e-02,\n",
|
||||||
|
" -9.2057e-01, 1.2513e+00, -5.9168e-01, 7.8198e-01, -1.3121e+00,\n",
|
||||||
|
" 1.1492e+00, -2.3695e-01, -1.8935e+00, 1.1639e+00, -5.8169e-01,\n",
|
||||||
|
" 2.5051e-01, -8.1654e-01, -1.0328e+00, 1.4285e+00, -8.1485e-01,\n",
|
||||||
|
" 1.0614e+00, -3.3834e-01, -4.1667e-02, -1.1920e-01, 3.1383e-01,\n",
|
||||||
|
" -5.9857e-01, 1.7327e-01, -1.6854e+00, -1.5174e+00, -2.6508e-01,\n",
|
||||||
|
" -6.0082e-01, 5.1468e-01, 2.7909e-01, -2.5296e-01, -1.4670e+00,\n",
|
||||||
|
" -1.3587e+00, -8.8864e-02, 3.2825e-01, 1.0950e+00, -1.0371e+00,\n",
|
||||||
|
" 1.1744e+00, 5.2984e-01, 4.1751e-01, -9.8803e-01, 3.5631e-01,\n",
|
||||||
|
" 4.7484e-01, 2.2435e-01, 1.4022e+00, 1.2242e+00, 1.1447e+00,\n",
|
||||||
|
" -5.4052e-01, -9.1786e-01, -1.2299e+00, 1.1656e+00, 9.1570e-01,\n",
|
||||||
|
" 1.8956e+00, 7.4344e-01, 4.2187e-01, -9.5426e-02, -3.2428e-01,\n",
|
||||||
|
" 9.6364e-01, -2.3252e-01, 2.9036e-01, -2.4432e+00, 9.8019e-01,\n",
|
||||||
|
" -4.6697e-02, 8.3910e-01, -4.3541e-01, -7.1915e-01, -7.5638e-01,\n",
|
||||||
|
" 9.0217e-01, 2.0919e+00, -7.9533e-01, -1.5413e-01, -6.9260e-01,\n",
|
||||||
|
" -1.3086e+00, 7.8925e-01, 1.8855e-01, 7.4043e-01, -3.8834e-01,\n",
|
||||||
|
" 1.0272e-02, 1.0763e+00, 4.2142e-01, 6.6520e-01, 4.5996e-01,\n",
|
||||||
|
" -8.5060e-01, -9.0101e-01, -4.2090e-01, 2.5596e-01, -1.4946e+00,\n",
|
||||||
|
" 1.0925e-01, -7.5359e-01, -3.0447e-01, 1.0679e+00, 1.9398e+00,\n",
|
||||||
|
" 8.1472e-01, 1.3498e+00, 1.1107e+00, 6.3288e-01, 3.1149e-01,\n",
|
||||||
|
" -1.9333e+00, -1.5274e+00, 2.1794e-01, -3.1895e-02, 1.0756e+00,\n",
|
||||||
|
" 1.0215e+00, 1.6938e+00, -1.0939e+00, 2.2690e+00, -7.0921e-01,\n",
|
||||||
|
" 6.4212e-01, -6.5468e-01, 1.6839e+00, 5.7296e-01, -1.4031e+00,\n",
|
||||||
|
" 3.9133e-01, -5.3541e-01, 4.3439e-01, -1.6785e+00, 5.2030e-03,\n",
|
||||||
|
" 4.5155e-01, -7.0953e-01, -1.9656e-01, -3.8671e-02, -1.0927e+00,\n",
|
||||||
|
" -3.0405e-01, -1.3818e-02, -3.7748e-01, 1.4412e+00, -1.4254e-01,\n",
|
||||||
|
" 7.9939e-01, -8.5402e-01, -1.0330e+00, 1.7661e+00, -3.6084e-01,\n",
|
||||||
|
" 1.5622e+00, 1.0240e+00, 1.9056e-01, -4.1480e-01, 6.9056e-01,\n",
|
||||||
|
" 1.7204e+00, -9.9218e-01, -1.6504e-01, -1.1807e+00, 1.0827e+00,\n",
|
||||||
|
" 1.5973e+00, 1.4849e-01, -2.2867e+00, 7.7322e-01, -6.8401e-01,\n",
|
||||||
|
" -6.0493e-01, 1.0616e+00, -1.8034e-01, -1.8828e+00, 1.1031e-01,\n",
|
||||||
|
" 2.5452e-01, -4.2489e-02, 8.1171e-01, 1.3429e+00, -6.5058e-01,\n",
|
||||||
|
" -1.3531e+00, -1.2263e+00, 1.1226e+00, 1.2407e+00, -9.7453e-01,\n",
|
||||||
|
" 9.4696e-01, 6.6186e-01, -5.0804e-01, 1.2647e-01, -1.1777e+00,\n",
|
||||||
|
" 6.8443e-02, -1.3043e-01, 2.9595e-01, -1.5330e+00, -6.5733e-01,\n",
|
||||||
|
" 1.1291e+00, 6.9629e-01, 4.4690e-01, 8.0151e-01, -1.2406e+00,\n",
|
||||||
|
" 2.6085e+00, -2.0310e-01, -1.0226e+00, -6.9182e-02, 7.6600e-01,\n",
|
||||||
|
" -9.9842e-01, 2.0896e+00, 2.6334e-01, -1.1559e-01, -6.6876e-01,\n",
|
||||||
|
" -6.6295e-01, -1.6461e-01, 2.8270e+00, 3.2727e-01, 1.3724e+00,\n",
|
||||||
|
" -1.0749e+00, 3.7782e-01, -1.5472e+00, 3.0822e-01, 5.7273e-02,\n",
|
||||||
|
" 3.9136e-01, 8.2948e-01, 2.1438e-01, -9.8623e-01, 5.6053e-01,\n",
|
||||||
|
" -1.5617e+00, -3.9595e-01, 1.0451e-02, -1.1860e+00, -1.4994e-01,\n",
|
||||||
|
" 1.6566e+00, 2.0369e+00, -4.3995e-01, -4.4262e-01, -3.1014e-01,\n",
|
||||||
|
" 5.9083e-01, -1.0765e+00, -5.2906e-01, 4.6039e-02, -1.0154e+00,\n",
|
||||||
|
" 5.9942e-01]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[ 1.2683e+00, -4.3200e-01, -1.3333e+00, -3.6705e-01, -5.8895e-01,\n",
|
||||||
|
" 9.9266e-01, -4.2914e-01, 9.2765e-01, -1.0935e+00, 1.4975e+00,\n",
|
||||||
|
" -5.3739e-01, -2.8332e-01, 9.1166e-01, 1.5010e+00, -2.1787e-01,\n",
|
||||||
|
" -1.4258e+00, -2.7524e-01, -1.2602e+00, 2.0117e-01, 2.3906e+00,\n",
|
||||||
|
" -9.6397e-01, -7.5872e-01, 3.3948e-01, -7.9353e-01, 9.1668e-01,\n",
|
||||||
|
" 8.7734e-04, -3.0271e-01, 1.7087e+00, -1.0273e+00, 1.5174e+00,\n",
|
||||||
|
" -2.6405e-02, 1.4236e+00, -9.9093e-01, 5.4787e-01, -1.0904e+00,\n",
|
||||||
|
" 5.2156e-01, -6.3470e-01, -7.7688e-01, -1.2538e+00, -3.9307e-01,\n",
|
||||||
|
" -7.6707e-01, 1.3733e+00, -7.2709e-01, 1.1185e+00, -1.5860e+00,\n",
|
||||||
|
" -2.6148e-01, -3.7984e-01, -1.3604e+00, 9.2864e-02, -7.9642e-01,\n",
|
||||||
|
" 1.0956e+00, 3.1202e-01, -4.1234e-01, 3.6488e-02, -1.4639e+00,\n",
|
||||||
|
" 1.0947e+00, -7.9230e-01, 4.6913e-01, -2.3407e-01, 4.1768e-02,\n",
|
||||||
|
" -1.5921e+00, 6.9743e-01, -7.0222e-01, -5.4705e-01, -6.5663e-01,\n",
|
||||||
|
" -4.1810e-01, 2.7744e-01, 7.9178e-01, 7.5886e-01, -7.6302e-01,\n",
|
||||||
|
" -1.2204e+00, -1.1103e+00, -1.3646e-01, 1.9589e+00, -1.3637e+00,\n",
|
||||||
|
" 9.0804e-01, 2.3094e-01, -5.5953e-02, -6.7626e-01, 1.4242e+00,\n",
|
||||||
|
" 1.0167e+00, 1.0705e+00, 2.2947e+00, 9.1274e-01, 1.2281e+00,\n",
|
||||||
|
" -7.0638e-01, -1.2249e+00, -8.9208e-02, 1.1016e+00, 1.1940e+00,\n",
|
||||||
|
" 3.5834e-01, 1.2961e+00, -4.6674e-01, 3.4572e-01, -4.3458e-01,\n",
|
||||||
|
" 1.1008e+00, 3.7783e-01, -6.5841e-01, -2.3127e+00, 1.4617e+00,\n",
|
||||||
|
" -1.2826e-01, 1.3463e-01, -8.5268e-01, -8.4144e-01, -1.8594e+00,\n",
|
||||||
|
" 1.9260e-01, 1.6432e+00, -2.0640e-02, -5.0030e-01, -1.5334e-01,\n",
|
||||||
|
" -6.1072e-01, -1.3694e-01, -3.7308e-01, 1.6603e+00, 1.1246e-01,\n",
|
||||||
|
" 6.0823e-02, 7.8749e-01, -1.7002e-01, 1.2058e+00, 8.5615e-01,\n",
|
||||||
|
" 1.2525e-01, -1.0584e+00, -4.7931e-01, 1.4088e-01, -1.8149e+00,\n",
|
||||||
|
" 1.4654e+00, -1.0936e+00, 5.3182e-01, 9.5694e-01, 3.2472e+00,\n",
|
||||||
|
" 3.4877e-01, 1.8491e+00, -1.5184e-01, 1.4711e+00, -7.6064e-01,\n",
|
||||||
|
" -2.2144e+00, -1.8952e+00, -4.9502e-01, -6.6836e-01, 1.4946e+00,\n",
|
||||||
|
" 6.7616e-01, 1.1501e+00, -9.4747e-01, 1.1009e+00, -1.4211e+00,\n",
|
||||||
|
" 3.9528e-01, -9.5220e-01, 1.4886e+00, 7.1784e-01, -1.9941e+00,\n",
|
||||||
|
" 6.7901e-02, -1.3109e-01, 1.1695e+00, 1.2861e-01, -2.8123e-01,\n",
|
||||||
|
" -6.1611e-01, 1.5513e-01, -3.9289e-01, -4.5543e-02, -2.8628e-01,\n",
|
||||||
|
" 2.6118e-01, 2.2623e-01, -6.3705e-01, 7.3591e-01, -7.8799e-01,\n",
|
||||||
|
" 2.5053e-01, -1.5923e-01, -4.9584e-01, 1.9009e+00, -2.3263e-01,\n",
|
||||||
|
" 1.2213e+00, 1.0313e+00, 2.0177e-02, -6.2209e-01, -3.5161e-01,\n",
|
||||||
|
" 1.5143e+00, -7.2332e-02, 2.3909e-02, -2.1261e+00, 8.5199e-01,\n",
|
||||||
|
" 1.9084e+00, 4.6845e-02, -2.3554e+00, 1.3735e+00, -7.3909e-01,\n",
|
||||||
|
" -8.3949e-01, -3.9314e-01, -4.3324e-01, -9.6804e-01, -5.3124e-01,\n",
|
||||||
|
" -6.5091e-01, -1.1738e+00, 1.3315e+00, 6.5606e-01, -1.4131e-01,\n",
|
||||||
|
" -1.7712e+00, -1.1628e+00, 9.6813e-01, 8.7314e-01, -9.8027e-01,\n",
|
||||||
|
" 6.9376e-01, 5.3878e-01, -1.6169e+00, 2.2860e-01, -6.2179e-01,\n",
|
||||||
|
" -1.1043e-01, -3.9658e-01, 2.8712e-01, 8.2201e-02, 2.0888e-01,\n",
|
||||||
|
" -5.9884e-01, 7.3092e-01, 6.9128e-01, 5.3977e-01, -1.5728e+00,\n",
|
||||||
|
" 1.6878e+00, -8.2669e-01, -9.8076e-01, -3.4203e-01, 4.6939e-02,\n",
|
||||||
|
" -1.3158e-01, 2.1923e+00, -6.6483e-02, -4.0687e-01, -1.2715e+00,\n",
|
||||||
|
" -8.1549e-01, -1.2047e+00, 1.3547e+00, -4.2072e-01, 1.1674e+00,\n",
|
||||||
|
" -5.1421e-01, 1.3055e+00, -1.1277e+00, 1.8372e+00, -1.1215e+00,\n",
|
||||||
|
" 1.4797e+00, 2.8354e-01, -6.3974e-01, -1.2869e+00, -2.7897e-01,\n",
|
||||||
|
" -1.0397e+00, 1.8622e-01, -5.0397e-02, -4.4865e-02, -7.6067e-01,\n",
|
||||||
|
" 1.7715e+00, 1.5040e+00, -2.6854e-01, -5.2802e-01, -5.3407e-01,\n",
|
||||||
|
" 2.0313e-02, -2.6276e-01, -7.0748e-01, -8.7328e-01, -3.4108e-01,\n",
|
||||||
|
" 1.4313e+00]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[ 7.7464e-01, -4.2187e-01, -2.0571e+00, -8.6709e-01, -1.5722e+00,\n",
|
||||||
|
" 4.9540e-01, -1.5270e+00, 1.0499e+00, -1.9579e+00, -2.5298e-02,\n",
|
||||||
|
" 4.3419e-01, 5.8822e-01, 1.3392e+00, 6.9604e-01, -9.7883e-01,\n",
|
||||||
|
" -9.1354e-01, -9.1852e-01, -6.0951e-01, -6.6255e-02, 1.3907e+00,\n",
|
||||||
|
" -6.2912e-01, -2.7524e-01, 1.9520e-02, -2.7154e-01, 1.5162e-01,\n",
|
||||||
|
" 1.3318e-02, -8.9196e-01, 9.0976e-01, -1.3544e+00, 2.4276e-01,\n",
|
||||||
|
" -7.4038e-01, 9.7062e-01, 3.2011e-01, 3.4486e-01, -2.3374e+00,\n",
|
||||||
|
" 1.3311e+00, -3.1871e-02, -1.4468e+00, -1.5968e+00, 3.0418e-01,\n",
|
||||||
|
" -7.7136e-01, 1.3427e+00, -1.2493e+00, 1.4114e+00, -1.2475e+00,\n",
|
||||||
|
" 7.0239e-01, -9.6120e-02, -4.4365e-01, 5.3238e-01, -1.4933e+00,\n",
|
||||||
|
" 5.4476e-01, -1.8490e-02, -5.9936e-01, 1.0878e+00, -1.8892e+00,\n",
|
||||||
|
" 1.2810e+00, -1.0747e+00, 5.3514e-01, 1.7422e-01, 1.1354e+00,\n",
|
||||||
|
" -7.4837e-01, 4.0327e-01, -1.8950e+00, -7.2336e-01, 2.4441e-01,\n",
|
||||||
|
" -1.3650e-01, -4.8344e-01, 3.3921e-02, 5.0889e-01, -1.3769e+00,\n",
|
||||||
|
" -2.5907e-01, -2.7549e-01, -1.9128e-01, 1.9751e+00, -7.1191e-01,\n",
|
||||||
|
" 5.1910e-01, 1.0902e-01, 2.9995e-01, -3.5180e-01, -6.2139e-01,\n",
|
||||||
|
" 7.2905e-01, -5.3177e-01, 4.3340e-01, 1.0071e+00, 1.7586e+00,\n",
|
||||||
|
" -3.9963e-01, -2.5139e-01, -9.4213e-01, 9.2847e-01, 1.1298e+00,\n",
|
||||||
|
" 7.8545e-01, 1.3188e+00, 3.7466e-01, 9.0773e-01, -4.0454e-02,\n",
|
||||||
|
" 1.3444e+00, 6.0301e-01, 8.9929e-02, -2.0754e+00, 4.8614e-01,\n",
|
||||||
|
" -9.7160e-01, 8.2446e-01, -1.1813e+00, -9.6185e-01, -9.2922e-02,\n",
|
||||||
|
" 6.0154e-01, 1.6640e+00, -1.0461e+00, 1.5868e-01, -5.7239e-01,\n",
|
||||||
|
" -6.2726e-01, 3.2848e-01, 5.9609e-01, 1.5563e+00, -4.0883e-01,\n",
|
||||||
|
" 4.4902e-01, 1.4004e+00, 2.2426e-01, 3.8314e-01, -2.0641e-01,\n",
|
||||||
|
" -1.6465e-01, -6.4645e-01, 1.5772e-01, 6.8907e-01, -1.2703e+00,\n",
|
||||||
|
" 1.8914e-01, -6.2678e-01, 3.0179e-01, 1.2687e+00, 1.6849e+00,\n",
|
||||||
|
" 1.5690e+00, 1.0999e+00, 1.5820e+00, -6.4808e-01, 5.1003e-01,\n",
|
||||||
|
" -1.6674e+00, -1.2224e+00, 1.9769e-01, -1.3883e-01, 1.2179e+00,\n",
|
||||||
|
" 1.2971e+00, 4.6259e-01, -5.8717e-01, 1.4532e+00, -1.0540e+00,\n",
|
||||||
|
" 2.8689e-01, -1.3895e+00, 1.4014e+00, -4.0430e-01, -2.6099e+00,\n",
|
||||||
|
" -1.0293e+00, -1.1097e+00, 8.6266e-01, -1.0535e+00, 7.1789e-01,\n",
|
||||||
|
" 6.0642e-01, -1.2493e+00, -3.7762e-01, -4.1281e-02, -7.3049e-01,\n",
|
||||||
|
" -7.2913e-04, -7.3122e-02, -2.3850e-01, 1.2546e+00, 1.8802e-01,\n",
|
||||||
|
" 1.3135e+00, -5.0367e-01, 1.2456e-01, 2.7475e+00, -1.2486e+00,\n",
|
||||||
|
" 1.4441e+00, 8.7469e-01, -5.6901e-01, -1.2145e-01, 3.1091e-01,\n",
|
||||||
|
" 1.9406e+00, -8.1891e-01, 3.1316e-02, -1.2867e+00, 8.0780e-01,\n",
|
||||||
|
" 7.0041e-01, 2.8903e-01, -1.6387e+00, 6.6553e-01, -1.3696e+00,\n",
|
||||||
|
" -7.9454e-01, 3.3899e-01, -5.5822e-01, -8.1969e-01, -1.2410e-01,\n",
|
||||||
|
" -3.7024e-01, -7.2536e-01, 7.5648e-01, 1.6899e+00, -1.7404e-01,\n",
|
||||||
|
" -1.7191e+00, -7.2603e-01, 1.5046e+00, 8.3216e-01, -1.5304e+00,\n",
|
||||||
|
" -1.8264e-01, 3.3451e-01, -5.6636e-02, 6.1099e-01, -9.8517e-01,\n",
|
||||||
|
" 4.4856e-01, -8.6275e-01, 6.9264e-02, -1.1572e+00, 2.3373e-01,\n",
|
||||||
|
" 5.9896e-01, 1.2384e-01, 1.0309e+00, 1.4273e+00, -8.4776e-01,\n",
|
||||||
|
" 2.6236e+00, -9.0133e-01, -4.0009e-01, -4.9727e-01, 3.7945e-01,\n",
|
||||||
|
" -9.0712e-01, 1.5725e+00, 1.6298e-01, 1.1544e-01, -4.3125e-01,\n",
|
||||||
|
" -8.7131e-01, -2.5880e-01, 2.9032e+00, 2.7154e-01, 1.3677e+00,\n",
|
||||||
|
" -8.8544e-01, 5.6083e-01, -1.8256e+00, 9.4832e-01, -1.0762e+00,\n",
|
||||||
|
" 7.5421e-01, 6.5008e-01, -8.6361e-01, -1.4911e+00, -7.5930e-02,\n",
|
||||||
|
" -1.6896e+00, 1.5223e-02, -1.5283e-01, -1.8741e+00, 1.1400e-01,\n",
|
||||||
|
" 1.8822e+00, 2.6615e+00, 2.1607e-01, -5.6243e-01, 3.6730e-01,\n",
|
||||||
|
" 4.0374e-01, -1.1973e+00, -5.3006e-01, -3.4750e-01, -4.4187e-01,\n",
|
||||||
|
" 7.4358e-01]]], grad_fn=<NativeLayerNormBackward0>)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"import torch\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||||
|
"import Project_Model.Libs.BPE as BPE\n",
|
||||||
|
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||||
|
"\n",
|
||||||
|
"# set a fixed seed\n",
|
||||||
|
"torch.manual_seed(0)\n",
|
||||||
|
"random.seed(0)\n",
|
||||||
|
"\n",
|
||||||
|
"TEXT = (\n",
|
||||||
|
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
|
||||||
|
")\n",
|
||||||
|
"OUT_TEXT = \"<START>\"\n",
|
||||||
|
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||||
|
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||||
|
"\n",
|
||||||
|
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||||
|
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||||
|
"\n",
|
||||||
|
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||||
|
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||||
|
"\n",
|
||||||
|
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
|
||||||
|
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
|
||||||
|
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
|
||||||
|
"\n",
|
||||||
|
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||||
|
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||||
|
"BATCH_LEN = 3\n",
|
||||||
|
"\n",
|
||||||
|
"INPUT_TOKENIZATION = [\n",
|
||||||
|
" EN_IN\n",
|
||||||
|
"] * BATCH_LEN\n",
|
||||||
|
"OUTPUT_TOKENIZATION = [\n",
|
||||||
|
" DEC_IN\n",
|
||||||
|
"] * BATCH_LEN\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"print(INPUT_TOKENIZATION)\n",
|
||||||
|
"\n",
|
||||||
|
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
|
||||||
|
"EMBEDDED_SIZE = 256\n",
|
||||||
|
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
|
||||||
|
"\n",
|
||||||
|
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||||
|
"encoder_tensor: torch.Tensor = EMBEDDER(INPUT_TOKENIZATION)\n",
|
||||||
|
"ENCODER = torch.nn.Sequential(\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
")\n",
|
||||||
|
"decoder_tensor: torch.Tensor = EMBEDDER(OUTPUT_TOKENIZATION)\n",
|
||||||
|
"DECODER = torch.nn.Sequential(\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(len(INPUT_TOKENIZATION))\n",
|
||||||
|
"print(f\"Embedder Tensor: {encoder_tensor.shape}\")\n",
|
||||||
|
"print(f\"Values:\\n{encoder_tensor}\")\n",
|
||||||
|
"\n",
|
||||||
|
"BATCH_SIZE, TOKENS, DIMENSIONS = encoder_tensor.shape\n",
|
||||||
|
"PAD_MASK = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
|
||||||
|
"\n",
|
||||||
|
"encoder_out, _ = ENCODER((encoder_tensor, PAD_MASK))\n",
|
||||||
|
"tensor: torch.Tensor\n",
|
||||||
|
"tensor, _, _, _ = DECODER((decoder_tensor, encoder_out, encoder_out, None))\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
|
||||||
|
"print(f\"Values:\\n{tensor}\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
0
Playgrounds/encoder-pretraining.py
Normal file
0
Playgrounds/encoder-pretraining.py
Normal file
131
Playgrounds/encoder.ipynb
Normal file
131
Playgrounds/encoder.ipynb
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "c64b0e24",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700]]\n",
|
||||||
|
"2\n",
|
||||||
|
"Embedder Tensor: torch.Size([2, 16, 256])\n",
|
||||||
|
"Values:\n",
|
||||||
|
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||||
|
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||||
|
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||||
|
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||||
|
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||||
|
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||||
|
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||||
|
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||||
|
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]]],\n",
|
||||||
|
" grad_fn=<AddBackward0>)\n",
|
||||||
|
"ENCODER Tensor: torch.Size([2, 16, 256])\n",
|
||||||
|
"Values:\n",
|
||||||
|
"tensor([[[-1.6325, 0.4094, -2.1403, ..., 0.4654, 0.5993, 0.9683],\n",
|
||||||
|
" [ 1.8236, 0.4025, -0.6972, ..., 0.2430, 0.2536, -1.0889],\n",
|
||||||
|
" [-0.0587, 0.1618, -0.2335, ..., 1.7609, 1.2664, -0.4452],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 2.0337, 1.3184, -1.3165, ..., -0.3303, 0.6572, 0.0884],\n",
|
||||||
|
" [ 0.5752, -2.5594, -0.2393, ..., 1.3318, -1.4236, 0.4686],\n",
|
||||||
|
" [ 1.0075, -2.4273, -0.4593, ..., 1.6660, 0.0359, 0.2927]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[-1.8300, -0.3079, -1.6585, ..., 0.4859, 0.5652, 0.8072],\n",
|
||||||
|
" [ 1.5461, -0.5666, -0.0330, ..., 0.5651, 0.2974, -1.0879],\n",
|
||||||
|
" [-0.9060, 0.2700, -0.4585, ..., 2.0363, 1.2657, -0.7060],\n",
|
||||||
|
" ...,\n",
|
||||||
|
" [ 1.6688, 1.7038, -1.9549, ..., -0.2052, 0.6270, 0.4598],\n",
|
||||||
|
" [ 0.0482, -2.3951, -0.4351, ..., 1.6230, -1.3662, -0.0390],\n",
|
||||||
|
" [ 0.8146, -2.6169, -0.6188, ..., 1.4525, 0.0507, 0.5177]]],\n",
|
||||||
|
" grad_fn=<NativeLayerNormBackward0>)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"import torch\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||||
|
"import Project_Model.Libs.BPE as BPE\n",
|
||||||
|
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||||
|
"\n",
|
||||||
|
"# set a fixed seed\n",
|
||||||
|
"torch.manual_seed(0)\n",
|
||||||
|
"random.seed(0)\n",
|
||||||
|
"\n",
|
||||||
|
"TEXT = \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
|
||||||
|
"\n",
|
||||||
|
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||||
|
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||||
|
"\n",
|
||||||
|
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||||
|
"TOKENANO = BPE.TokeNanoCore(\n",
|
||||||
|
" VOCABULARY,\n",
|
||||||
|
" SPECIAL_VOC\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"TOKENIZATION = [TOKENANO.encode(TEXT), TOKENANO.encode(TEXT)]\n",
|
||||||
|
"print(TOKENIZATION)\n",
|
||||||
|
"\n",
|
||||||
|
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
|
||||||
|
"EMBEDDED_SIZE = 256\n",
|
||||||
|
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
|
||||||
|
"\n",
|
||||||
|
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||||
|
"tensor: torch.Tensor = EMBEDDER(TOKENIZATION)\n",
|
||||||
|
"ENCODER = torch.nn.Sequential(\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
")\n",
|
||||||
|
"print(len(TOKENIZATION))\n",
|
||||||
|
"print(f\"Embedder Tensor: {tensor.shape}\")\n",
|
||||||
|
"print(f\"Values:\\n{tensor}\")\n",
|
||||||
|
"\n",
|
||||||
|
"BATCH_SIZE, TOKENS, DIMENSIONS = tensor.shape\n",
|
||||||
|
"PAD_MASK = torch.tensor([[True] * TOKENS] * BATCH_SIZE, dtype=torch.bool)\n",
|
||||||
|
"tensor, _ = ENCODER((tensor, PAD_MASK))\n",
|
||||||
|
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
|
||||||
|
"print(f\"Values:\\n{tensor}\")\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
157
Playgrounds/nanosocrates-sanity-check.ipynb
Normal file
157
Playgrounds/nanosocrates-sanity-check.ipynb
Normal file
@@ -0,0 +1,157 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "f5762da9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"torch.Size([3, 17, 7714])\n",
|
||||||
|
"torch.Size([3, 17])\n",
|
||||||
|
"tensor([[2034, 6523, 5406, 3985, 5406, 6523, 2034, 2034, 5745, 643, 5406, 7405,\n",
|
||||||
|
" 6523, 6230, 6419, 5745, 657],\n",
|
||||||
|
" [2458, 830, 5745, 5745, 5406, 3741, 2034, 5745, 6302, 6419, 5406, 2411,\n",
|
||||||
|
" 719, 830, 5745, 3189, 2775],\n",
|
||||||
|
" [2034, 5745, 5327, 4696, 6523, 643, 6419, 1671, 6302, 4406, 5745, 643,\n",
|
||||||
|
" 643, 1901, 1914, 1914, 719]])\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import random\n",
|
||||||
|
"import torch\n",
|
||||||
|
"from pathlib import Path\n",
|
||||||
|
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||||
|
"import Project_Model.Libs.BPE as BPE\n",
|
||||||
|
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||||
|
"\n",
|
||||||
|
"# set a fixed seed\n",
|
||||||
|
"torch.manual_seed(0)\n",
|
||||||
|
"random.seed(0)\n",
|
||||||
|
"\n",
|
||||||
|
"# BPE Init\n",
|
||||||
|
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||||
|
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||||
|
"\n",
|
||||||
|
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||||
|
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Constants\n",
|
||||||
|
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||||
|
"EMBEDDED_SIZE = 256\n",
|
||||||
|
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Model Init\n",
|
||||||
|
"ENCODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||||
|
"DECODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||||
|
"\n",
|
||||||
|
"ENCODER = torch.nn.Sequential(\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"DECODER = torch.nn.Sequential(\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"DETOKENER = Transformer.DeToken(\n",
|
||||||
|
" EMBEDDED_SIZE,\n",
|
||||||
|
" TOKEN_SPACE_SIZE\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Data\n",
|
||||||
|
"TEXT = (\n",
|
||||||
|
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
|
||||||
|
")\n",
|
||||||
|
"OUT_TEXT = \"<START>\"\n",
|
||||||
|
"\n",
|
||||||
|
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||||
|
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||||
|
"\n",
|
||||||
|
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
|
||||||
|
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
|
||||||
|
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
|
||||||
|
"\n",
|
||||||
|
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||||
|
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||||
|
"\n",
|
||||||
|
"BATCH_LEN = 3\n",
|
||||||
|
"\n",
|
||||||
|
"INPUT_TOKENIZATION = [\n",
|
||||||
|
" EN_IN\n",
|
||||||
|
"] * BATCH_LEN\n",
|
||||||
|
"OUTPUT_TOKENIZATION = [\n",
|
||||||
|
" DEC_IN\n",
|
||||||
|
"] * BATCH_LEN\n",
|
||||||
|
"\n",
|
||||||
|
"encoder_tensor_input = ENCODER_EMBEDDER(INPUT_TOKENIZATION)\n",
|
||||||
|
"encoder_padding_mask = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
|
||||||
|
"\n",
|
||||||
|
"encoder_output, _ = ENCODER((encoder_tensor_input, encoder_padding_mask))\n",
|
||||||
|
"\n",
|
||||||
|
"decoder_tensor_input = DECODER_EMBEDDER(OUTPUT_TOKENIZATION)\n",
|
||||||
|
"decoder_padding_mask = torch.tensor([[False] * MAX_LEN] * BATCH_LEN)\n",
|
||||||
|
"\n",
|
||||||
|
"decoder_output, _, _, _ = DECODER((decoder_tensor_input, encoder_output, encoder_output, None))\n",
|
||||||
|
"\n",
|
||||||
|
"logits: torch.Tensor = DETOKENER(decoder_output)\n",
|
||||||
|
"\n",
|
||||||
|
"print(logits.shape)\n",
|
||||||
|
"\n",
|
||||||
|
"# print(logits)\n",
|
||||||
|
"\n",
|
||||||
|
"most_probable_tokens = torch.argmax(logits, 2)\n",
|
||||||
|
"\n",
|
||||||
|
"print(most_probable_tokens.shape)\n",
|
||||||
|
"print(most_probable_tokens)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
8391
Playgrounds/nanosocrates-train-toy.ipynb
Normal file
8391
Playgrounds/nanosocrates-train-toy.ipynb
Normal file
File diff suppressed because one or more lines are too long
112
Playgrounds/prova.ipynb
Normal file
112
Playgrounds/prova.ipynb
Normal file
@@ -0,0 +1,112 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "4ae47336",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"B, T, D = 4, 7, 32\n",
|
||||||
|
"x = torch.randn(B, T, D)\n",
|
||||||
|
"attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
|
||||||
|
"pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
|
||||||
|
"mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
|
||||||
|
"y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "e38e3fb5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"tensor([[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
|
||||||
|
" [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
|
||||||
|
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],\n",
|
||||||
|
"\n",
|
||||||
|
" [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
|
||||||
|
" [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
|
||||||
|
" [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"torch.nn.functional.one_hot(torch.tensor([\n",
|
||||||
|
" [4, 1, 9],\n",
|
||||||
|
" [2,4,5]\n",
|
||||||
|
"]))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "7119ad53",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"device(type='cpu')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"torch.get_default_device()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "8c95691a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"xpu\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from Project_Model.Libs.TorchShims import get_default_device\n",
|
||||||
|
"\n",
|
||||||
|
"print(get_default_device())"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
170
Playgrounds/prova.py
Normal file
170
Playgrounds/prova.py
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
"""Playground: overfit the NanoSocrates model on the toy RDF <-> abstract dataset.

Every CSV row contributes two training examples (RDF -> abstract and
abstract -> RDF).  Inside an epoch the decoder input is grown one position at
a time with the ground-truth token (teacher forcing) and one optimizer step is
taken per position.
"""
import random
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims

# set a fixed seed
torch.manual_seed(0)
random.seed(0)

# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()

VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 8
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 4
MAX_EPOCHS = int(1e3)
# Print progress every N epochs (1 == every epoch, as before).
LOG_EVERY_EPOCHS = 1

PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]

# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")

TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)

TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []


def _append_example(source_tokens: list[int], target_tokens: list[int]) -> None:
    """Normalize one (source, target) token pair and append it to the TOY_BATCH_* lists."""
    source_tokens, padding = Transformer.normalize_sequence(
        source_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    )
    target_tokens, _ = Transformer.normalize_sequence(
        target_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    )
    # Initial decoder input: just "<SOS>", normalized with the trailing flag
    # set to False (NOTE(review): presumably "do not append <END>" -- confirm
    # against Transformer.normalize_sequence).
    decoder_default_tokens, _ = Transformer.normalize_sequence(
        TOKENANO.encode("<SOS>"), SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
    )

    TOY_BATCH_INPUT_LIST.append(source_tokens)
    TOY_BATCH_PADDING_LIST.append(padding)
    TOY_BATCH_TARGET_LIST.append(target_tokens)
    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)


for _, row in TOY_DATASET.iterrows():

    RDFs: str = row["RDFs"]
    Abstract: str = row["Abstract"]

    rdf_tokens = TOKENANO.encode(RDFs)
    # The first token of the encoded abstract is dropped, as in the original script.
    abstract_tokens = TOKENANO.encode(Abstract)[1:]

    # RDF -> abstract
    _append_example(rdf_tokens, abstract_tokens)
    # abstract -> RDF
    _append_example(abstract_tokens, rdf_tokens)

# Training loop
LOSS_HISTORY = []
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS
)
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)
last_loss = 0
current_epoch = 0

while current_epoch < MAX_EPOCHS:

    encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST)
    # Rebuilt every epoch because it is filled in place below.
    decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT)
    src_padding = torch.tensor(TOY_BATCH_PADDING_LIST, dtype=torch.bool)

    # Target token ids (the name is historical: these are class indices, not logits)
    target_logits = torch.tensor(TOY_BATCH_TARGET_LIST)

    last_loss = 0
    last_prediction: torch.Tensor

    for i in range(0, SENTENCE_LENGTH):

        optimizer.zero_grad()
        tgt_padding = decoder_list.eq(PAD_TOKEN)

        logits: torch.Tensor = NANOSOCRATES((encoder_list, src_padding, decoder_list, tgt_padding))

        # argmax over the vocabulary axis; softmax is monotonic so it is not needed here.
        last_prediction = torch.argmax(logits, 2)

        # At step i the decoder holds real tokens at positions 0..i, so score
        # positions 0..i (inclusive).  Slicing with ":i" produced an empty
        # tensor at i == 0 (NaN mean loss) and never scored the newest position.
        step_logits = logits[:, : i + 1, :]
        # CrossEntropyLoss expects the class axis second.
        step_logits = step_logits.permute(0, 2, 1)

        loss: torch.Tensor = cross_entropy(step_logits, target_logits[:, : i + 1])

        last_loss = loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Teacher forcing: reveal the ground-truth token for the next position.
        if i < SENTENCE_LENGTH - 1:
            decoder_list[:, i + 1] = target_logits[:, i]

    current_epoch += 1

    if current_epoch % LOG_EVERY_EPOCHS == 0:
        print(f"EPOCH {current_epoch}\n\tLoss: {last_loss}")

        for encoded_sentence, expected_sentence in zip(
            Transformer.tensor2token(last_prediction[:, :], END_TOKEN),  # type: ignore
            Transformer.tensor2token(target_logits[:, :], END_TOKEN)
        ):
            decoded_sentence = TOKENANO.decode(encoded_sentence)
            decoded_target = TOKENANO.decode(expected_sentence)
            print(f"\tACTUAL:\n\t\t{decoded_sentence}\n\tEXPECTED:\n\t\t{decoded_target}\n")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
60
Playgrounds/sanity-check-pytorch.ipynb
Normal file
60
Playgrounds/sanity-check-pytorch.ipynb
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "dd23cc94",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Current detected architecture is: xpu\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"import torch\n",
|
||||||
|
"from Project_Model.Libs.TorchShims import get_default_device\n",
|
||||||
|
"\n",
|
||||||
|
"DEVICE = get_default_device()\n",
|
||||||
|
"\n",
|
||||||
|
"print(f\"Current detected architecture is: {DEVICE.type}\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "6584882e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||||
|
"DECODER = Transformer.Decoder(256, 1024, 4)\n",
|
||||||
|
"print()"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
@@ -107,7 +107,7 @@ class NanoSocraTrainerPool:
|
|||||||
bpe = NanoSocratesBPE()
|
bpe = NanoSocratesBPE()
|
||||||
BPE = bpe
|
BPE = bpe
|
||||||
|
|
||||||
if BPE.vocabulary_size > self.__max_vocabulary:
|
if BPE.vocabulary_size >= self.__max_vocabulary:
|
||||||
return BPE
|
return BPE
|
||||||
|
|
||||||
exit = False
|
exit = False
|
||||||
|
|||||||
@@ -189,7 +189,7 @@ class NanoSocratesBPE(Encoder):
|
|||||||
token_stack.appendleft(right_token)
|
token_stack.appendleft(right_token)
|
||||||
token_stack.appendleft(left_token)
|
token_stack.appendleft(left_token)
|
||||||
|
|
||||||
return UTF_8_STRING_ARR.decode("utf-8")
|
return UTF_8_STRING_ARR.decode("utf-8", errors="ignore")
|
||||||
|
|
||||||
def __token_decode(self, token_id: int) -> tuple[int, int]:
|
def __token_decode(self, token_id: int) -> tuple[int, int]:
|
||||||
|
|
||||||
|
|||||||
@@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
|
|||||||
VOC_LENGTH = len(self.__vocabulary)
|
VOC_LENGTH = len(self.__vocabulary)
|
||||||
return BPE_OFFSET + VOC_LENGTH + 1
|
return BPE_OFFSET + VOC_LENGTH + 1
|
||||||
|
|
||||||
|
@property
def vocabulary_size(self) -> int:
    """Number of entries in this encoder's ``vocabulary`` mapping."""
    special_vocabulary = self.vocabulary
    return len(special_vocabulary)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocabulary(self) -> dict[str, int]:
|
def vocabulary(self) -> dict[str, int]:
|
||||||
return self.__vocabulary
|
return self.__vocabulary
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from ..Classes import NanoSocratesSpecial
|
|||||||
|
|
||||||
from ..Utils import special_regex_maker
|
from ..Utils import special_regex_maker
|
||||||
from ..Enums import TokenType
|
from ..Enums import TokenType
|
||||||
|
from ..Enums import SpecialToken
|
||||||
|
|
||||||
|
|
||||||
class TokeNanoCore:
|
class TokeNanoCore:
|
||||||
@@ -26,6 +27,12 @@ class TokeNanoCore:
|
|||||||
BPE_VOCABULARY_SIZE, special_token_list
|
BPE_VOCABULARY_SIZE, special_token_list
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
def vocabulary_size(self):
    """Combined vocabulary size: BPE entries plus special-token entries, plus one.

    NOTE(review): the extra ``+ 1`` is not explained in this file; callers in
    the playgrounds add a further ``+ 1`` on top of this value -- confirm which
    of the two offsets is intended.
    """
    sizes = (
        self.__bpe_encoder.vocabulary_size,
        self.__special_encoder.vocabulary_size,
    )
    return sizes[0] + sizes[1] + 1
|
||||||
|
|
||||||
def encode(self, corpus: str) -> list[int]:
|
def encode(self, corpus: str) -> list[int]:
|
||||||
output: list[int] = []
|
output: list[int] = []
|
||||||
for piece, token_type in self.__splitter.split_text(corpus):
|
for piece, token_type in self.__splitter.split_text(corpus):
|
||||||
@@ -39,6 +46,27 @@ class TokeNanoCore:
|
|||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def encode_incomplete_string(self, corpus: str) -> list[int]:
    """
    Encode a string that does not end with a special token.

    The CORPUS_END special token is appended before splitting, the pieces are
    encoded with the special or BPE encoder according to their type, and the
    last produced token id is removed again before returning.
    """
    terminated_corpus = corpus + SpecialToken.CORPUS_END.value
    token_ids: list[int] = []

    for fragment, fragment_type in self.__splitter.split_text(terminated_corpus):

        if fragment_type == TokenType.SPECIAL:
            encoded = self.__special_encoder.encode(fragment)
            token_ids.extend(encoded)

        # slow but clear: both checks are evaluated for every piece
        if fragment_type == TokenType.BPE:
            encoded = self.__bpe_encoder.encode(fragment)
            token_ids.extend(encoded)

    # drop the trailing token (the one produced for the appended end marker)
    return token_ids[:-1]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def decode(self, corpus: list[int]) -> str:
|
def decode(self, corpus: list[int]) -> str:
|
||||||
output_str = ""
|
output_str = ""
|
||||||
for token, token_type in self.__splitter.split_tokens(corpus):
|
for token, token_type in self.__splitter.split_tokens(corpus):
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ from .NanoSocraTrainer import NanoSocraTrainer
|
|||||||
from .NanoSocraTraineRam import NanoSocraTraineRam
|
from .NanoSocraTraineRam import NanoSocraTraineRam
|
||||||
from .NanoSocraTrainerPool import NanoSocraTrainerPool
|
from .NanoSocraTrainerPool import NanoSocraTrainerPool
|
||||||
from .NanoSocratesSpecial import NanoSocratesSpecial
|
from .NanoSocratesSpecial import NanoSocratesSpecial
|
||||||
|
from .TokeNanoCore import TokeNanoCore
|
||||||
|
from .TokeNano import TokeNano
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"NanoSocratesChunker",
|
"NanoSocratesChunker",
|
||||||
@@ -12,5 +14,7 @@ __all__ = [
|
|||||||
"NanoSocratesBPE",
|
"NanoSocratesBPE",
|
||||||
"NanoSocraTrainer",
|
"NanoSocraTrainer",
|
||||||
"NanoSocraTraineRam",
|
"NanoSocraTraineRam",
|
||||||
"NanoSocraTrainerPool"
|
"NanoSocraTrainerPool",
|
||||||
|
"TokeNanoCore",
|
||||||
|
"TokeNano"
|
||||||
]
|
]
|
||||||
@@ -10,7 +10,6 @@ class SpecialToken(Enum):
|
|||||||
RELATIONSHIP = "<PRED>"
|
RELATIONSHIP = "<PRED>"
|
||||||
OBJECT = "<OBJ>"
|
OBJECT = "<OBJ>"
|
||||||
ABSTRACT = "<ABS>"
|
ABSTRACT = "<ABS>"
|
||||||
CORPUS_END = "<END>"
|
|
||||||
|
|
||||||
## Tasks' Token
|
## Tasks' Token
|
||||||
RDF_TO_TEXT = "<RDF2TXT>"
|
RDF_TO_TEXT = "<RDF2TXT>"
|
||||||
@@ -19,3 +18,10 @@ class SpecialToken(Enum):
|
|||||||
MASK = "<MASK>"
|
MASK = "<MASK>"
|
||||||
|
|
||||||
# BPE Training:
|
# BPE Training:
|
||||||
|
# NanoSocrates
|
||||||
|
START = "<START>"
|
||||||
|
CORPUS_END = "<END>"
|
||||||
|
START_OF_SEQUENCE = "<SOS>"
|
||||||
|
END_OF_SEQUENCE = "<EOS>"
|
||||||
|
PAD = "<PAD>"
|
||||||
|
|
||||||
@@ -1 +1,6 @@
|
|||||||
from .TokenType import TokenType
|
from .TokenType import TokenType
|
||||||
|
from .SpecialToken import SpecialToken
|
||||||
|
|
||||||
|
# Public names of the Enums package.  Both enums imported above are part of
# the API (other modules do ``from ..Enums import TokenType``), so both are
# listed; otherwise ``from .Enums import *`` would silently drop TokenType.
__all__ = [
    "TokenType",
    "SpecialToken",
]
|
||||||
@@ -3,11 +3,13 @@ from .lag_checker_iterator import iterator_with_checks
|
|||||||
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
|
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
|
||||||
from .json_utils import save_json, load_json
|
from .json_utils import save_json, load_json
|
||||||
from .special_regex_maker import special_regex_maker
|
from .special_regex_maker import special_regex_maker
|
||||||
|
from .default_special_tokens import default_special_tokens
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"special_regex_maker",
|
"special_regex_maker",
|
||||||
"iterator_with_checks",
|
"iterator_with_checks",
|
||||||
"save_nanos_vocabulary",
|
"save_nanos_vocabulary",
|
||||||
"load_nanos_vocabulary",
|
"load_nanos_vocabulary",
|
||||||
"save_json", "load_json"
|
"save_json", "load_json",
|
||||||
|
"default_special_tokens"
|
||||||
]
|
]
|
||||||
4
Project_Model/Libs/BPE/Utils/default_special_tokens.py
Normal file
4
Project_Model/Libs/BPE/Utils/default_special_tokens.py
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
from ..Enums import SpecialToken
|
||||||
|
|
||||||
|
def default_special_tokens() -> list[str]:
    """Return the string value of every ``SpecialToken`` member, in enum iteration order."""
    values: list[str] = []
    for member in SpecialToken:
        values.append(member.value)
    return values
|
||||||
164
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
164
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
import random
|
||||||
|
import sys
|
||||||
|
from typing import Any, Generator
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
from Project_Model.Libs.Batch.Enums.TaskType import TaskType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
# from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Project_Model.Libs.Transformer import SpannedMasker, truncate_rdf_list, normalize_sequence
|
||||||
|
from TokenCompletation import TokenCompletationTransformer
|
||||||
|
from Project_Model.Libs.BPE import SpecialToken
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): presumably the maximum token-sequence length for a batch entry;
# it is not referenced in the visible part of this module -- confirm usage.
# "LENGHT" is a misspelling of "LENGTH"; the name is left as-is because other
# modules may import it.
MAX_LENGHT = 128
|
||||||
|
class Batcher:
|
||||||
|
|
||||||
|
def __init__(self, dataset_path: Path, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker, seed:int = 0) -> None:
    """Store the collaborators used to turn the dataset into task batches.

    The dataset provides (ABSTRACT, TRIPLE) pairs; the planned tasks are:
      * rdf2text:   X = TRIPLE,            Y = ABSTRACT
      * text2rdf:   X = ABSTRACT,          Y = TRIPLE
      * masking:    X = incomplete triple, Y = complete triple (uses the masker)
      * completion: X = subset of triples, Y = related subset of triples

    Args:
        dataset_path: CSV file read chunk by chunk in ``batch``.
        tokenizer: tokenizer used to encode (and, for debugging, decode) text.
        masker: span masker; it is reseeded with ``seed`` when batching starts.
        seed: seed for the batcher's random generator and for the masker.
    """
    self._seed = seed
    self._masker = masker
    self._tokenizer = tokenizer
    self._dataset_path = dataset_path

    # The token-completion transformer is not wired in yet
    # (was: TokenCompletationTransformer(sotl, eos)); only the truncation
    # helper for the completion task is stored.
    self._completation_task_token_truncator = truncate_rdf_list
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def batch(self, batch_size)-> Generator[tuple[list[list[int]], list[list[int]], list[list[int]],list[list[int]], TaskType],Any,Any]:
|
||||||
|
"""
|
||||||
|
Yields: X,Y,padding_X
|
||||||
|
"""
|
||||||
|
RNG = random.Random(self._seed)
|
||||||
|
self._masker.reseed(self._seed)
|
||||||
|
|
||||||
|
for batch in pd.read_csv(self._dataset_path, chunksize= batch_size):
|
||||||
|
|
||||||
|
tokenized_batch = pd.DataFrame()
|
||||||
|
# encode
|
||||||
|
tokenized_batch[["Abstract","RDFs"]] = (
|
||||||
|
batch[["Abstract","RDFs"]]
|
||||||
|
.map(lambda t: self._tokenizer.encode(t))
|
||||||
|
)
|
||||||
|
|
||||||
|
X,Y, padding_X, padding_Y = self.__rdf2txt_transformation(tokenized_batch)
|
||||||
|
yield X,Y, padding_X, padding_Y, TaskType.RDF2TXT
|
||||||
|
X,Y, padding_X, padding_Y, = self.__txt2rdf_transformation(tokenized_batch)
|
||||||
|
yield X,Y, padding_X, padding_Y, TaskType.TEXT2RDF
|
||||||
|
X,Y, padding_X, padding_Y, = self.__masking_trasformation(tokenized_batch)
|
||||||
|
yield X,Y, padding_X, padding_Y, TaskType.MASKING
|
||||||
|
X,Y, padding_X, padding_Y, = self.__token_completation_task(tokenized_batch, RNG.randint(0,sys.maxsize))
|
||||||
|
yield X,Y, padding_X, padding_Y, TaskType.COMPLETATION
|
||||||
|
|
||||||
|
# output = pd.concat([rdf2txt_batch,txt2rdf_batch,completation_batch],ignore_index=True)
|
||||||
|
# output = output.sample(frac=1).reset_index(drop=True)
|
||||||
|
# self.decode_debug(output)
|
||||||
|
# yield output
|
||||||
|
|
||||||
|
|
||||||
|
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
|
||||||
|
# WIP
|
||||||
|
rng = random.Random(seed)
|
||||||
|
|
||||||
|
def to_list(x):
|
||||||
|
return x.split(SpecialToken.START_TRIPLE.value)[1:]
|
||||||
|
|
||||||
|
batch["RDFs"] = batch["RDFs"].map(
|
||||||
|
to_list
|
||||||
|
)
|
||||||
|
|
||||||
|
def decode_debug(self, batch: pd.DataFrame):
|
||||||
|
decoded = pd.DataFrame()
|
||||||
|
decoded[["X","Y"]] = (
|
||||||
|
batch[["X","Y"]]
|
||||||
|
.map(lambda t: self._tokenizer.decode(t))
|
||||||
|
)
|
||||||
|
print(decoded)
|
||||||
|
|
||||||
|
|
||||||
|
def __normalization(self, X:list[list[int]], Y: list[list[int]])-> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
|
||||||
|
pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
|
||||||
|
end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
|
||||||
|
out_X = []
|
||||||
|
padding_X = []
|
||||||
|
out_Y = []
|
||||||
|
padding_Y = []
|
||||||
|
|
||||||
|
for x in X:
|
||||||
|
out_x, padding_x = normalize_sequence(x,MAX_LENGHT,pad_token,end_token,True)
|
||||||
|
out_X.append(out_x)
|
||||||
|
padding_X.append(padding_x)
|
||||||
|
|
||||||
|
for y in Y:
|
||||||
|
out_y, padding_y = normalize_sequence(y,MAX_LENGHT,pad_token,end_token,True)
|
||||||
|
out_Y.append(out_y)
|
||||||
|
padding_Y.append(padding_y)
|
||||||
|
|
||||||
|
return out_X,out_Y,padding_X,padding_Y
|
||||||
|
|
||||||
|
|
||||||
|
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
||||||
|
task_token = self._tokenizer.encode(SpecialToken.RDF_TO_TEXT.value)
|
||||||
|
out = batch.rename(columns={"RDFs":"X","Abstract":"Y"})[["X","Y"]]
|
||||||
|
out["X"] = [task_token + x for x in out["X"]]
|
||||||
|
return self.__normalization(out["X"].to_list(),out["Y"].to_list())
|
||||||
|
|
||||||
|
|
||||||
|
def __txt2rdf_transformation(self, batch: pd.DataFrame):
|
||||||
|
task_token = self._tokenizer.encode(SpecialToken.TEXT_TO_RDF.value)
|
||||||
|
out = batch.rename(columns={"Abstract":"X","RDFs":"Y"})[["X","Y"]]
|
||||||
|
out["X"] = [task_token + x for x in out["X"]]
|
||||||
|
return self.__normalization(out["X"].to_list(),out["Y"].to_list())
|
||||||
|
|
||||||
|
|
||||||
|
def __masking_trasformation(self, batch: pd.DataFrame):
|
||||||
|
X = []
|
||||||
|
Y = []
|
||||||
|
for rdf in batch["RDFs"]:
|
||||||
|
x,y = self._masker.mask_sequence(rdf)
|
||||||
|
X.append(x)
|
||||||
|
Y.append(y)
|
||||||
|
return self.__normalization(X,Y)
|
||||||
|
|
||||||
|
|
||||||
|
def __token_completation_task(self, batch: pd.DataFrame, minibatch_seed: int):
|
||||||
|
continue_triple_token = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[0]
|
||||||
|
eot = self._tokenizer.encode(SpecialToken.END_TRIPLE.value)[0]
|
||||||
|
X = []
|
||||||
|
Y = []
|
||||||
|
for rdf in batch["RDFs"]:
|
||||||
|
x,y = self._completation_task_token_truncator(rdf, 0.5, continue_triple_token, eot, minibatch_seed)
|
||||||
|
X.append(x)
|
||||||
|
Y.append(y)
|
||||||
|
return self.__normalization(X,Y)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: build the tokenizer and masker from local asset
    # files, print one encoded sample, then print every batch produced.
    from pathlib import Path

    DATASET_PATH = Path("Assets/Dataset/Tmp/rdf_text.csv")
    VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"

    VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
    SPECIAL_LIST = BPE.default_special_tokens()
    TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)

    # Token ids of the special tokens; handed to the masker as forbidden tokens.
    SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))

    MASKER = SpannedMasker(TOKENANO.vocabulary_size, SPECIAL_TOKENS)

    prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
    print(TOKENANO.encode(prova))

    batcher = Batcher(DATASET_PATH, TOKENANO, MASKER)
    for batch in batcher.batch(8):
        print(batch)
|
||||||
33
Project_Model/Libs/Batch/Classes/TokenCompletation.py
Normal file
33
Project_Model/Libs/Batch/Classes/TokenCompletation.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
|
||||||
|
class TokenCompletationTransformer:
    """Split a token sequence into an input prefix and a completion target.

    The split point is placed on an occurrence of the SOTL token, chosen as
    the ``int(count * input_percent)``-th occurrence in the sequence.
    """

    def __init__(self,SOTL_token,EOS_token, input_percent:float = 0.5) -> None:
        """Store the marker tokens and the fraction of SOTL markers kept in X.

        Args:
            SOTL_token: token id on which the sequence is split.
            EOS_token: token id appended to the input part.
            input_percent: fraction of SOTL occurrences used to pick the split.
        """
        self.__SOTL_token = SOTL_token
        self.__EOS_token = EOS_token
        self.__input_percent = input_percent

    def get_completation_tuple(
        self,
        token_sequence: list[int],
    )-> tuple[list[int], list[int]]:
        """Return ``(x_list, y_list)`` for ``token_sequence``.

        ``x_list`` holds the tokens before the selected SOTL occurrence,
        followed by the EOS token. ``y_list`` holds the tokens from that
        occurrence onward. When the scaled SOTL count is zero the split is
        placed at index 0, so ``x_list`` is just ``[EOS]`` and ``y_list`` is
        the whole sequence. The input list is not modified.
        """
        total_sotl = token_sequence.count(self.__SOTL_token)

        # Clamp so a percentage above 1 cannot ask for more markers than exist
        # (the unclamped walk would run past the end of the sequence).
        wanted_sotl = min(int(total_sotl * self.__input_percent), total_sotl)

        # Default split at 0: previously a zero count produced index -1, which
        # moved everything but the last token into X.
        split_index = 0

        if wanted_sotl > 0:
            seen = 0
            for position, token in enumerate(token_sequence):
                if token != self.__SOTL_token:
                    continue
                seen += 1
                if seen == wanted_sotl:
                    split_index = position
                    break

        x_list = token_sequence[:split_index]  # slicing copies the prefix
        x_list.append(self.__EOS_token)
        y_list = token_sequence[split_index:]
        return (x_list, y_list)
|
||||||
|
|
||||||
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
from enum import Enum, auto
|
||||||
|
|
||||||
|
class TaskType(Enum):
    """Label identifying which training task a yielded batch belongs to."""

    # Explicit values equal to what ``auto()`` assigns (1, 2, 3, 4).
    RDF2TXT = 1
    TEXT2RDF = 2
    MASKING = 3
    COMPLETATION = 4
|
||||||
23
Project_Model/Libs/Embedder/Classes/NanoSocratesEmbedder.py
Normal file
23
Project_Model/Libs/Embedder/Classes/NanoSocratesEmbedder.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
import torch
|
||||||
|
from ..Utils import fixed_positional_encoding
|
||||||
|
|
||||||
|
|
||||||
|
# WIP FOR BATCHING
|
||||||
|
class NanoSocratesEmbedder(torch.nn.Module):
    """Token embedding followed by the addition of fixed positional encodings."""

    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        """Create the embedding table.

        Args:
            vocabulary_size: number of rows of the embedding table.
            embedding_size: size of each embedding vector.
        """
        super().__init__()
        self.__embedder = torch.nn.Embedding(vocabulary_size, embedding_size)

    def forward(self, tokenized_sentence: torch.Tensor) -> torch.Tensor:
        """Embed ``tokenized_sentence`` and add the positional encodings.

        The embedded tensor must be 3-dimensional (the shape is unpacked into
        three values), i.e. the input is a batch of token-id sequences.
        """
        computed_embeddings: torch.Tensor = self.__embedder(tokenized_sentence)

        _, SENTENCE_LENGHT, EMBEDDING_SIZE = computed_embeddings.shape  # for batching

        # The encodings are created on the default (CPU) device; move them next
        # to the embeddings so the sum also works when the module lives on an
        # accelerator.
        POSITIONAL_ENCODINGS = fixed_positional_encoding(
            SENTENCE_LENGHT, EMBEDDING_SIZE
        ).to(computed_embeddings.device)

        # Broadcast over the leading (batch) dimension.
        computed_embeddings = computed_embeddings + POSITIONAL_ENCODINGS
        return computed_embeddings
|
||||||
5
Project_Model/Libs/Embedder/Classes/__init__.py
Normal file
5
Project_Model/Libs/Embedder/Classes/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from .NanoSocratesEmbedder import NanoSocratesEmbedder
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"NanoSocratesEmbedder"
|
||||||
|
]
|
||||||
5
Project_Model/Libs/Embedder/Utils/__init__.py
Normal file
5
Project_Model/Libs/Embedder/Utils/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from .fixed_positional_encoding import fixed_positional_encoding
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"fixed_positional_encoding"
|
||||||
|
]
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def fixed_positional_encoding(
    sentence_dimension: int,
    embedding_dimension: int,
) -> torch.Tensor:
    """Build the fixed sine/cosine positional-encoding table.

    Args:
        sentence_dimension: number of positions (rows of the result).
        embedding_dimension: number of embedding columns of the result.

    Returns:
        Tensor of shape ``(sentence_dimension, embedding_dimension)``; even
        columns hold sines and odd columns hold cosines of
        ``position / 10000 ** (2 * (column // 2) / embedding_dimension)``.
    """
    base = int(1e4)
    positions = torch.tensor(list(range(0, sentence_dimension)))

    columns: list[torch.Tensor] = []

    for column in range(0, embedding_dimension):
        # Columns 2k and 2k + 1 share the same divisor, hence the ``// 2``.
        exponent = (2 * (column // 2)) / embedding_dimension
        angles = positions / (base ** exponent)

        trig = torch.sin if column % 2 == 0 else torch.cos
        columns.append(trig(angles))

    # Stacked as (embedding, sentence); transpose to (sentence, embedding).
    return torch.stack(columns).transpose(0, 1)
|
||||||
7
Project_Model/Libs/Embedder/__init__.py
Normal file
7
Project_Model/Libs/Embedder/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from .Utils import *
|
||||||
|
from .Classes import *
|
||||||
|
|
||||||
|
from . import Utils
|
||||||
|
from . import Classes
|
||||||
|
|
||||||
|
|
||||||
5
Project_Model/Libs/TorchShims/Utils/__init__.py
Normal file
5
Project_Model/Libs/TorchShims/Utils/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from .get_default_device import get_default_device
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"get_default_device"
|
||||||
|
]
|
||||||
17
Project_Model/Libs/TorchShims/Utils/get_default_device.py
Normal file
17
Project_Model/Libs/TorchShims/Utils/get_default_device.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
def get_default_device() -> torch.device:
    """Return the first available device among cuda, xpu, mps, falling back to cpu.

    The ``xpu`` and ``mps`` backends are looked up defensively so the function
    also works on torch builds that do not ship those modules.
    """
    # Cuda or ROCm
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Intel GPUs
    xpu_backend = getattr(torch, "xpu", None)
    if xpu_backend is not None and xpu_backend.is_available():
        return torch.device("xpu")

    # Apple GPUs
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")
|
||||||
7
Project_Model/Libs/TorchShims/__init__.py
Normal file
7
Project_Model/Libs/TorchShims/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from .Utils import *
|
||||||
|
|
||||||
|
from .Utils import get_default_device
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"get_default_device"
|
||||||
|
]
|
||||||
43
Project_Model/Libs/Training/logistic_collector.py
Normal file
43
Project_Model/Libs/Training/logistic_collector.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
class LogitsCollector:
    """Accumulate per-step logits and turn them into greedy token sequences."""

    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
        """Store the special token ids and the tokenizer.

        Args:
            pad_token: id dropped from the sequences returned by :meth:`tokens`.
            end_token: stored only; the stop-on-END logic is currently disabled.
            tokenizer: object whose ``decode`` method is used by
                :meth:`print_decoded`.
        """
        self.__pad_token = pad_token
        self.__end_token = end_token
        self.__tokenizer = tokenizer
        self.__steps: list[torch.Tensor] = []  # one [B, V] tensor per step

    def reset(self) -> None:
        """Forget every stored step."""
        self.__steps.clear()

    def add(self, logits_step: torch.Tensor) -> None:
        """Store one step of logits, detached from the autograd graph.

        A 3-dimensional input is reduced to its last position along dim 1.
        """
        step = logits_step[:, -1, :] if logits_step.dim() == 3 else logits_step
        self.__steps.append(step.detach())

    def tokens(self) -> list[list[int]]:
        """Return the greedy token ids per batch row, with PAD ids removed."""
        if not self.__steps:
            return []

        stacked = torch.stack(self.__steps, dim=0)              # [T, B, V]
        probabilities = torch.softmax(stacked, dim=-1)          # [T, B, V]
        greedy_ids = probabilities.argmax(dim=-1).transpose(0, 1)  # [B, T]

        sequences: list[list[int]] = []
        for row in greedy_ids.tolist():
            sequences.append([tok for tok in row if tok != self.__pad_token])
        return sequences

    def print_decoded(self) -> None:
        """Print every sequence decoded by the tokenizer.

        If decoding raises, the raw id list is printed instead.
        """
        for index, sequence in enumerate(self.tokens()):
            try:
                rendered = self.__tokenizer.decode(sequence)
            except Exception:
                rendered = str(sequence)  # fallback to ids
            print(f"[{index}] {rendered}")
|
||||||
19
Project_Model/Libs/Transformer/Classes/DeToken.py
Normal file
19
Project_Model/Libs/Transformer/Classes/DeToken.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
class DeToken(torch.nn.Module):
    """Linear projection from the embedding space to the vocabulary space."""

    def __init__(self, embedding_size: int, vocabulary_size: int) -> None:
        """Create the ``embedding_size -> vocabulary_size`` linear layer."""
        super().__init__()
        self.__linear = torch.nn.Linear(embedding_size, vocabulary_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the raw output of the linear layer (no softmax is applied)."""
        return self.__linear(x)
|
||||||
103
Project_Model/Libs/Transformer/Classes/Decoder.py
Normal file
103
Project_Model/Libs/Transformer/Classes/Decoder.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from .FeedForwardNetwork import FeedForwardNetwork
|
||||||
|
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
|
||||||
|
from ..Utils.attention_mask import get_causal_attention_mask
|
||||||
|
|
||||||
|
# B, L(T), E_D
|
||||||
|
|
||||||
|
|
||||||
|
class Decoder(nn.Module):
    """One decoder layer: masked self-attention, cross-attention, feed-forward.

    Each sub-layer is followed by a residual connection and a LayerNorm.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        """Create the three sub-layers and their layer norms.

        Args:
            embedding_dimension: size of the token representations.
            feed_forward_hidden_layer_dimension: hidden size of the
                feed-forward network.
            number_of_attention_heads: head count of both attention blocks.
        """
        super().__init__()

        self.__masked_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        self.__cross_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        # Created but not applied in forward(); the dropout steps there are
        # currently disabled.
        self.__dropout = nn.Dropout(0.1)

        self.__feed_forward_network = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)

    def forward(
        self,
        args: tuple[
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor
        ]
    ):
        """Apply the layer to ``(x, k_x, v_x, src_padding_mask, tgt_padding_mask)``.

        A single tuple is taken (and a tuple of the same layout returned) so
        that several layers can be chained with ``nn.Sequential``. Only ``x``
        is transformed; the other four entries are passed through unchanged.
        """
        x, k_x, v_x, src_padding_mask, tgt_padding_mask = args

        # Causal mask sized on dim 1 of x (the file notes x as B, L(T), E_D).
        causal_mask = get_causal_attention_mask(x.size(1))

        # Masked self-attention -> residual -> layer norm.
        x = self.__layer_norm_1(
            x
            + self.__masked_attention(
                x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=causal_mask
            )
        )

        # Cross-attention over the supplied keys/values -> residual -> layer norm.
        x = self.__layer_norm_2(
            x + self.__cross_attention(x, k_x, v_x, key_padding_mask=src_padding_mask)
        )

        # Position-wise feed-forward -> residual -> layer norm.
        x = self.__layer_norm_3(x + self.__feed_forward_network(x))

        return (x, k_x, v_x, src_padding_mask, tgt_padding_mask)
|
||||||
|
|
||||||
|
|
||||||
|
# Use .eval() to disable dropout, etc.
|
||||||
73
Project_Model/Libs/Transformer/Classes/Encoder.py
Normal file
73
Project_Model/Libs/Transformer/Classes/Encoder.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
|
||||||
|
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
|
||||||
|
TorchMultiHeadAttention as MultiHeadAttention,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class Encoder(nn.Module):
    """One encoder layer: self-attention and feed-forward.

    Each sub-layer is followed by a residual connection and a LayerNorm.
    Subclassing ``nn.Module`` exposes its primitives for training.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        """Create the attention block, the feed-forward network and the norms.

        Args:
            embedding_dimension: size of the token representations.
            feed_forward_hidden_layer_dimension: hidden size of the
                feed-forward network.
            number_of_attention_heads: head count of the attention block.
        """
        super().__init__()

        self.__attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        # Norm of the first "Add and Normalize".
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        self.__feed_forward = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        # Norm of the second "Add and Normalize".
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        # Created but not applied in forward(); the dropout steps there are
        # currently disabled.
        self.__dropout = nn.Dropout(0.1)

    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
        """Apply the layer to ``(x, padding_mask)`` and return the same layout.

        A single tuple is taken so that several layers can be chained with
        ``nn.Sequential``. ``padding_mask`` is passed through unchanged.
        """
        x, padding_mask = args

        # Self-attention -> residual -> layer norm.
        x = self.__layer_norm_1(
            x + self.__attention(x, x, x, key_padding_mask=padding_mask)
        )

        # Feed-forward -> residual -> layer norm.
        x = self.__layer_norm_2(x + self.__feed_forward(x))

        return (x, padding_mask)
|
||||||
|
|
||||||
|
|
||||||
|
# Use .eval() to disable dropout, etc.
|
||||||
43
Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py
Normal file
43
Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# it is position wise!
|
||||||
|
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
|
||||||
|
|
||||||
|
# Why do we need a fixed size
|
||||||
|
# https://ai.stackexchange.com/questions/37624/why-do-transformers-have-a-fixed-input-length
|
||||||
|
|
||||||
|
import torch.nn as nn
|
||||||
|
|
||||||
|
|
||||||
|
class FeedForwardNetwork(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, embedding_size: int, feed_forward_hidden_layer_dimension: int):
        """Create the layers.

        Args:
            embedding_size: input and output feature size.
            feed_forward_hidden_layer_dimension: size of the hidden layer.
        """
        super().__init__()

        # Expand to the hidden dimension.
        self.__fully_connected_1 = nn.Linear(
            embedding_size, feed_forward_hidden_layer_dimension
        )
        self.__relu = nn.ReLU()
        # Active in training mode only; eval() turns it off.
        self.__dropout = nn.Dropout(0.1)
        # Project back to the model dimension.
        self.__fully_connected_2 = nn.Linear(
            feed_forward_hidden_layer_dimension, embedding_size
        )

    def forward(self, x):
        """Return ``linear_2(dropout(relu(linear_1(x))))``."""
        hidden = self.__dropout(self.__relu(self.__fully_connected_1(x)))
        return self.__fully_connected_2(hidden)
|
||||||
111
Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
Normal file
111
Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
from ..Utils.task_type import TaskType
|
||||||
|
from .Decoder import Decoder
|
||||||
|
from .Encoder import Encoder
|
||||||
|
from ....Libs.Embedder import NanoSocratesEmbedder
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
class NanoSocratesCore(torch.nn.Module):
    """Encoder-decoder model with a greedy autoregressive decoding loop."""

    def __init__(
        self,
        sentence_length: int,
        vocab_size: int,
        embedding_size: int = 256,
        feed_forward_multiplier: int = 4,
        num_encoder_layers: int = 2,
        num_decoder_layers: int = 2,
        num_attention_heads: int = 4,
    ) -> None:
        """Build embedders, encoder/decoder stacks and the output projection.

        Args:
            sentence_length: number of decoder positions; ``forward`` runs
                ``sentence_length - 1`` decoding steps.
            vocab_size: vocabulary size of both embedders and of the output.
            embedding_size: size of the token representations.
            feed_forward_multiplier: feed-forward hidden size expressed as a
                multiple of ``embedding_size``.
            num_encoder_layers: number of stacked ``Encoder`` layers.
            num_decoder_layers: number of stacked ``Decoder`` layers.
            num_attention_heads: head count of every attention block.
        """
        # Must run before any submodule is assigned, otherwise nn.Module
        # refuses the assignment.
        super().__init__()

        feed_forward_dim = embedding_size * feed_forward_multiplier

        self.__sentence_length = sentence_length

        # The list is unpacked so that each layer has its own weights.
        self.__encoder_sequence = torch.nn.Sequential(
            *[
                Encoder(embedding_size, feed_forward_dim, num_attention_heads)
                for _ in range(num_encoder_layers)
            ]
        )

        self.__decoder_sequence = torch.nn.Sequential(
            *[
                Decoder(embedding_size, feed_forward_dim, num_attention_heads)
                for _ in range(num_decoder_layers)
            ]
        )

        self.__linear = torch.nn.Linear(embedding_size, vocab_size)

        self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
        self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)

    def forward(
        self,
        encoder_input: list[list[int]],
        decoder_input: list[list[int]],
        encoder_padding_mask: list[list[int]],
    ):
        """Encode the input once, then greedily fill the decoder tokens.

        Args:
            encoder_input: batch of token-id sequences (lists or a tensor).
            decoder_input: batch of initial decoder token ids; it must have
                at least ``sentence_length`` positions per row because
                positions ``1 .. sentence_length - 1`` are overwritten.
            encoder_padding_mask: one mask row per encoder sequence.

        Returns:
            List with one tensor of softmax probabilities per decoding step.

        Raises:
            Exception: if the mask and the encoder input differ in length.
        """
        if len(encoder_padding_mask) != len(encoder_input):
            raise Exception("Mismatch in received_dimensions")

        DEVICE = self.__linear.weight.device

        # nn.Embedding needs integer tensors, not Python lists.
        ENCODER_TOKENS = torch.as_tensor(encoder_input, dtype=torch.long, device=DEVICE)
        decoder_tokens = torch.as_tensor(decoder_input, dtype=torch.long, device=DEVICE)

        # NOTE(review): converted to bool assuming truthy entries mark padded
        # positions, as torch's key_padding_mask does - confirm against
        # TorchMultiHeadAttention and the Batcher output.
        SRC_PADDING_MASK = torch.as_tensor(
            encoder_padding_mask, dtype=torch.bool, device=DEVICE
        )

        # 1) + 2) Embed and encode the input. The layers unpack a single
        # tuple, which is also what nn.Sequential forwards between them.
        ENCODER_OUTPUT, _ = self.__encoder_sequence(
            (self.__input_embeder(ENCODER_TOKENS), SRC_PADDING_MASK)
        )
        del ENCODER_TOKENS

        LOGITS_HISTORY: list[torch.Tensor] = []

        # 3) Autoregressive output: positions 1 .. sentence_length - 1.
        for decoder_phase in range(1, self.__sentence_length):

            # 3.1) Embed the decoder tokens produced so far.
            DECODER_EMBEDDINGS = self.__output_embedder(decoder_tokens)

            # 3.2) Decode. No target padding mask is available through this
            # signature, so None is passed.
            # TODO(review): confirm the attention wrapper accepts None.
            DECODER_OUTPUT, *_ = self.__decoder_sequence(
                (
                    DECODER_EMBEDDINGS,
                    ENCODER_OUTPUT,
                    ENCODER_OUTPUT,
                    SRC_PADDING_MASK,
                    None,
                )
            )

            # 3.3) + 3.4) Back to token space, then probabilities.
            TOKEN_PROBABILITIES = torch.softmax(self.__linear(DECODER_OUTPUT), dim=-1)
            del DECODER_OUTPUT

            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)

            # 3.5) The prediction made at position ``decoder_phase - 1`` is the
            # token for position ``decoder_phase``. Work on a copy so tensors
            # already consumed by earlier steps are never modified in place.
            NEXT_TOKEN_IDS = torch.argmax(
                TOKEN_PROBABILITIES[:, decoder_phase - 1, :], dim=-1
            )
            decoder_tokens = decoder_tokens.clone()
            decoder_tokens[:, decoder_phase] = NEXT_TOKEN_IDS

        return LOGITS_HISTORY
|
||||||
218
Project_Model/Libs/Transformer/Classes/SpannedMasker.py
Normal file
218
Project_Model/Libs/Transformer/Classes/SpannedMasker.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
import math
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
class SpannedMasker:
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
max_vocabulary: int,
|
||||||
|
forbidden_tokens: set[int],
|
||||||
|
change_token_probability: float = 0.15,
|
||||||
|
average_span: int = 1,
|
||||||
|
seed: int = random.randint(0, sys.maxsize),
|
||||||
|
|
||||||
|
) -> None:
|
||||||
|
|
||||||
|
if change_token_probability < 0 or change_token_probability > 1:
|
||||||
|
raise ValueError("received a value that is not between 0 or 1")
|
||||||
|
|
||||||
|
self.__change_token_probability = change_token_probability
|
||||||
|
self.__average_span = average_span
|
||||||
|
self.__rng = random.Random(seed)
|
||||||
|
self.__max_vocabulary = max_vocabulary
|
||||||
|
self.__forbidden_tokens = forbidden_tokens
|
||||||
|
|
||||||
|
|
||||||
|
def reseed(self, seed:int):
|
||||||
|
self.__rng = random.Random(seed)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def mask_sequence(
    self,
    token_sequence: list[int],
) -> tuple[list[int], list[int]]:
    """Return ``(masked_input, target)`` for ``token_sequence``.

    One boolean mask is drawn and then used to build both outputs, with
    sentinel ids starting right after the configured vocabulary limit.
    """
    span_mask = self.__create_mask(token_sequence, self.__forbidden_tokens)

    masked_input = self.__create_masked_input(
        token_sequence, span_mask, self.__max_vocabulary
    )
    target = self.__create_target(
        token_sequence, span_mask, self.__max_vocabulary
    )

    return (masked_input, target)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def __number_of_spans(self, legal_token_number: int):
|
||||||
|
EXPECTED_NUM_OF_CORRUPTED_TOKENS = self.__number_of_corrupted_tokens(legal_token_number)
|
||||||
|
|
||||||
|
return math.ceil(EXPECTED_NUM_OF_CORRUPTED_TOKENS / self.__average_span)
|
||||||
|
|
||||||
|
def __number_of_corrupted_tokens(self, legal_token_number: int):
|
||||||
|
EXPECTED_NUM_OF_CORRUPTED_TOKENS = math.ceil(
|
||||||
|
legal_token_number * self.__change_token_probability
|
||||||
|
)
|
||||||
|
|
||||||
|
return EXPECTED_NUM_OF_CORRUPTED_TOKENS
|
||||||
|
|
||||||
|
def __create_mask(self, sequence: list[int], forbidden_tokens: set[int]) -> list[bool]:
|
||||||
|
|
||||||
|
SEQ_LEN = len(sequence)
|
||||||
|
LEGAL_TOKENS = self.__count_legal_tokens(sequence, forbidden_tokens)
|
||||||
|
NUM_OF_CORRUPTIONS = self.__number_of_corrupted_tokens(LEGAL_TOKENS)
|
||||||
|
NUM_OF_SPANS = self.__number_of_spans(LEGAL_TOKENS)
|
||||||
|
MASK = [False] * SEQ_LEN
|
||||||
|
|
||||||
|
mask_index = 0
|
||||||
|
number_of_spans = 0
|
||||||
|
exit_loop = False
|
||||||
|
|
||||||
|
while not exit_loop:
|
||||||
|
|
||||||
|
TOKEN = sequence[mask_index]
|
||||||
|
MASKED = MASK[mask_index]
|
||||||
|
SHOULD_MASK = self.__random_mask()
|
||||||
|
skip = False
|
||||||
|
|
||||||
|
|
||||||
|
if self.__is_illegal_token(TOKEN, forbidden_tokens):
|
||||||
|
skip = True
|
||||||
|
|
||||||
|
if MASKED:
|
||||||
|
skip = True
|
||||||
|
|
||||||
|
if not SHOULD_MASK:
|
||||||
|
skip = True
|
||||||
|
|
||||||
|
if skip:
|
||||||
|
mask_index = (mask_index + 1) % SEQ_LEN
|
||||||
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
CANDIDATE_SPAN = self.__random_span(
|
||||||
|
self.__average_span
|
||||||
|
)
|
||||||
|
|
||||||
|
REMAINING_MASK = SEQ_LEN - (mask_index + 1)
|
||||||
|
|
||||||
|
SPAN_LENGTH = min(CANDIDATE_SPAN, REMAINING_MASK)
|
||||||
|
|
||||||
|
for _ in range(0, SPAN_LENGTH):
|
||||||
|
INNER_TOKEN = sequence[mask_index]
|
||||||
|
|
||||||
|
if self.__is_illegal_token(INNER_TOKEN, forbidden_tokens):
|
||||||
|
continue
|
||||||
|
|
||||||
|
MASK[mask_index] = True
|
||||||
|
mask_index += 1
|
||||||
|
|
||||||
|
number_of_spans += 1
|
||||||
|
mask_index += 1
|
||||||
|
|
||||||
|
if number_of_spans == NUM_OF_SPANS:
|
||||||
|
exit_loop = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
if mask_index >= SEQ_LEN - 1:
|
||||||
|
exit_loop = True
|
||||||
|
continue
|
||||||
|
|
||||||
|
return MASK
|
||||||
|
|
||||||
|
def __create_masked_input(self, sequence: list[int], mask: list[bool], max_voc: int) -> list[int]:
|
||||||
|
|
||||||
|
OUT: list[int] = []
|
||||||
|
mask_token_id = max_voc + 1
|
||||||
|
index = 0
|
||||||
|
while index < len(sequence):
|
||||||
|
|
||||||
|
TOKEN = sequence[index]
|
||||||
|
MASKED = mask[index]
|
||||||
|
|
||||||
|
if not MASKED:
|
||||||
|
OUT.append(
|
||||||
|
TOKEN
|
||||||
|
)
|
||||||
|
index += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
MASK_TOKEN = mask_token_id
|
||||||
|
OUT.append(
|
||||||
|
MASK_TOKEN
|
||||||
|
)
|
||||||
|
|
||||||
|
while mask[index]:
|
||||||
|
index += 1
|
||||||
|
|
||||||
|
mask_token_id += 1
|
||||||
|
|
||||||
|
return OUT
|
||||||
|
|
||||||
|
def __create_target(self, sequence: list[int], mask: list[bool], max_voc: int) -> list[int]:
    """Build the decoder target: masked tokens verbatim, unmasked runs as sentinels.

    Every masked token is copied to the output. Each maximal run of unmasked
    positions is replaced by one sentinel id; sentinels are numbered
    consecutively starting at ``max_voc + 1``.

    Args:
        sequence: Token ids of the original sequence.
        mask: One flag per position of ``sequence``; truthy marks a masked token.
        max_voc: Highest regular vocabulary id; sentinel ids start right above it.

    Returns:
        A new list with the masked tokens and one sentinel per unmasked run.
    """

    target: list[int] = []
    next_sentinel = max_voc + 1
    inside_unmasked_run = False

    for position, token in enumerate(sequence):

        if mask[position]:
            target.append(token)
            inside_unmasked_run = False
            continue

        # Only the first position of an unmasked run emits a sentinel.
        if not inside_unmasked_run:
            target.append(next_sentinel)
            next_sentinel += 1
            inside_unmasked_run = True

    return target
|
||||||
|
|
||||||
|
def __is_illegal_token(self, token: int, illegal_voc: set[int]) -> bool:
    """Return True when ``token`` is a member of ``illegal_voc``, else False."""
    return token in illegal_voc
|
||||||
|
|
||||||
|
def __count_legal_tokens(self, sequence: list[int], illegal_voc: set[int]) -> int:
    """Count the tokens of ``sequence`` that are not in ``illegal_voc``.

    Args:
        sequence: Token ids to inspect.
        illegal_voc: Token ids that must not be counted.

    Returns:
        Number of tokens for which ``__is_illegal_token`` is False.
    """
    return sum(
        1
        for token in sequence
        if not self.__is_illegal_token(token, illegal_voc)
    )
|
||||||
|
|
||||||
|
def __random_mask(self) -> bool:
    """Draw one random number and decide whether masking should happen.

    Returns:
        False when the draw is strictly greater than the configured
        change-token probability, True otherwise.
    """
    draw = self.__random_probability()
    return not (draw > self.__change_token_probability)
|
||||||
|
|
||||||
|
def __random_probability(self) -> float:
    """Return the next ``random()`` draw from the masker's private generator."""
    # NOTE(review): presumably ``self.__rng`` is a ``random.Random``; it is
    # created outside this block, confirm against the constructor.
    draw = self.__rng.random()
    return draw
|
||||||
|
|
||||||
|
def __random_token(self, max_vocabulary: int) -> int:
    """Draw a token id with ``randint(0, max_vocabulary)`` from the private generator."""
    lowest_id = 0
    return self.__rng.randint(lowest_id, max_vocabulary)
|
||||||
|
|
||||||
|
def __random_int_range(self, min: int, max: int) -> int:
    """Draw an integer with ``randint(min, max)`` from the private generator.

    The parameter names shadow the builtins ``min``/``max`` inside this
    method; they are kept because they are part of the signature.
    """
    lower_bound, upper_bound = min, max
    return self.__rng.randint(lower_bound, upper_bound)
|
||||||
|
|
||||||
|
def __random_span(self, average: int) -> int:
|
||||||
|
candidate_span = self.__rng.gauss(mu=average)
|
||||||
|
candidate_span = max(1, candidate_span)
|
||||||
|
candidate_span = round(candidate_span)
|
||||||
|
return candidate_span
|
||||||
77
Project_Model/Libs/Transformer/Classes/TokenMasker.py
Normal file
77
Project_Model/Libs/Transformer/Classes/TokenMasker.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
class TokenMasker:
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
change_token_probability: float = 0.15,
|
||||||
|
mask_token_probability: float = 0.8,
|
||||||
|
random_token_prob: float = 0.1,
|
||||||
|
seed: int = random.randint(0, sys.maxsize),
|
||||||
|
) -> None:
|
||||||
|
|
||||||
|
if change_token_probability < 0 or change_token_probability > 1:
|
||||||
|
raise ValueError("received a value that is not between 0 or 1")
|
||||||
|
|
||||||
|
if mask_token_probability < 0 or mask_token_probability > 1:
|
||||||
|
raise ValueError("received a value that is not between 0 or 1")
|
||||||
|
|
||||||
|
if random_token_prob < 0 or random_token_prob > 1:
|
||||||
|
raise ValueError("received a value that is not between 0 or 1")
|
||||||
|
|
||||||
|
if mask_token_probability + random_token_prob > 1:
|
||||||
|
raise ValueError("The sum of probabilities is over 1")
|
||||||
|
|
||||||
|
self.__change_token_probability = change_token_probability
|
||||||
|
self.__mask_token_probability = mask_token_probability
|
||||||
|
self.__random_token_prob = random_token_prob
|
||||||
|
self.__rng = random.Random(seed)
|
||||||
|
|
||||||
|
def mask_sequence(
|
||||||
|
self, token_sequence: list[int], max_vocabulary: int, mask_id: int
|
||||||
|
) -> list[int]:
|
||||||
|
|
||||||
|
if mask_id <= max_vocabulary:
|
||||||
|
raise ValueError("mask_id is a value of vocabulary")
|
||||||
|
|
||||||
|
MASKED_SEQUENCE: list[int] = []
|
||||||
|
|
||||||
|
for token in token_sequence:
|
||||||
|
|
||||||
|
if token > max_vocabulary:
|
||||||
|
MASKED_SEQUENCE.append(token)
|
||||||
|
continue
|
||||||
|
|
||||||
|
MASKED_TOKEN = self.__mask(token, max_vocabulary, mask_id)
|
||||||
|
MASKED_SEQUENCE.append(MASKED_TOKEN)
|
||||||
|
|
||||||
|
return MASKED_SEQUENCE
|
||||||
|
|
||||||
|
def __mask(self, token: int, max_vocabulary: int, mask_id: int) -> int:
|
||||||
|
|
||||||
|
if self.__random_probability() > self.__change_token_probability:
|
||||||
|
return token
|
||||||
|
|
||||||
|
MASK_TOKEN_TRESH = self.__mask_token_probability
|
||||||
|
RANDOM_TOKEN_TRESH = MASK_TOKEN_TRESH + self.__random_token_prob
|
||||||
|
CHANCE_PROBABILITY = self.__random_probability()
|
||||||
|
|
||||||
|
# It's over both probabilities, return same token
|
||||||
|
if CHANCE_PROBABILITY > RANDOM_TOKEN_TRESH:
|
||||||
|
return token
|
||||||
|
|
||||||
|
# It's over masking treshold, but lower than random
|
||||||
|
# return random token
|
||||||
|
if CHANCE_PROBABILITY > MASK_TOKEN_TRESH:
|
||||||
|
return self.__random_token(max_vocabulary)
|
||||||
|
|
||||||
|
# It's below masking treshold, mask token
|
||||||
|
return mask_id
|
||||||
|
|
||||||
|
def __random_probability(self) -> float:
|
||||||
|
return self.__rng.random()
|
||||||
|
|
||||||
|
def __random_token(self, max_vocabulary: int) -> int:
|
||||||
|
return self.__rng.randint(0, max_vocabulary)
|
||||||
@@ -0,0 +1,47 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
class TorchMultiHeadAttention(nn.Module):
    """Thin wrapper around ``torch.nn.MultiheadAttention`` built with ``batch_first=True``."""

    def __init__(
        self,
        embedding_dimension: int,
        number_of_attention_heads: int,
        dropout: float = 0.0
    ):
        """Create the wrapped attention layer.

        Args:
            embedding_dimension: Embedding size handed to the torch layer.
            number_of_attention_heads: Number of attention heads.
            dropout: Dropout value handed to the torch layer.
        """
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim=embedding_dimension,
            num_heads=number_of_attention_heads,
            dropout=dropout,
            batch_first=True,
        )

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        key_padding_mask=None,
        attention_mask: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """Run attention and return only the attended values.

        The wrapped layer applies its own projections:
            x * Wq -> Q
            x * Wk -> K
            x * Wv -> V

        Args:
            x_q: Input used as query.
            x_k: Input used as key.
            x_v: Input used as value.
            key_padding_mask: Forwarded unchanged as ``key_padding_mask``.
            attention_mask: Forwarded unchanged as ``attn_mask``.

        Returns:
            The attention output; attention weights are not computed.
        """
        # REMEMBER: torch's attention broadcasts a 2-D attention mask over
        # the batch itself, so no per-batch 3-D mask has to be built here.
        attended, _unused_weights = self.attention(
            query=x_q,
            key=x_k,
            value=x_v,
            key_padding_mask=key_padding_mask,
            need_weights=False,
            attn_mask=attention_mask,
        )
        return attended


# batch_first=False (historical default)
# Layout: (L, N, E)
# L = sequence length (time steps / positions)
# N = batch size
# E = d_model size (embedding)
# batch_first=True
# Layout: (N, L, E) (more natural for many models)
|
||||||
47
Project_Model/Libs/Transformer/Classes/WarmupLR.py
Normal file
47
Project_Model/Libs/Transformer/Classes/WarmupLR.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
from typing import override
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
# custom LR from attention is all you need
|
||||||
|
class WarmupLR(torch.optim.lr_scheduler.LRScheduler):
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
optimizer: torch.optim.Optimizer,
|
||||||
|
warmup_steps: int,
|
||||||
|
embedding_size: int,
|
||||||
|
warming_multiplier: float = -1.5,
|
||||||
|
decaying_multiplier: float = -0.5,
|
||||||
|
multiplicative_factor: float = 1.0,
|
||||||
|
last_epoch: int = -1,
|
||||||
|
) -> None:
|
||||||
|
self.__warmup_steps = warmup_steps
|
||||||
|
self.__embedding_size = embedding_size
|
||||||
|
self.__warming_multiplier = warming_multiplier
|
||||||
|
self.__decaying_multiplier = decaying_multiplier
|
||||||
|
self.__multiplicative_factor = multiplicative_factor
|
||||||
|
super().__init__(optimizer, last_epoch)
|
||||||
|
|
||||||
|
def __scale_at(self, step: int) -> float:
|
||||||
|
step = max(step, 1)
|
||||||
|
return (
|
||||||
|
self.__multiplicative_factor
|
||||||
|
* (self.__embedding_size**self.__decaying_multiplier)
|
||||||
|
* min(
|
||||||
|
step**self.__decaying_multiplier,
|
||||||
|
step * (self.__warmup_steps**self.__warming_multiplier),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
@override
|
||||||
|
def get_lr(self) -> list[float]:
|
||||||
|
torch.optim.lr_scheduler._warn_get_lr_called_within_step(self)
|
||||||
|
|
||||||
|
step = max(self.last_epoch, 1)
|
||||||
|
scale = self.__scale_at(step)
|
||||||
|
return [base_lr * scale for base_lr in self.base_lrs]
|
||||||
|
|
||||||
|
def _get_closed_form_lr(self):
|
||||||
|
step = max(self.last_epoch, 1)
|
||||||
|
scale = self.__scale_at(step)
|
||||||
|
return [base_lr * scale for base_lr in self.base_lrs]
|
||||||
18
Project_Model/Libs/Transformer/Classes/__init__.py
Normal file
18
Project_Model/Libs/Transformer/Classes/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
from .Decoder import Decoder
|
||||||
|
from .Encoder import Encoder
|
||||||
|
from .FeedForwardNetwork import FeedForwardNetwork
|
||||||
|
# from .MultiHeadAttention import MultiheadAttention
|
||||||
|
from .TorchMultiHeadAttention import TorchMultiHeadAttention
|
||||||
|
from .SpannedMasker import SpannedMasker
|
||||||
|
from .DeToken import DeToken
|
||||||
|
from .WarmupLR import WarmupLR
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"Decoder",
|
||||||
|
"Encoder",
|
||||||
|
"FeedForwardNetwork",
|
||||||
|
"TorchMultiHeadAttention",
|
||||||
|
"SpannedMasker",
|
||||||
|
"DeToken",
|
||||||
|
"WarmupLR"
|
||||||
|
]
|
||||||
0
Project_Model/Libs/Transformer/Enums/__init__.py
Normal file
0
Project_Model/Libs/Transformer/Enums/__init__.py
Normal file
55
Project_Model/Libs/Transformer/Models/TrainingModel.py
Normal file
55
Project_Model/Libs/Transformer/Models/TrainingModel.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
import torch
|
||||||
|
import Project_Model.Libs.Embedder as Embedder
|
||||||
|
from ..Classes import Encoder, Decoder, DeToken
|
||||||
|
|
||||||
|
|
||||||
|
class TrainingModel(torch.nn.Module):
    """Encoder-decoder model: two embedders, stacked layers and an output head."""

    def __init__(
        self,
        vocabulary_size: int,
        latent_space: int = 256,
        feed_forward_multiplier: int = 4,
        attention_heads: int = 4,
        layer_number: int = 2,
    ) -> None:
        """Build the embedders, the encoder/decoder stacks and the output head.

        Args:
            vocabulary_size: Passed to both embedders and to the output head.
            latent_space: Model width passed to every sub-module.
            feed_forward_multiplier: The feed-forward width is
                ``latent_space * feed_forward_multiplier``.
            attention_heads: Number of heads passed to each layer.
            layer_number: Number of encoder layers and of decoder layers.
        """
        super().__init__()

        feed_forward_latent_space = latent_space * feed_forward_multiplier

        self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
            vocabulary_size, latent_space
        )
        self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
            vocabulary_size, latent_space
        )

        # Build one fresh module per layer. ``[Encoder(...)] * layer_number``
        # would repeat a single instance, so every "layer" of the stack
        # would share the same parameters.
        encoder_layers = [
            Encoder(latent_space, feed_forward_latent_space, attention_heads)
            for _ in range(layer_number)
        ]
        decoder_layers = [
            Decoder(latent_space, feed_forward_latent_space, attention_heads)
            for _ in range(layer_number)
        ]

        self.__encoder = torch.nn.Sequential(*encoder_layers)
        self.__decoder = torch.nn.Sequential(*decoder_layers)

        self.__detokener = DeToken(latent_space, vocabulary_size)

    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
        """Run the full model and return the output of the ``DeToken`` head.

        Args:
            args: ``(encoder_input, src_padding, decoder_input, tgt_padding)``.
                NOTE(review): presumably token-id tensors and their padding
                masks; shapes are not visible here, confirm with the caller.

        Returns:
            The tensor produced by the ``DeToken`` head.
        """

        encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args

        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
        decoder_tensor = self.__decoder_embedder(decoder_embedder_input)

        # Layers take and return tuples so they can be chained by Sequential.
        encoder_output, _ = self.__encoder((encoder_tensor, src_padding))

        # The encoder output is passed twice to the decoder stack.
        decoder_output, _, _, _, _ = self.__decoder(
            (decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding)
        )

        logits: torch.Tensor = self.__detokener(decoder_output)

        return logits
|
||||||
5
Project_Model/Libs/Transformer/Models/__init__.py
Normal file
5
Project_Model/Libs/Transformer/Models/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from .TrainingModel import TrainingModel
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"TrainingModel"
|
||||||
|
]
|
||||||
19
Project_Model/Libs/Transformer/Utils/__init__.py
Normal file
19
Project_Model/Libs/Transformer/Utils/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
from .attention_mask import get_causal_attention_mask,get_causal_attention_mask_batched
|
||||||
|
from .task_type import TaskType
|
||||||
|
from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask
|
||||||
|
from .inference_masking import inference_masking
|
||||||
|
from .truncate_rdf_list import truncate_rdf_list
|
||||||
|
from .decode_out import tensor2token
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"TaskType",
|
||||||
|
"get_causal_attention_mask",
|
||||||
|
"get_causal_attention_mask_batched",
|
||||||
|
"truncate_sequence",
|
||||||
|
"pad_sequence",
|
||||||
|
"create_padding_mask",
|
||||||
|
"normalize_sequence",
|
||||||
|
"inference_masking",
|
||||||
|
"truncate_rdf_list",
|
||||||
|
"tensor2token"
|
||||||
|
]
|
||||||
11
Project_Model/Libs/Transformer/Utils/attention_mask.py
Normal file
11
Project_Model/Libs/Transformer/Utils/attention_mask.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
import torch
|
||||||
|
|
||||||
|
def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
    """Return a ``(seq_len, seq_len)`` boolean mask, True strictly above the diagonal.

    Entry ``[i, j]`` is True exactly when ``j > i``.
    """
    all_pairs = torch.ones(seq_len, seq_len, dtype=torch.bool)
    # diagonal=1 keeps only the entries strictly above the main diagonal.
    return all_pairs.triu(diagonal=1)
|
||||||
|
|
||||||
|
|
||||||
|
# There is no need for this since torch's MultiheadAttention does it internally.
def get_causal_attention_mask_batched(seq_len: int, batch_size: int) -> torch.Tensor:
    """Return the causal mask repeated along a new leading batch dimension.

    Returns:
        A ``(batch_size, seq_len, seq_len)`` boolean tensor in which every
        slice along the first axis is the same causal mask.
    """
    single_mask = get_causal_attention_mask(seq_len)
    # Add a leading axis, then expand it to batch_size; -1 keeps the two
    # mask axes at their current size.
    return single_mask[None, :, :].expand(batch_size, -1, -1)
|
||||||
27
Project_Model/Libs/Transformer/Utils/decode_out.py
Normal file
27
Project_Model/Libs/Transformer/Utils/decode_out.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
import torch
|
||||||
|
|
||||||
|
|
||||||
|
def tensor2token(tensor: torch.Tensor, end_token: int) -> Generator[list[int], None, None]:
    """Yield each row of ``tensor`` as a token list terminated by ``end_token``.

    A 1-D tensor is treated as a single sequence; a 2-D tensor yields one
    list per row along the first dimension.

    Args:
        tensor: 1-D or 2-D tensor of token ids.
        end_token: Id appended to the end of every yielded list.

    Yields:
        The tokens of one sequence followed by ``end_token``.

    Raises:
        ValueError: If ``tensor`` is neither 1-D nor 2-D. This is a generator,
            so the error surfaces when iteration starts.
    """
    # All three Generator parameters are spelled out: the single-argument
    # form is only accepted at runtime from Python 3.13 on.

    rank = len(tensor.shape)

    if rank < 1 or rank > 2:
        raise ValueError("Shape is not correct")

    # Promote a single sequence to a batch of one so both cases share a loop.
    rows = tensor.unsqueeze(0) if rank == 1 else tensor

    for row in rows:
        token_list: list[int] = row.tolist()
        token_list.append(end_token)
        yield token_list
|
||||||
|
|
||||||
|
|
||||||
13
Project_Model/Libs/Transformer/Utils/inference_masking.py
Normal file
13
Project_Model/Libs/Transformer/Utils/inference_masking.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
def inference_masking(sequence: list[int], mask_token: int, max_vocabulary: int) -> list[int]:
    """Replace every ``mask_token`` with a distinct, increasing sentinel id.

    The first occurrence becomes ``max_vocabulary + 1``, the second
    ``max_vocabulary + 2``, and so on.

    Args:
        sequence: Token ids; **modified in place**.
        mask_token: Placeholder id to look for.
        max_vocabulary: Highest regular id; sentinels start right above it.

    Returns:
        The same list object that was passed in.
    """
    next_sentinel = max_vocabulary + 1

    for position, token in enumerate(sequence):
        if token == mask_token:
            sequence[position] = next_sentinel
            next_sentinel += 1

    return sequence
|
||||||
60
Project_Model/Libs/Transformer/Utils/post_tokenization.py
Normal file
60
Project_Model/Libs/Transformer/Utils/post_tokenization.py
Normal file
@@ -0,0 +1,60 @@
|
|||||||
|
def truncate_sequence(
    sequence: list[int], truncate_at: int, end_token: int, add_ending: bool
) -> list[int]:
    """Return ``sequence`` cut to at most ``truncate_at`` tokens.

    When ``add_ending`` is set the result always finishes with ``end_token``:
    it is appended while there is still room, and it overwrites the last
    kept token only when the sequence already fills ``truncate_at`` slots.

    Args:
        sequence: Token ids; never modified.
        truncate_at: Maximum length of the result; expected to be at least 1.
        end_token: Id marking the end of the sequence.
        add_ending: Whether the result must finish with ``end_token``.

    Returns:
        A new list of at most ``truncate_at`` tokens.
    """

    # There is room for the end token up to and including
    # len == truncate_at - 1, so no real token has to be dropped.
    if len(sequence) < truncate_at:
        # Work on a copy so the caller's list is left untouched.
        shorter = list(sequence)
        if add_ending:
            shorter.append(end_token)
        return shorter

    TRUNCATED_SEQUENCE = sequence[:truncate_at]

    # No free slot left: the end token takes the place of the last kept token.
    if add_ending:
        TRUNCATED_SEQUENCE[-1] = end_token

    return TRUNCATED_SEQUENCE
|
||||||
|
|
||||||
|
|
||||||
|
def pad_sequence(sequence: list[int], pad_until: int, pad_token: int) -> list[int]:
    """Right-pad ``sequence`` with ``pad_token`` up to ``pad_until`` tokens.

    Args:
        sequence: Token ids; never modified.
        pad_until: Desired minimum length.
        pad_token: Id used as filler.

    Returns:
        ``sequence`` itself when it is already long enough, otherwise a new
        list of exactly ``pad_until`` tokens.
    """
    missing = pad_until - len(sequence)

    if missing <= 0:
        return sequence

    return sequence + [pad_token] * missing
|
||||||
|
|
||||||
|
def create_padding_mask(sequence: list[int], pad_token: int) -> list[bool]:
    """Return one flag per token: True where the token equals ``pad_token``."""
    return [token == pad_token for token in sequence]
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_sequence(
    sequence: list[int],
    max_length: int,
    pad_token: int,
    end_token: int,
    add_ending: bool = True
) -> tuple[list[int], list[bool]]:
    """Truncate, pad and build the padding mask for ``sequence``.

    Args:
        sequence: Token ids to normalise.
        max_length: Length used for both truncation and padding.
        pad_token: Id used as filler.
        end_token: Id marking the end of the sequence.
        add_ending: Forwarded to ``truncate_sequence``.

    Returns:
        ``(tokens, padding_mask)``, where ``padding_mask`` comes from
        ``create_padding_mask`` on the padded tokens.
    """
    fitted = pad_sequence(
        truncate_sequence(sequence, max_length, end_token, add_ending),
        max_length,
        pad_token,
    )
    return fitted, create_padding_mask(fitted, pad_token)
|
||||||
6
Project_Model/Libs/Transformer/Utils/task_type.py
Normal file
6
Project_Model/Libs/Transformer/Utils/task_type.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
from enum import Enum, auto
|
||||||
|
|
||||||
|
class TaskType(Enum):
    """Closed set of task identifiers."""

    # Explicit values match what ``auto()`` assigns: 1, 2, 3 in order.
    RDF2TEXT = 1
    MASK = 2
    COMPLETATION = 3
|
||||||
65
Project_Model/Libs/Transformer/Utils/truncate_rdf_list.py
Normal file
65
Project_Model/Libs/Transformer/Utils/truncate_rdf_list.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
from collections import deque
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
def truncate_rdf_list(
|
||||||
|
sequence: list[int],
|
||||||
|
truncation_probability: float,
|
||||||
|
continue_triple_token: int,
|
||||||
|
end_of_triple_token: int,
|
||||||
|
seed: int = random.randint(0, sys.maxsize),
|
||||||
|
) -> tuple[list[int], list[int]]:
|
||||||
|
|
||||||
|
if truncation_probability < 0 or truncation_probability > 1:
|
||||||
|
raise ValueError("A probability must be between 0 and 1")
|
||||||
|
|
||||||
|
RNG = random.Random(seed)
|
||||||
|
|
||||||
|
END_OF_TRIPLES: deque[int] = deque()
|
||||||
|
|
||||||
|
for i in range(0, len(sequence)):
|
||||||
|
|
||||||
|
TOKEN = sequence[i]
|
||||||
|
if TOKEN != end_of_triple_token:
|
||||||
|
continue
|
||||||
|
|
||||||
|
END_OF_TRIPLES.append(i + 1)
|
||||||
|
|
||||||
|
TRIPLES_TOKENS: list[int] = []
|
||||||
|
TARGET_TRIPLES: list[int] = []
|
||||||
|
|
||||||
|
start_of_triple = 0
|
||||||
|
exit_loop = False
|
||||||
|
|
||||||
|
while not exit_loop:
|
||||||
|
|
||||||
|
EOT = END_OF_TRIPLES.popleft()
|
||||||
|
|
||||||
|
TRIPLE = sequence[start_of_triple:EOT]
|
||||||
|
TRIPLES_TOKENS.extend(TRIPLE)
|
||||||
|
|
||||||
|
start_of_triple = EOT
|
||||||
|
|
||||||
|
if RNG.random() < truncation_probability:
|
||||||
|
exit_loop = True
|
||||||
|
|
||||||
|
if len(END_OF_TRIPLES) == 1:
|
||||||
|
exit_loop = True
|
||||||
|
|
||||||
|
TRIPLES_TOKENS.append(
|
||||||
|
continue_triple_token
|
||||||
|
)
|
||||||
|
|
||||||
|
while len(END_OF_TRIPLES) > 0:
|
||||||
|
|
||||||
|
EOT = END_OF_TRIPLES.popleft()
|
||||||
|
|
||||||
|
TRIPLE = sequence[start_of_triple:EOT]
|
||||||
|
TARGET_TRIPLES.extend(TRIPLE)
|
||||||
|
|
||||||
|
start_of_triple = EOT
|
||||||
|
|
||||||
|
|
||||||
|
return (TRIPLES_TOKENS, TARGET_TRIPLES)
|
||||||
|
|
||||||
7
Project_Model/Libs/Transformer/__init__.py
Normal file
7
Project_Model/Libs/Transformer/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
from .Classes import *
|
||||||
|
from .Utils import *
|
||||||
|
from .Models import *
|
||||||
|
|
||||||
|
from . import Classes
|
||||||
|
from . import Utils
|
||||||
|
from . import Models
|
||||||
@@ -1 +1,4 @@
|
|||||||
from . import BPE
|
from . import BPE
|
||||||
|
from . import Embedder
|
||||||
|
from . import Transformer
|
||||||
|
from . import TorchShims
|
||||||
|
|||||||
92
Project_Model/Tests/spanned_masker_test.py
Normal file
92
Project_Model/Tests/spanned_masker_test.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
from functools import reduce
|
||||||
|
from pathlib import Path
|
||||||
|
import pytest
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
import Project_Model.Libs.Transformer as Transformer
|
||||||
|
|
||||||
|
# Shared fixtures, loaded once when the module is imported.
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
SPECIAL_LIST = BPE.default_special_tokens()


class TestSpannedMasker:
    """Checks ``Transformer.SpannedMasker`` on a small RDF corpus."""

    def test_spanned_masking(self):
        """Mask the corpus, print a readable report and check the corruption rate."""

        text = Path("Project_Model/Tests/spanner_file/mask.txt").read_text("utf-8")
        corruption_target = 0.15
        tolerance = 0.15

        tokenizer = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
        vocabulary_size = tokenizer.vocabulary_size

        tokens = tokenizer.encode(text)

        # Every special token except <SUBJ>/<OBJ>/<PRED> is handed to the
        # masker as forbidden.
        maskable_specials: set[int] = set(tokenizer.encode("<SUBJ><OBJ><PRED>"))
        all_specials: set[int] = set(tokenizer.encode("".join(SPECIAL_LIST)))
        illegal_tokens: set[int] = all_specials - maskable_specials

        masker = Transformer.SpannedMasker(
            vocabulary_size, illegal_tokens, corruption_target, 3
        )

        # Ids above the vocabulary cannot be decoded, so they are swapped for
        # a printable placeholder before decoding.
        sentinel_placeholder = tokenizer.encode("*<SOT>")[0]
        end_marker = tokenizer.encode("<EOT>")[0]

        masked_input, target = masker.mask_sequence(tokens)

        uncorrupted = [token for token in masked_input if token <= vocabulary_size]
        corrupted = [token for token in target if token <= vocabulary_size]

        target.append(end_marker)

        masked_input = [
            sentinel_placeholder if token > vocabulary_size else token
            for token in masked_input
        ]
        target = [
            sentinel_placeholder if token > vocabulary_size else token
            for token in target
        ]

        input_text = tokenizer.decode(masked_input)
        target_text = tokenizer.decode(target)

        actual_corruption = len(corrupted) / len(tokens)

        print(f"Original text:\n\n{text}")
        print(f"Inputs:\n\n{input_text}")
        print(f"Targets:\n\n{target_text}")
        # NOTE(review): the label says "Target Tokens" but the value printed
        # is the masked input; confirm which one was intended.
        print(f"Target Tokens:\n\n{masked_input}")

        report = [
            "======================",
            f"Original length: {len(tokens)}",
            f"Uncorrupted Chars: {len(uncorrupted)}",
            f"Corrupted Chars: {len(corrupted)}",
            f"Percentage_corruption: {(len(corrupted)/len(tokens))*100}%",
            "======================",
        ]
        print("\n".join(report))

        # The trailing end marker was appended by this test, so it is skipped.
        for token in target[:-1]:
            assert token not in illegal_tokens

        assert actual_corruption > corruption_target - tolerance
        assert actual_corruption < corruption_target + tolerance


if __name__ == "__main__":
    TestSpannedMasker().test_spanned_masking()
|
||||||
1
Project_Model/Tests/spanner_file/mask.txt
Normal file
1
Project_Model/Tests/spanner_file/mask.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>dbp-dbp:title<OBJ>dbp-dbr:The_Dark_Knight<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:caption<OBJ>Theatrical release poster<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:director<OBJ>dbp-dbr:Christopher_Nolan<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:distributor<OBJ>Warner Bros. Pictures<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:producer<OBJ>Charles Roven<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:producer<OBJ>Christopher Nolan<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:producer<OBJ>Emma Thomas<EOT><SOT><SUBJ>dbp-dbr:The_Dark_Knight<PRED>dbp-dbp:starring<OBJ>Christian Bale<EOT>
|
||||||
BIN
Project_Model/UML/base-model.png
Normal file
BIN
Project_Model/UML/base-model.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 553 KiB |
6019
Project_Model/UML/model.excalidraw.json
Normal file
6019
Project_Model/UML/model.excalidraw.json
Normal file
File diff suppressed because it is too large
Load Diff
162
Project_Model/UML/pipeline-utils.excalidraw.json
Normal file
162
Project_Model/UML/pipeline-utils.excalidraw.json
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "MR81S4quQLdw7ZYl2sbpi",
|
||||||
|
"type": "text",
|
||||||
|
"x": 309,
|
||||||
|
"y": 131,
|
||||||
|
"width": 649,
|
||||||
|
"height": 200,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a0",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 194891189,
|
||||||
|
"version": 243,
|
||||||
|
"versionNonce": 1614205979,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759680941870,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "def truncate(seq: list[int], max_length: int, eos_id: int):\n\n if len(seq) < max_length:\n return seq\n\n seq[max_length - 1] = eos_id\n return seq[:max_length]\n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "def truncate(seq: list[int], max_length: int, eos_id: int):\n\n if len(seq) < max_length:\n return seq\n\n seq[max_length - 1] = eos_id\n return seq[:max_length]\n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "nzMSk0NIeEpMYeUQ1UvVw",
|
||||||
|
"type": "text",
|
||||||
|
"x": 309,
|
||||||
|
"y": 420.5,
|
||||||
|
"width": 594,
|
||||||
|
"height": 350,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a1",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 807602389,
|
||||||
|
"version": 533,
|
||||||
|
"versionNonce": 1455522939,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759681067736,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "def pad(seq: list[int], max_length: int, pad_id: int):\n\n if len(seq) > max_length:\n raise Exception()\n\n if len(seq) == max_length:\n return seq\n \n SEQ_LEN = len(seq)\n PAD_LEN = max_length - SEQ_LEN\n PADDING = [pad_id] * PAD_LEN\n seq.extend(PADDING)\n return seq\n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "def pad(seq: list[int], max_length: int, pad_id: int):\n\n if len(seq) > max_length:\n raise Exception()\n\n if len(seq) == max_length:\n return seq\n \n SEQ_LEN = len(seq)\n PAD_LEN = max_length - SEQ_LEN\n PADDING = [pad_id] * PAD_LEN\n seq.extend(PADDING)\n return seq\n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "8ONOUf0ArJ5DMtxeM_fvG",
|
||||||
|
"type": "text",
|
||||||
|
"x": 299,
|
||||||
|
"y": 805.5,
|
||||||
|
"width": 473,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a2",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1323527029,
|
||||||
|
"version": 294,
|
||||||
|
"versionNonce": 567965627,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759681345616,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "def mask(\n seq: list[int], \n max_bpe_voc_id: int\n mask_id: int,\n masking_probability: MaskingProbability\n)",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "def mask(\n seq: list[int], \n max_bpe_voc_id: int\n mask_id: int,\n masking_probability: MaskingProbability\n)",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "cpsIEF7OpTuNkcxiggRpa",
|
||||||
|
"type": "text",
|
||||||
|
"x": 997,
|
||||||
|
"y": 703,
|
||||||
|
"width": 330,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 516363867,
|
||||||
|
"version": 192,
|
||||||
|
"versionNonce": 430327707,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759681254424,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class MaskingProbability:\n\n + change_token_prob: float\n + mask_token_prob: float\n + same_token_prob: float\n + random_token_prob: float",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class MaskingProbability:\n\n + change_token_prob: float\n + mask_token_prob: float\n + same_token_prob: float\n + random_token_prob: float",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
21
Scripts/DataCleaning/data_output_models/debug_csv.py
Normal file
21
Scripts/DataCleaning/data_output_models/debug_csv.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class Debug_csv():
    """Writes RDF rows to a CSV file for debugging.

    The header row is written when the object is created; every ``write``
    call appends the rows of one DataFrame. Usable as a context manager so
    the file is closed even when an error occurs.
    """

    def __init__(self, output_path: str):
        """Open ``output_path`` for writing and emit the header row.

        Args:
            output_path: Destination file; truncated if it already exists.
        """
        # Explicit UTF-8 keeps the output independent of the platform's
        # default locale encoding.
        self.output = open(output_path, "w", encoding="utf-8")

        # The first row is the header.
        header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        try:
            self.output.write(",".join(header) + "\n")
        except Exception:
            # Do not leak the handle if the very first write fails.
            self.output.close()
            raise

    def __enter__(self):
        """Return the writer itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Close the underlying file."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Append the rows of ``RDF`` to the file, without index or header.

        Args:
            RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||||
@@ -186,3 +186,9 @@ class PipelineApplier():
|
|||||||
# as input two dataframe, one with 2 column
|
# as input two dataframe, one with 2 column
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
    """Normalise the text of the ``ObjectURI`` column.

    Every run of line breaks is replaced by a single ", " and all asterisks
    are removed. The column is converted to the pandas ``string`` dtype.

    Args:
        RDF (pd.DataFrame): frame containing an "ObjectURI" column.

    Returns:
        pd.DataFrame: the same frame object, with "ObjectURI" rewritten
        in place.
    """
    objects = RDF["ObjectURI"].astype("string")
    # Collapse any run of LF, CRLF or lone CR into one separator; the former
    # pattern r"\r?\n+" turned "a\r\n\r\nb" into "a, , b".
    objects = objects.str.replace(r"[\r\n]+", ", ", regex=True)
    # delete all asterisks (literal match, no regex needed)
    objects = objects.str.replace("*", "", regex=False)
    RDF["ObjectURI"] = objects
    return RDF
|
||||||
@@ -6,17 +6,12 @@ from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_
|
|||||||
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
||||||
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
||||||
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
class Pipeline():
|
class Pipeline():
|
||||||
def __init__(self,
|
def __init__(self):
|
||||||
mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv",
|
|
||||||
bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt",
|
|
||||||
text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv",
|
|
||||||
completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv",
|
|
||||||
|
|
||||||
):
|
|
||||||
self.sql_endpoint = SqlEndpoint()
|
self.sql_endpoint = SqlEndpoint()
|
||||||
# classes to manage taskes' datasets
|
# classes to manage taskes' datasets
|
||||||
self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
|
self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
|
||||||
@@ -98,6 +93,8 @@ class Pipeline():
|
|||||||
# other filter
|
# other filter
|
||||||
#
|
#
|
||||||
RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
|
RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
|
||||||
|
# regex on ObjectURI
|
||||||
|
RDF = self.filter_applier.regex_on_objects(RDF)
|
||||||
if RDF.empty:
|
if RDF.empty:
|
||||||
continue
|
continue
|
||||||
RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
|
RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
|
||||||
@@ -119,9 +116,13 @@ class Pipeline():
|
|||||||
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
|
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
|
||||||
self.sql_endpoint.movie_ids = movie_list
|
self.sql_endpoint.movie_ids = movie_list
|
||||||
|
|
||||||
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
|
def generate_csv_debug_file(self, debug_path:str):
|
||||||
self.filter_applier.reduce_movie_list(starting_offset,ending_offset)
|
debug_csv = Debug_csv(debug_path)
|
||||||
|
|
||||||
|
for RDF in self._get_cleaned_movie_rows():
|
||||||
|
debug_csv.write(RDF)
|
||||||
|
|
||||||
|
debug_csv.close()
|
||||||
|
|
||||||
|
|
||||||
# there are a lot of settings to manage
|
# there are a lot of settings to manage
|
||||||
@@ -132,9 +133,10 @@ class Pipeline():
|
|||||||
|
|
||||||
#pipeline = Pipeline()
|
#pipeline = Pipeline()
|
||||||
|
|
||||||
# pipeline.use_toy_dataset()
|
pipeline.use_toy_dataset()
|
||||||
# pipeline.execute_task_bpe_corpus()
|
# pipeline.execute_task_bpe_corpus()
|
||||||
# pipeline.execute_task_rdf_mask()
|
# pipeline.execute_task_rdf_mask()
|
||||||
# pipeline.execute_tasks_rdf_text()
|
# pipeline.execute_tasks_rdf_text()
|
||||||
# pipeline.execute_task_rdf_completation()
|
# pipeline.execute_task_rdf_completation()
|
||||||
# pipeline.execute_all_task()
|
# pipeline.execute_all_task()
|
||||||
|
pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||||
@@ -10,6 +10,7 @@ class SpecialToken(Enum):
|
|||||||
RELATIONSHIP = "<PRED>"
|
RELATIONSHIP = "<PRED>"
|
||||||
OBJECT = "<OBJ>"
|
OBJECT = "<OBJ>"
|
||||||
ABSTRACT = "<ABS>"
|
ABSTRACT = "<ABS>"
|
||||||
|
END_OF_SENTENCE = "<EOS>"
|
||||||
CORPUS_END = "<END>"
|
CORPUS_END = "<END>"
|
||||||
|
|
||||||
## Tasks' Token
|
## Tasks' Token
|
||||||
|
|||||||
57
docs/PAPERS.md
Normal file
57
docs/PAPERS.md
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
# Research Material
|
||||||
|
|
||||||
|
## BPE
|
||||||
|
|
||||||
|
- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
|
||||||
|
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
|
||||||
|
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
|
||||||
|
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
|
||||||
|
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
|
||||||
|
- [Implementing a byte pair encoding(BPE) Tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
|
||||||
|
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
|
||||||
|
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
|
||||||
|
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
|
||||||
|
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
|
||||||
|
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
|
||||||
|
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
|
||||||
|
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
|
||||||
|
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
|
||||||
|
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
|
||||||
|
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
|
||||||
|
|
||||||
|
## Embedder
|
||||||
|
|
||||||
|
- [ROFORMER: ENHANCED TRANSFORMER WITH ROTARY POSITION EMBEDDING](https://arxiv.org/pdf/2104.09864)
|
||||||
|
- [You could have designed state of the art positional encoding](https://huggingface.co/blog/designing-positional-encoding)
|
||||||
|
- [Rotary Embeddings: A Relative Revolution](https://blog.eleuther.ai/rotary-embeddings/)
|
||||||
|
- [Round and Round We Go! What makes Rotary Positional Encodings useful?](https://arxiv.org/html/2410.06205v1)
|
||||||
|
- [Inside RoPE: Rotary Magic into Position Embeddings](https://learnopencv.com/rope-position-embeddings/)
|
||||||
|
- [What Rotary Position Embedding Can Tell Us: Identifying Query and Key Weights Corresponding to Basic Syntactic or High-level Semantic Information](https://openreview.net/pdf?id=e5Mv7iWfVW)
|
||||||
|
- [A gentle introduction to Rotary Position Embedding](https://krasserm.github.io/2022/12/13/rotary-position-embedding/)
|
||||||
|
- [Context-aware Rotary Position Embedding](https://arxiv.org/pdf/2507.23083)
|
||||||
|
- [LIERE: GENERALIZING ROTARY POSITION ENCODINGS TO HIGHER DIMENSIONAL INPUTS](https://openreview.net/pdf?id=xHMMt7r3GW)
|
||||||
|
- [Rotary Positional Embeddings (RoPE)](https://nn.labml.ai/transformers/rope/index.html)
|
||||||
|
- [Decoding Llama3: An explainer for tinkerers](https://hasgeek.com/simrathanspal/the-llama3-guide/sub/decoding-llama3-part-4-rotary-positional-embedding-3K8ZHpdLi6E56N8ejnaWzm)
|
||||||
|
|
||||||
|
## Attention
|
||||||
|
|
||||||
|
- [Standard Self-Attention (Attention is all you need)](https://arxiv.org/pdf/1706.03762)
|
||||||
|
- [TransMLA: Multi-Head Latent Attention Is All You Need](https://arxiv.org/pdf/2502.07864)
|
||||||
|
- [A Gentle Introduction to Multi-Head Latent Attention (MLA)](https://machinelearningmastery.com/a-gentle-introduction-to-multi-head-latent-attention-mla/)
|
||||||
|
- [Understanding Multi-Head Latent Attention](https://planetbanatt.net/articles/mla.html)
|
||||||
|
- [DeepSeek's Multi-Head Latent Attention](https://liorsinai.github.io/machine-learning/2025/02/22/mla.html)
|
||||||
|
- [MatchFormer: Interleaving Attention in Transformers for Feature Matching](https://arxiv.org/pdf/2203.09645)
|
||||||
|
- [FIT: Far-reaching Interleaved Transformers](https://arxiv.org/pdf/2305.12689)
|
||||||
|
- [Gemma explained: What’s new in Gemma 3](https://developers.googleblog.com/en/gemma-explained-whats-new-in-gemma-3/)
|
||||||
|
- [The Llama 4 herd: The beginning of a new era of natively multimodal AI innovation](https://ai.meta.com/blog/llama-4-multimodal-intelligence/)
|
||||||
|
- [Attention was never enough: Tracing the rise of hybrid LLMs](https://www.ai21.com/blog/rise-of-hybrid-llms/)
|
||||||
|
-
|
||||||
|
|
||||||
|
## Spanned Masking
|
||||||
|
|
||||||
|
- [Salient Span Masking for Temporal Understanding](https://arxiv.org/pdf/2303.12860)
|
||||||
|
- [PMI-MASKING: PRINCIPLED MASKING OF CORRELATED SPANS](https://arxiv.org/pdf/2010.01825)
|
||||||
|
|
||||||
|
## Models
|
||||||
|
|
||||||
|
- [What Language Model Architecture and Pretraining Objective Work Best for Zero-Shot Generalization?](https://arxiv.org/pdf/2204.05832)
|
||||||
BIN
environment.yaml
BIN
environment.yaml
Binary file not shown.
@@ -16,3 +16,4 @@ urllib3==2.5.0
|
|||||||
wheel==0.45.1
|
wheel==0.45.1
|
||||||
Wikipedia-API==0.8.1
|
Wikipedia-API==0.8.1
|
||||||
SQLAlchemy
|
SQLAlchemy
|
||||||
|
torch
|
||||||
|
|||||||
Reference in New Issue
Block a user