Compare commits
1 Commits
dev.train...dev.modelt

| Author | SHA1 | Date |
|---|---|---|
|  | 1de2cc59db |  |
Binary file not shown. (17 changed binary files, contents omitted by the diff viewer)
File diff suppressed because one or more lines are too long
125  Playgrounds/doctor.py  Normal file
@@ -0,0 +1,125 @@
import random
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims
from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr
from Project_Model.Libs.Training.logistic_collector import LogitsCollector # import the external collector

# set a fixed seed
torch.manual_seed(0)
random.seed(0)
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()

VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = int(1e3)

PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]

# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)

TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []

for index, row in TOY_DATASET.iterrows():
    RDFs: str = row["RDFs"]
    Abstract: str = row["Abstract"]

    input_tokens = TOKENANO.encode(RDFs) # encoder input ids
    output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)
    decoder_default_tokens = TOKENANO.encode("<SOS>") # decoder input starts with <SOS>

    input_tokens, padding = Transformer.normalize_sequence(
        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    ) # pad/trim + end token
    output_tokens, _ = Transformer.normalize_sequence(
        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    ) # pad/trim + end token
    decoder_default_tokens = Transformer.pad_sequence(
        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN
    ) # pad with PAD up to SENTENCE_LENGTH

    TOY_BATCH_INPUT_LIST.append(input_tokens)
    TOY_BATCH_PADDING_LIST.append(padding)
    TOY_BATCH_TARGET_LIST.append(output_tokens)
    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)

# Training loop
LOSS_HISTORY = []
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes

NANOSOCRATES.train()
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step

current_epoch = 0
BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize

while current_epoch < MAX_EPOCHS:
    # simple fixed mini-batch from the top; later you can shuffle/slice
    enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids
    pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present
    tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)

    # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step
    dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]

    total_loss = 0.0
    collector.reset() # start fresh for this epoch

    T = tgt.size(1) # sequence length
    for t in range(T):
        optimizer.zero_grad(set_to_none=True) # clear grads for this token step

        prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix
        dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix

        # one-step logits given prefix (trainer model expects 4 args now)
        logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t
        collector.add(logits_t) # store logits for decoding later

        loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored
        loss_t.backward() # backprop for this step
        optimizer.step() # update params
        scheduler.step() # Noam/warmup: step per optimizer step

        total_loss = float(loss_t.detach()) # keep last step loss for logging

        # teacher forcing: reveal the correct token for next position
        if t < T - 1:
            dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot

    current_epoch += 1
    print(f"EPOCH {current_epoch}\n\tLoss: {total_loss:.6f}") # simple log
    collector.print_decoded() # print decoded predictions for the batch
@@ -1,263 +0,0 @@
import random
import torch
from pathlib import Path
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TransformerUtils as TUtils
import Project_Model.Libs.TorchShims as torch_shims
import Project_Model.Libs.Batch as Batch

# set a default device
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# set a fixed seed
torch.manual_seed(0)
random.seed(0)

# Get paths
MODEL_DIR = "Assets/Model/curated"
# MODEL_DIR= "Assets/Dataset/Tmp"
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
# TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
MODEL_PATH = Path(f"{MODEL_DIR}/NanoSocrates.zip")


# BPE Init
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)


# Constants
MASK_EXTRA_SPACE = 100
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2

SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<EOS>")[0]
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
CONTINUTE_TOKEN = TOKENANO.encode("<CONTINUERDF>")[0]

SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS


# Spanned_Masker
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)

TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
VALIDATION_BATCHER = Batch.Batcher(
    VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
)
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER, debug=True)


# Model
NANOSOCRATES_TRAIN = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

NANOSOCRATES = Transformer.NanoSocratesCore(
    TOKEN_SPACE_SIZE,
    SENTENCE_LENGTH,
    SOS_TOKEN,
    PAD_TOKEN,
    END_TOKEN,
    CONTINUTE_TOKEN,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

if MODEL_PATH.is_file():
    nanosocrates_dict = torch.load(MODEL_PATH, weights_only=True, map_location=DEVICE)
    NANOSOCRATES_TRAIN.load_state_dict(nanosocrates_dict)

_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
    NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
)

NANOSOCRATES = TUtils.train2inference(
    NANOSOCRATES_TRAIN,
    NANOSOCRATES
)

NANOSOCRATES.eval()
ENCODER_ONLY.eval()
DECODER_ONLY.eval()
NANOSOCRATES_TRAIN.eval()

task_1_metrics = []
task_2_metrics = []
task_3_metrics = []
task_4_metrics = []

example_num = 0
with torch.no_grad():
    for example in TEST_BATCHER.batch(1):

        print(f"DOING Example: {example_num}")

        src_x, tgt_y, pad_x, pad_y, tasktype = example

        enc_x = torch.tensor(src_x)

        ACTUAL_BATCH_SIZE, _ = enc_x.shape
        enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
        tgt = torch.tensor(tgt_y)
        tgt_pad = torch.tensor(pad_y, dtype=torch.bool)

        dec_x = Transformer.get_decoder_input(
            ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
        )
        dec_x[:, 1:] = tgt[:, :-1]
        dec_x_pad = dec_x.eq(PAD_TOKEN)

        out: torch.Tensor = NANOSOCRATES.inference((enc_x, enc_x_pad), tasktype)

        tokens: list[int] = out.tolist()[0]
        tokens.append(END_TOKEN)
        tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, tokens))
        out_string = TOKENANO.decode(tokens)

        exp_tokens: list[int] = tgt_y[0]
        exp_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, exp_tokens))
        exp_string = TOKENANO.decode(exp_tokens)

        enc_tokens: list[int] = src_x[0]
        enc_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, enc_tokens))
        enc_string = TOKENANO.decode(enc_tokens)

        print(f"PROMPT:\n{enc_string}")
        print(f"EXPECTED:\n{exp_string}")
        print(f"ACTUAL:\n{out_string}")

        if tasktype == Batch.TaskType.RDF2TXT:
            example_num += 1
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref_str = TOKENANO.decode(ref)
            pred_str = TOKENANO.decode(pred)

            bleu, rouge, meteor = TUtils.rdf2txt([ref_str], [pred_str])

            task_1_metrics.append(
                [
                    bleu["bleu"], rouge["rougeL"], meteor["meteor"] # type: ignore
                ]
            )

        if tasktype == Batch.TaskType.TEXT2RDF:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens[1:], PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)

            precision, recall = TUtils.txt2rdf(ref, pred)

            task_2_metrics.append(
                [
                    precision["precision"], recall["recall"] # type: ignore
                ]
            )

        if tasktype == Batch.TaskType.MASKING:
            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)

            accuracy = TUtils.accuracy(ref, pred)

            task_3_metrics.append(
                accuracy["accuracy"] # type: ignore
            )

        if tasktype == Batch.TaskType.COMPLETATION:

            ref = TUtils.remove_padding(exp_tokens, PAD_TOKEN, END_TOKEN)
            pred = TUtils.remove_padding(tokens, PAD_TOKEN, END_TOKEN)
            ref, pred = TUtils.balance_paddings(ref, pred, PAD_TOKEN)

            precision, recall = TUtils.txt2rdf(ref, pred)

            task_4_metrics.append(
                [
                    precision["precision"], recall["recall"] # type: ignore
                ]
            )

bleus = [row[0] for row in task_1_metrics]
rouges = [row[1] for row in task_1_metrics]
meteors = [row[2] for row in task_1_metrics]

prec_1 = [row[0] for row in task_2_metrics]
rec_1 = [row[1] for row in task_2_metrics]

acc = task_3_metrics

prec_2 = [row[0] for row in task_4_metrics]
rec_2 = [row[1] for row in task_4_metrics]

BLEU = TUtils.average(bleus)
ROUGE = TUtils.average(rouges)
METEOR = TUtils.average(meteors)

PREC_1 = TUtils.average(prec_1)
REC_1 = TUtils.average(rec_1)
F1_1 = TUtils.f1(PREC_1, REC_1)

ACC = TUtils.average(acc)

PREC_2 = TUtils.average(prec_2)
REC_2 = TUtils.average(rec_2)
F1_2 = TUtils.f1(PREC_2, REC_2)

SEPARATOR = "**************************************************************************"
OUTPUT = "".join([
    f"{SEPARATOR}\n",
    "*\tRDF2TXT:\n",
    f"*\t\tBLEU: {BLEU} - ROUGE: {ROUGE} - METEOR: {METEOR}\n"
    f"{SEPARATOR}\n",
    "*\tTXT2RDF:\n",
    f"*\t\tPRECISION: {PREC_1} - RECALL: {REC_1} - F1: {F1_1}\n"
    f"{SEPARATOR}\n",
    "*\tRDF Completion 1:\n",
    f"*\t\tACCURACY: {ACC}\n"
    f"{SEPARATOR}\n",
    "*\tRDF Completion 2:\n",
    f"*\t\tPRECISION: {PREC_2} - RECALL: {REC_2} - F1: {F1_2}\n"
    f"{SEPARATOR}\n",
    ""
])

print(OUTPUT)

print("\nDEBUG")
print(task_1_metrics)
print(task_2_metrics)
print(task_3_metrics)
print(task_4_metrics)
221  Playgrounds/locistic_test.ipynb  Normal file
@@ -0,0 +1,221 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c8741a8f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EPOCH 1\n",
      "\tLoss: 7.424792\n",
      "[0] \n",
      "[1] \n",
      "[2] \n",
      "[3] \n",
      "[4] \n",
      "[5] \n",
      "[6] \n",
      "[7] \n",
      "[8] \n",
      "[9] \n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import torch\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import Project_Model.Libs.Embedder as Embedder\n",
    "import Project_Model.Libs.BPE as BPE\n",
    "import Project_Model.Libs.Transformer as Transformer\n",
    "import Project_Model.Libs.TorchShims as torch_shims\n",
    "from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr\n",
    "\n",
    "import torch\n",
    "\n",
    "class LogitsCollector:\n",
    "    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:\n",
    "        self.__pad_token = pad_token # used to skip PAD\n",
    "        self.__end_token = end_token # used to stop at END\n",
    "        self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str\n",
    "        self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]\n",
    "\n",
    "    def reset(self) -> None:\n",
    "        self.__steps.clear() # clear history\n",
    "\n",
    "    def add(self, logits_step: torch.Tensor) -> None:\n",
    "        if logits_step.dim() == 3: # handle [B,1,V]\n",
    "            logits_step = logits_step[:, -1, :] # -> [B,V]\n",
    "        self.__steps.append(logits_step.detach()) # store raw logits (detached)\n",
    "\n",
    "    def tokens(self) -> list[list[int]]:\n",
    "        if not self.__steps:\n",
    "            return []\n",
    "        stack = torch.stack(self.__steps, dim=0) # [T,B,V]\n",
    "        probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]\n",
    "        ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]\n",
    "        out: list[list[int]] = []\n",
    "        for row in ids.tolist():\n",
    "            seq: list[int] = []\n",
    "            for tok in row:\n",
    "                if tok == self.__end_token: # stop on END\n",
    "                    break\n",
    "                if tok == self.__pad_token: # skip PAD\n",
    "                    continue\n",
    "                seq.append(tok)\n",
    "            out.append(seq)\n",
    "        return out\n",
    "\n",
    "    def print_decoded(self) -> None:\n",
    "        for i, seq in enumerate(self.tokens()):\n",
    "            try:\n",
    "                text = self.__tokenizer.decode(seq) # decode tokens to string\n",
    "            except Exception:\n",
    "                text = str(seq) # fallback to ids\n",
    "            print(f\"[{i}] {text}\") # simple print\n",
    "\n",
    "\n",
    "# set a fixed seed\n",
    "torch.manual_seed(0)\n",
    "random.seed(0)\n",
    "DEVICE = torch_shims.get_default_device()\n",
    "torch.set_default_device(DEVICE)\n",
    "\n",
    "# BPE Init\n",
    "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
    "SPECIAL_VOC = BPE.default_special_tokens()\n",
    "\n",
    "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
    "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
    "\n",
    "# Constants\n",
    "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
    "EMBEDDED_SIZE = 256\n",
    "FEED_FORWARD_MULTIPLIER = 4\n",
    "ATTENTION_HEADS = 4\n",
    "SENTENCE_LENGTH = 256\n",
    "NUMBER_OF_BLOCKS = 2\n",
    "MAX_EPOCHS = int(1e3)\n",
    "\n",
    "PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
    "END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
    "\n",
    "# Load CSV\n",
    "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
    "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
    "\n",
    "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
    "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
    "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
    "TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
    "\n",
    "for index, row in TOY_DATASET.iterrows():\n",
    "    RDFs: str = row[\"RDFs\"]\n",
    "    Abstract: str = row[\"Abstract\"]\n",
    "\n",
    "    input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
    "    output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
    "    decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
    "\n",
    "    input_tokens, padding = Transformer.normalize_sequence(\n",
    "        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
    "    ) # pad/trim + end token\n",
    "    output_tokens, _ = Transformer.normalize_sequence(\n",
    "        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
    "    ) # pad/trim + end token\n",
    "    decoder_default_tokens = Transformer.pad_sequence(\n",
    "        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
    "    ) # pad with PAD up to SENTENCE_LENGTH\n",
    "\n",
    "    TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
    "    TOY_BATCH_PADDING_LIST.append(padding)\n",
    "    TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
    "    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
    "\n",
    "# Training loop\n",
    "LOSS_HISTORY = []\n",
    "NANOSOCRATES = Transformer.TrainingModel(\n",
    "    TOKEN_SPACE_SIZE,\n",
    "    EMBEDDED_SIZE,\n",
    "    FEED_FORWARD_MULTIPLIER,\n",
    "    ATTENTION_HEADS,\n",
    "    NUMBER_OF_BLOCKS,\n",
    ")\n",
    "\n",
    "collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
    "\n",
    "NANOSOCRATES.train()\n",
    "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
    "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
    "scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step\n",
    "\n",
    "current_epoch = 0\n",
    "BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
    "\n",
    "while current_epoch < MAX_EPOCHS:\n",
    "    # simple fixed mini-batch from the top; later you can shuffle/slice\n",
    "    enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
    "    pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
    "    tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
    "\n",
    "    # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
    "    dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
    "\n",
    "    total_loss = 0.0\n",
    "    collector.reset() # start fresh for this epoch\n",
    "\n",
    "    T = tgt.size(1) # sequence length\n",
    "    for t in range(T):\n",
    "        optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
    "\n",
    "        prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
    "        dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
    "\n",
    "        # one-step logits given prefix (trainer model expects 4 args now)\n",
    "        logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t\n",
    "        collector.add(logits_t) # store logits for decoding later\n",
    "\n",
    "        loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored\n",
    "        loss_t.backward() # backprop for this step\n",
    "        optimizer.step() # update params\n",
    "        scheduler.step() # Noam/warmup: step per optimizer step\n",
    "\n",
    "        total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
    "\n",
    "        # teacher forcing: reveal the correct token for next position\n",
    "        if t < T - 1:\n",
    "            dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
    "\n",
    "    current_epoch += 1\n",
    "    print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
    "    collector.print_decoded() # print decoded predictions for the batch\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deep_learning",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
205  Playgrounds/model-teacher-forcing.ipynb  Normal file
@@ -0,0 +1,205 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0afbf498",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "EPOCH 1\n",
      "\tLoss: 9.174470901489258\n",
      "EPOCH 2\n",
      "\tLoss: 9.20919132232666\n",
      "EPOCH 3\n",
      "\tLoss: 9.227106094360352\n",
      "EPOCH 4\n",
      "\tLoss: 9.172086715698242\n",
      "EPOCH 5\n",
      "\tLoss: 9.180150985717773\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 116\u001b[39m\n\u001b[32m 113\u001b[39m step_target = target_logits[:, i] \u001b[38;5;66;03m# [B]\u001b[39;00m\n\u001b[32m 115\u001b[39m loss = cross_entropy(step_logits,step_target) \u001b[38;5;66;03m# now loss is without softmax\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m116\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\u001b[39;00m\n\u001b[32m 117\u001b[39m last_loss = loss\n\u001b[32m 118\u001b[39m optimizer.step()\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:638\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 595\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Computes the gradient of current tensor wrt graph leaves.\u001b[39;00m\n\u001b[32m 596\u001b[39m \n\u001b[32m 597\u001b[39m \u001b[33;03mThe graph is differentiated using the chain rule. If the tensor is\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 635\u001b[39m \u001b[33;03m used to compute the :attr:`tensors`. Defaults to ``None``.\u001b[39;00m\n\u001b[32m 636\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhandle_torch_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43mTensor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 641\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 642\u001b[39m \u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 643\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 644\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 647\u001b[39m torch.autograd.backward(\n\u001b[32m 648\u001b[39m \u001b[38;5;28mself\u001b[39m, gradient, retain_graph, create_graph, inputs=inputs\n\u001b[32m 649\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/overrides.py:1725\u001b[39m, in \u001b[36mhandle_torch_function\u001b[39m\u001b[34m(public_api, relevant_args, *args, **kwargs)\u001b[39m\n\u001b[32m 1721\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _is_torch_function_mode_enabled():\n\u001b[32m 1722\u001b[39m \u001b[38;5;66;03m# if we're here, the mode must be set to a TorchFunctionStackMode\u001b[39;00m\n\u001b[32m 1723\u001b[39m \u001b[38;5;66;03m# this unsets it and calls directly into TorchFunctionStackMode's torch function\u001b[39;00m\n\u001b[32m 1724\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _pop_mode_temporarily() \u001b[38;5;28;01mas\u001b[39;00m mode:\n\u001b[32m-> \u001b[39m\u001b[32m1725\u001b[39m result = \u001b[43mmode\u001b[49m\u001b[43m.\u001b[49m\u001b[43m__torch_function__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpublic_api\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1726\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m:\n\u001b[32m 1727\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/utils/_device.py:103\u001b[39m, in \u001b[36mDeviceContext.__torch_function__\u001b[39m\u001b[34m(self, func, types, args, kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m _device_constructors() \u001b[38;5;129;01mand\u001b[39;00m kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 102\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mself\u001b[39m.device\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:647\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 639\u001b[39m Tensor.backward,\n\u001b[32m 640\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 645\u001b[39m inputs=inputs,\n\u001b[32m 646\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m647\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/__init__.py:354\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 349\u001b[39m retain_graph = create_graph\n\u001b[32m 351\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m354\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/graph.py:829\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 827\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 828\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m829\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 830\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 831\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 832\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 833\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: "
     ]
    }
   ],
   "source": [
    "import random\n",
    "import torch\n",
    "import pandas as pd\n",
    "from pathlib import Path\n",
    "import Project_Model.Libs.Embedder as Embedder\n",
    "import Project_Model.Libs.BPE as BPE\n",
    "import Project_Model.Libs.Transformer as Transformer\n",
    "import Project_Model.Libs.TorchShims as torch_shims\n",
    "\n",
    "# set a fixed seed\n",
    "torch.manual_seed(0)\n",
    "random.seed(0)\n",
    "DEVICE = torch_shims.get_default_device()\n",
    "torch.set_default_device(DEVICE)\n",
    "\n",
    "# set a default device\n",
    "\n",
    "# BPE Init\n",
    "VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
    "SPECIAL_VOC = BPE.default_special_tokens()\n",
    "\n",
    "VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
    "TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
    "\n",
    "\n",
    "# Constants\n",
    "TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
    "EMBEDDED_SIZE = 256\n",
    "FEED_FORWARD_MULTIPLIER = 4\n",
    "ATTENTION_HEADS = 4\n",
    "SENTENCE_LENGTH = 256\n",
    "NUMBER_OF_BLOCKS = 2\n",
    "MAX_EPOCHS = int(1e3)\n",
    "\n",
    "\n",
    "PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
    "END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
    "\n",
    "\n",
    "# Load CSV\n",
    "TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
    "\n",
    "TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
    "\n",
    "TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
    "TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
    "TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
    "TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
    "\n",
    "\n",
    "for index, row in TOY_DATASET.iterrows():\n",
    "\n",
    "    RDFs: str = row[\"RDFs\"]\n",
    "    Abstract: str = row[\"Abstract\"]\n",
    "\n",
    "    input_tokens = TOKENANO.encode(RDFs)\n",
    "    output_tokens = TOKENANO.encode(Abstract)[1:]\n",
    "    decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
    "\n",
    "    input_tokens, padding = Transformer.normalize_sequence(\n",
    "        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
    "    )\n",
    "    output_tokens, _ = Transformer.normalize_sequence(\n",
    "        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
    "    )\n",
    "    decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
    "        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
    "    )\n",
    "\n",
    "    TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
    "    TOY_BATCH_PADDING_LIST.append(padding)\n",
    "    TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
    "    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
    "\n",
    "# Training loop\n",
    "LOSS_HISTORY = []\n",
    "NANOSOCRATES = Transformer.TrainingModel(\n",
    "    TOKEN_SPACE_SIZE,\n",
    "    EMBEDDED_SIZE,\n",
    "    FEED_FORWARD_MULTIPLIER,\n",
    "    ATTENTION_HEADS,\n",
    "    NUMBER_OF_BLOCKS\n",
    ")\n",
    "\n",
    "NANOSOCRATES.train() # nothing important, activates dropout etc\n",
    "cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
    "optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
    "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
    "\n",
    "last_loss = 0\n",
    "\n",
    "current_epoch = 0\n",
    "while current_epoch < MAX_EPOCHS:\n",
    "\n",
    "    encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
    "    decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
    "    padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
    "    target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]]) # Transform target into logits\n",
    "\n",
    "    optimizer.zero_grad() # to clear gradient\n",
    "\n",
    "    last_loss = 0.0\n",
    "\n",
    "    for i in range(0, SENTENCE_LENGTH):\n",
    "\n",
    "        # optimizer.zero_grad()\n",
    "        # forward\n",
    "        logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
    "        # probabilities = torch.softmax(logits,2)\n",
    "\n",
    "        step_logits = logits[:, i, :] # [B, V]\n",
    "        step_target = target_logits[:, i] # [B]\n",
    "\n",
    "        loss = cross_entropy(step_logits,step_target) # now loss is without softmax\n",
    "        loss.backward() # DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\n",
    "        last_loss = loss\n",
    "        optimizer.step()\n",
    "        optimizer.zero_grad()\n",
    "        scheduler.step()\n",
    "\n",
    "        probabilities = torch.softmax(logits,2)\n",
    "        most_probable_tokens = torch.argmax(probabilities, 2)\n",
    "        if i < SENTENCE_LENGTH - 1:\n",
    "            decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
    "\n",
    "\n",
    "    current_epoch += 1\n",
    "\n",
    "    if current_epoch % 1 == 0:\n",
    "        print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deep_learning",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
@@ -1,472 +0,0 @@
|
|||||||
import random
|
|
||||||
import sys
|
|
||||||
import torch
|
|
||||||
import pandas as pd
|
|
||||||
from pathlib import Path
|
|
||||||
import Project_Model.Libs.Embedder as Embedder
|
|
||||||
import Project_Model.Libs.BPE as BPE
|
|
||||||
import Project_Model.Libs.Transformer as Transformer
|
|
||||||
import Project_Model.Libs.TransformerUtils as TUtils
|
|
||||||
import Project_Model.Libs.TorchShims as torch_shims
|
|
||||||
import Project_Model.Libs.Batch as Batch
|
|
||||||
from Project_Model.Libs.Training.loss_saver import Log
|
|
||||||
|
|
||||||
# set a fixed seed
|
|
||||||
torch.manual_seed(0)
|
|
||||||
random.seed(0)
|
|
||||||
|
|
||||||
|
|
||||||
# set a default device
|
|
||||||
DEVICE = torch_shims.get_default_device()
|
|
||||||
torch.set_default_device(DEVICE)
|
|
||||||
|
|
||||||
|
|
||||||
# Get paths
|
|
||||||
CHECKPOINT_DIR = "Assets/Dataset/Tmp"
|
|
||||||
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
|
|
||||||
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
|
|
||||||
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
|
|
||||||
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
|
|
||||||
CHECKPOINT_PATH = Path(f"{CHECKPOINT_DIR}/NanoSocrates.zip")
|
|
||||||
|
|
||||||
NANO_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/nano_optim.zip")
|
|
||||||
ENC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/enc_optim.zip")
|
|
||||||
DEC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/dec_optim.zip")
|
|
||||||
LAST_EPOCH_PATH = Path(f"{CHECKPOINT_DIR}/last_epoch.txt")
|
|
||||||
|
|
||||||
# log saver:
|
|
||||||
loss_saver = Log(f"{CHECKPOINT_DIR}/log_loss.csv")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# BPE Init
|
|
||||||
SPECIAL_VOC = BPE.default_special_tokens()
|
|
||||||
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
|
|
||||||
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
|
|
||||||
|
|
||||||
|
|
||||||
# Constants
|
|
||||||
MASK_EXTRA_SPACE = 100
|
|
||||||
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
|
|
||||||
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
|
|
||||||
EMBEDDED_SIZE = 256
|
|
||||||
FEED_FORWARD_MULTIPLIER = 4
|
|
||||||
ATTENTION_HEADS = 4
|
|
||||||
SENTENCE_LENGTH = 256
|
|
||||||
NUMBER_OF_BLOCKS = 2
|
|
||||||
MAX_EPOCHS = int(300)
|
|
||||||
PRETRAIN_EPOCHS = int(20)
|
|
||||||
WARMUP_EPOCHS = int(30)
|
|
||||||
MINI_BATCH_SIZE = 20
|
|
||||||
VALIDATION_STEPS = 10
|
|
||||||
CHECKPOINT_STEPS = VALIDATION_STEPS
|
|
||||||
PATIENCE = 4
|
|
||||||
CURRENT_EPOCH = -1 if not LAST_EPOCH_PATH.is_file() else int(LAST_EPOCH_PATH.read_text())
|
|
||||||
VERBOSE = False
|
|
||||||
LEARNING_RATE = 0.05
|
|
||||||
LABEL_SMOOTHING = 0.01
|
|
||||||
|
|
||||||
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
|
|
||||||
|
|
||||||
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
|
|
||||||
END_TOKEN = TOKENANO.encode("<END>")[0]
|
|
||||||
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
|
|
||||||
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
|
|
||||||
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
|
|
||||||
MASK_TOKEN = TOKENANO.encode("<MASK>")[0]
|
|
||||||
|
|
||||||
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
|
|
||||||
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
|
|
||||||
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
|
|
||||||
|
|
||||||
|
|
||||||
# Spanned_Masker
|
|
||||||
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS, average_span=4)
|
|
||||||
|
|
||||||
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
|
|
||||||
VALIDATION_BATCHER = Batch.Batcher(
|
|
||||||
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
|
|
||||||
)
|
|
||||||
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
|
|
||||||
|
|
||||||
|
|
||||||
# Model
|
|
||||||
NANOSOCRATES = Transformer.TrainingModel(
|
|
||||||
TOKEN_SPACE_SIZE,
|
|
||||||
EMBEDDED_SIZE,
|
|
||||||
FEED_FORWARD_MULTIPLIER,
|
|
||||||
ATTENTION_HEADS,
|
|
||||||
NUMBER_OF_BLOCKS,
|
|
||||||
)
|
|
||||||
|
|
||||||
if CHECKPOINT_PATH.is_file():
|
|
||||||
nanosocrates_dict = torch.load(CHECKPOINT_PATH, weights_only=True)
|
|
||||||
NANOSOCRATES.load_state_dict(nanosocrates_dict)
|
|
||||||
|
|
||||||
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
|
|
||||||
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Training constants
|
|
||||||
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
|
|
||||||
encoder_ce = torch.nn.CrossEntropyLoss( label_smoothing=LABEL_SMOOTHING)
|
|
||||||
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN, label_smoothing=LABEL_SMOOTHING)
|
|
||||||
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters(), LEARNING_RATE)
|
|
||||||
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters(), LEARNING_RATE)
|
|
||||||
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters(), LEARNING_RATE)
|
|
||||||
|
|
||||||
if NANO_OPTIM_PATH.is_file():
|
|
||||||
optim_dict = torch.load(NANO_OPTIM_PATH)
|
|
||||||
nano_optim.load_state_dict(optim_dict)
|
|
||||||
|
|
||||||
if ENC_OPTIM_PATH.is_file():
|
|
||||||
optim_dict = torch.load(ENC_OPTIM_PATH)
|
|
||||||
encoder_only_optim.load_state_dict(optim_dict)
|
|
||||||
|
|
||||||
if DEC_OPTIM_PATH.is_file():
|
|
||||||
optim_dict = torch.load(DEC_OPTIM_PATH)
|
|
||||||
decoder_only_optim.load_state_dict(optim_dict)
|
|
||||||
|
|
||||||
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH)
|
|
||||||
encoder_only_scheduler = Transformer.WarmupLR(
|
|
||||||
encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
|
|
||||||
)
|
|
||||||
decoder_only_scheduler = Transformer.WarmupLR(
|
|
||||||
decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
|
|
||||||
)
|
|
||||||
|
|
||||||
current_epoch = CURRENT_EPOCH + 2
|
|
||||||
patience = 0
|
|
||||||
|
|
||||||
|
|
||||||
average_loss_validation = {
|
|
||||||
"txt": float("inf"),
|
|
||||||
"encoder_only": float("inf"),
|
|
||||||
"decoder_only": float("inf"),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
while current_epoch < MAX_EPOCHS:
|
|
||||||
|
|
||||||
NANOSOCRATES.train()
|
|
||||||
ENCODER_ONLY.train()
|
|
||||||
DECODER_ONLY.train()
|
|
||||||
|
|
||||||
text_batch_losses = []
|
|
||||||
encoder_batch_losses = []
|
|
||||||
decoder_batch_losses = []
|
|
||||||
|
|
||||||
batch_counter = 0
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"EPOCH {current_epoch} STARTING")
|
|
||||||
|
|
||||||
for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
|
|
||||||
|
|
||||||
batch_counter += 1
|
|
||||||
|
|
||||||
src_x, tgt_y, pad_x, pad_y, tasktype = batch
|
|
||||||
|
|
||||||
enc_x = torch.tensor(src_x)
|
|
||||||
|
|
||||||
ACTUAL_BATCH_SIZE, _ = enc_x.shape
|
|
||||||
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
|
|
||||||
tgt = torch.tensor(tgt_y)
|
|
||||||
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
|
|
||||||
|
|
||||||
dec_x = Transformer.get_decoder_input(
|
|
||||||
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
|
|
||||||
)
|
|
||||||
dec_x[:, 1:] = tgt[:, :-1]
|
|
||||||
dec_x_pad = dec_x.eq(PAD_TOKEN)
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
for s in TUtils.decode_batch(enc_x, TOKENANO, MASK_TOKEN):
|
|
||||||
print("Input")
|
|
||||||
print(s)
|
|
||||||
|
|
||||||
for s in TUtils.decode_batch(enc_x_pad, TOKENANO, MASK_TOKEN):
|
|
||||||
print("Encoder Padding mask")
|
|
||||||
print(s)
|
|
||||||
|
|
||||||
for s in TUtils.decode_batch(tgt, TOKENANO, MASK_TOKEN):
|
|
||||||
print("Desired Output")
|
|
||||||
print(s)
|
|
||||||
a_dx = dec_x[:,:]
|
|
||||||
a_dx[:, -1]= END_TOKEN
|
|
||||||
for s in TUtils.decode_batch(a_dx, TOKENANO, MASK_TOKEN):
|
|
||||||
print("Decoder Input")
|
|
||||||
print(s)
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"\tBATCH {batch_counter} Starting")
|
|
||||||
|
|
||||||
# Task 1 and Task 2
|
|
||||||
if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
|
|
||||||
|
|
||||||
nano_optim.zero_grad()
|
|
||||||
|
|
||||||
pred_logits: torch.Tensor = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
|
|
||||||
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt)
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
nano_optim.step()
|
|
||||||
|
|
||||||
text_batch_losses.append(loss)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Pretrain first
|
|
||||||
if current_epoch < PRETRAIN_EPOCHS:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 3
|
|
||||||
if tasktype == Batch.TaskType.MASKING:
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
|
|
||||||
|
|
||||||
encoder_only_optim.zero_grad()
|
|
||||||
|
|
||||||
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
# print(torch.max(tgt))
|
|
||||||
|
|
||||||
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
encoder_only_optim.step()
|
|
||||||
|
|
||||||
exp_tokens: list[int] = tgt_y[0]
|
|
||||||
exp_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, exp_tokens))
|
|
||||||
exp_string = TOKENANO.decode(exp_tokens)
|
|
||||||
|
|
||||||
enc_tokens: list[int] = src_x[0]
|
|
||||||
enc_tokens = list(map(lambda x: MASK_TOKEN if x > TOKENANO.vocabulary_size else x, enc_tokens))
|
|
||||||
enc_string = TOKENANO.decode(enc_tokens)
|
|
||||||
|
|
||||||
print(f"PROMPT:\n{enc_string}")
|
|
||||||
print(f"EXPECTED:\n{exp_string}")
|
|
||||||
|
|
||||||
|
|
||||||
encoder_batch_losses.append(loss.item())
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 4
|
|
||||||
if tasktype == Batch.TaskType.COMPLETATION:
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
|
|
||||||
|
|
||||||
decoder_only_optim.zero_grad()
|
|
||||||
|
|
||||||
pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
|
|
||||||
loss: torch.Tensor = decoder_ce(pred_logits, tgt)
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
decoder_only_optim.step()
|
|
||||||
|
|
||||||
decoder_batch_losses.append(
|
|
||||||
loss
|
|
||||||
)
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
nano_scheduler.step()
|
|
||||||
encoder_only_scheduler.step()
|
|
||||||
decoder_only_scheduler.step()
|
|
||||||
|
|
||||||
current_epoch += 1
|
|
||||||
|
|
||||||
if current_epoch % VALIDATION_STEPS == 0:
|
|
||||||
|
|
||||||
NANOSOCRATES.eval()
|
|
||||||
ENCODER_ONLY.eval()
|
|
||||||
DECODER_ONLY.eval()
|
|
||||||
|
|
||||||
with torch.no_grad():
|
|
||||||
txt_avg_batch_losses = []
|
|
||||||
enc_avg_batch_losses = []
|
|
||||||
dec_avg_batch_losses = []
|
|
||||||
|
|
||||||
for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
|
|
||||||
|
|
||||||
src_x, tgt_y, pad_x, pad_y, tasktype = batch
|
|
||||||
|
|
||||||
enc_x = torch.tensor(src_x)
|
|
||||||
|
|
||||||
ACTUAL_BATCH_SIZE, _ = enc_x.shape
|
|
||||||
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
|
|
||||||
tgt = torch.tensor(tgt_y)
|
|
||||||
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
|
|
||||||
|
|
||||||
dec_x = Transformer.get_decoder_input(
|
|
||||||
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
|
|
||||||
)
|
|
||||||
dec_x[:, 1:] = tgt[:, :-1]
|
|
||||||
dec_x_pad = dec_x.eq(PAD_TOKEN)
|
|
||||||
|
|
||||||
# Task 1 and Task 2
|
|
||||||
if (
|
|
||||||
tasktype == Batch.TaskType.RDF2TXT
|
|
||||||
or tasktype == Batch.TaskType.TEXT2RDF
|
|
||||||
):
|
|
||||||
|
|
||||||
|
|
||||||
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
|
|
||||||
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
|
|
||||||
loss: torch.Tensor = nano_cross_entropy(
|
|
||||||
pred_logits, tgt
|
|
||||||
)
|
|
||||||
|
|
||||||
txt_avg_batch_losses.append(loss)
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Pretrain first
|
|
||||||
if current_epoch <= PRETRAIN_EPOCHS:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 3
|
|
||||||
if tasktype == Batch.TaskType.MASKING:
|
|
||||||
|
|
||||||
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
|
|
||||||
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
|
|
||||||
|
|
||||||
enc_avg_batch_losses.append(loss.item())
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 4
|
|
||||||
if tasktype == Batch.TaskType.COMPLETATION:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
|
|
||||||
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
|
|
||||||
loss: torch.Tensor = decoder_ce(pred_logits, tgt)
|
|
||||||
|
|
||||||
|
|
||||||
dec_avg_batch_losses.append(loss.item())
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)
|
|
||||||
enc_avg_loss = float("inf")
|
|
||||||
dec_avg_loss = float("inf")
|
|
||||||
|
|
||||||
if current_epoch > PRETRAIN_EPOCHS:
|
|
||||||
enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)
|
|
||||||
dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)
|
|
||||||
|
|
||||||
if current_epoch < PRETRAIN_EPOCHS:
|
|
||||||
|
|
||||||
if txt_avg_loss < average_loss_validation["txt"]:
|
|
||||||
average_loss_validation["txt"] = txt_avg_loss
|
|
||||||
else:
|
|
||||||
patience += 1
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"losing a patience, current irritation: {patience}")
|
|
||||||
else:
|
|
||||||
|
|
||||||
counter = 0
|
|
||||||
|
|
||||||
if txt_avg_loss > average_loss_validation["txt"]:
|
|
||||||
|
|
||||||
if VERBOSE:
|
|
||||||
print("txt average is higher than lowest")
|
|
||||||
counter += 1
|
|
||||||
else:
|
|
||||||
average_loss_validation["txt"] = txt_avg_loss
|
|
||||||
|
|
||||||
if enc_avg_loss > average_loss_validation["encoder_only"]:
|
|
||||||
if VERBOSE:
|
|
||||||
print("masking average is higher than lowest")
|
|
||||||
counter += 1
|
|
||||||
else:
|
|
||||||
average_loss_validation["encoder_only"] = enc_avg_loss
|
|
||||||
|
|
||||||
if dec_avg_loss > average_loss_validation["decoder_only"]:
|
|
||||||
if VERBOSE:
|
|
||||||
print("decoding only average is higher than lowest")
|
|
||||||
counter += 1
|
|
||||||
else:
|
|
||||||
average_loss_validation["decoder_only"] = dec_avg_loss
|
|
||||||
|
|
||||||
if counter > 1:
|
|
||||||
patience += 1
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"losing a patience, current irritation: {patience}")
|
|
||||||
|
|
||||||
|
|
||||||
if counter == 0:
|
|
||||||
patience = max(0, patience - 1)
|
|
||||||
if VERBOSE:
|
|
||||||
print(f"all good, gaining a patience, current irritation: {patience}")
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
txt_train_avg_loss = sum(text_batch_losses) / len(text_batch_losses)
|
|
||||||
|
|
||||||
enc_avg_train_loss = float("inf")
|
|
||||||
dec_avg_train_loss = float("inf")
|
|
||||||
|
|
||||||
if current_epoch > PRETRAIN_EPOCHS:
|
|
||||||
try:
    enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)
    dec_avg_train_loss = sum(decoder_batch_losses) / len(decoder_batch_losses)
except ZeroDivisionError:
    # no masking/completion batches have run yet (still pretraining): keep the inf defaults
    pass
|
|
||||||
|
|
||||||
# write on log
|
|
||||||
loss_saver.write([current_epoch, txt_train_avg_loss, enc_avg_train_loss, dec_avg_train_loss, txt_avg_loss, enc_avg_loss, dec_avg_loss])
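loss_saver comes from the project's training utilities and its exact API is not shown here; a minimal stand-in that writes the same seven-column row could look like this (hypothetical helper, not the project's implementation):
import csv
from pathlib import Path

class CsvLossSaver:
    def __init__(self, path: Path, header: list[str]) -> None:
        self._path = path
        if not path.exists():
            with open(path, "w", newline="", encoding="utf-8") as f:
                csv.writer(f).writerow(header)

    def write(self, row: list) -> None:
        with open(self._path, "a", newline="", encoding="utf-8") as f:
            csv.writer(f).writerow(row)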
|
|
||||||
SEPARATOR = "================================================================================================================"
|
|
||||||
DEBUG_TEXT = "".join(
|
|
||||||
[
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
f"EPOCH {current_epoch}\n",
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
f"Train Losses:\n",
|
|
||||||
f"\tAvg Losses:\n",
|
|
||||||
f"\t\tavg_txt: {txt_train_avg_loss} - avg_enc: {enc_avg_train_loss} - avg_dec: {dec_avg_train_loss}\n",
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
f"Validation Losses:\n",
|
|
||||||
f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction_loss: {dec_avg_loss}\n",
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print(DEBUG_TEXT)
|
|
||||||
|
|
||||||
# Warn about patience
|
|
||||||
if patience == PATIENCE:
|
|
||||||
print("Model is likely overfitting, so let's stop here")
|
|
||||||
|
|
||||||
# SAVE MODEL
|
|
||||||
if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
|
|
||||||
print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
|
|
||||||
torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
|
|
||||||
torch.save(nano_optim.state_dict(), NANO_OPTIM_PATH)
|
|
||||||
torch.save(encoder_only_optim.state_dict(), ENC_OPTIM_PATH)
|
|
||||||
torch.save(decoder_only_optim.state_dict(), DEC_OPTIM_PATH)
|
|
||||||
with open(LAST_EPOCH_PATH, "w", encoding="utf-8") as last_epoch_file:
    last_epoch_file.write(f"{current_epoch}")
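Restoring a run from these files is the mirror image; a sketch using the standard PyTorch API and the paths defined earlier in the script (the resume logic itself is an assumption, not part of the original):
state = torch.load(CHECKPOINT_PATH, map_location=DEVICE)
NANOSOCRATES.load_state_dict(state)
nano_optim.load_state_dict(torch.load(NANO_OPTIM_PATH, map_location=DEVICE))
encoder_only_optim.load_state_dict(torch.load(ENC_OPTIM_PATH, map_location=DEVICE))
decoder_only_optim.load_state_dict(torch.load(DEC_OPTIM_PATH, map_location=DEVICE))
current_epoch = int(LAST_EPOCH_PATH.read_text(encoding="utf-8"))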
|
|
||||||
|
|
||||||
|
|
||||||
if patience == PATIENCE:
|
|
||||||
exit(0)
|
|
||||||
@@ -11,7 +11,13 @@
|
|||||||
"output_type": "stream",
|
"output_type": "stream",
|
||||||
"text": [
|
"text": [
|
||||||
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
|
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
|
||||||
" return func(*args, **kwargs)\n"
|
" return func(*args, **kwargs)\n",
|
||||||
|
"252.87s - name 'tensor' is not defined\n",
|
||||||
|
"Traceback (most recent call last):\n",
|
||||||
|
" File \"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\debugpy\\_vendored\\pydevd\\_pydevd_bundle\\pydevd_vars.py\", line 636, in change_attr_expression\n",
|
||||||
|
" value = eval(expression, frame.f_globals, frame.f_locals)\n",
|
||||||
|
" File \"<string>\", line 1, in <module>\n",
|
||||||
|
"NameError: name 'tensor' is not defined\n"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -19,9 +25,15 @@
|
|||||||
"evalue": "",
|
"evalue": "",
|
||||||
"output_type": "error",
|
"output_type": "error",
|
||||||
"traceback": [
|
"traceback": [
|
||||||
"\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
|
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel."
|
||||||
"\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
|
]
|
||||||
"\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
|
},
|
||||||
|
{
|
||||||
|
"ename": "",
|
||||||
|
"evalue": "",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel. \n",
|
||||||
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
@@ -56,9 +68,9 @@
|
|||||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||||
"EMBEDDED_SIZE = 256\n",
|
"EMBEDDED_SIZE = 256\n",
|
||||||
"FEED_FORWARD_MULTIPLIER = 4\n",
|
"FEED_FORWARD_MULTIPLIER = 4\n",
|
||||||
"ATTENTION_HEADS = 8\n",
|
"ATTENTION_HEADS = 4\n",
|
||||||
"SENTENCE_LENGTH = 256\n",
|
"SENTENCE_LENGTH = 256\n",
|
||||||
"NUMBER_OF_BLOCKS = 4\n",
|
"NUMBER_OF_BLOCKS = 2\n",
|
||||||
"MAX_EPOCHS = int(1e3)\n",
|
"MAX_EPOCHS = int(1e3)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -93,26 +105,7 @@
|
|||||||
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
|
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
|
||||||
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
|
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
|
||||||
" TOY_BATCH_PADDING_LIST.append(padding)\n",
|
|
||||||
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
|
|
||||||
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
|
|
||||||
"\n",
|
|
||||||
" output_tokens = TOKENANO.encode(RDFs)\n",
|
|
||||||
" input_tokens = TOKENANO.encode(Abstract)[1:]\n",
|
|
||||||
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
|
|
||||||
"\n",
|
|
||||||
" input_tokens, padding = Transformer.normalize_sequence(\n",
|
|
||||||
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
|
||||||
" )\n",
|
|
||||||
" output_tokens, _ = Transformer.normalize_sequence(\n",
|
|
||||||
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
|
||||||
" )\n",
|
|
||||||
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
|
|
||||||
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False\n",
|
|
||||||
" )\n",
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
||||||
@@ -131,7 +124,7 @@
|
|||||||
")\n",
|
")\n",
|
||||||
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
||||||
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
|
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
|
||||||
"scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)\n",
|
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
|
||||||
"last_loss = 0\n",
|
"last_loss = 0\n",
|
||||||
"current_epoch = 0\n",
|
"current_epoch = 0\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -139,44 +132,32 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" optimizer.zero_grad()\n",
|
" optimizer.zero_grad()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])\n",
|
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
|
||||||
" decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])\n",
|
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
|
||||||
" src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)\n",
|
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Transform target into logits\n",
|
" # Transform target into logits\n",
|
||||||
" target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])\n",
|
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]])\n",
|
||||||
"\n",
|
"\n",
|
||||||
" last_loss = 0\n",
|
" last_loss = 0\n",
|
||||||
" last_prediction: torch.Tensor\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" for i in range(0, SENTENCE_LENGTH):\n",
|
" for i in range(0, SENTENCE_LENGTH):\n",
|
||||||
"\n",
|
"\n",
|
||||||
" optimizer.zero_grad()\n",
|
" optimizer.zero_grad()\n",
|
||||||
" tgt_padding = decoder_list.eq(PAD_TOKEN)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" logits: torch.Tensor = NANOSOCRATES((encoder_list, src_padding, decoder_list, tgt_padding))\n",
|
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
|
||||||
" prob = torch.softmax(logits, 2)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" most_probable_tokens = torch.argmax(prob, 2)\n",
|
" most_probable_tokens = torch.argmax(logits, 2)\n",
|
||||||
" last_prediction = most_probable_tokens\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
" logits = logits[:,:i,:]\n",
|
" logits = logits[:,i,:]\n",
|
||||||
" logits = logits.permute(0, 2, 1)\n",
|
|
||||||
"\n",
|
|
||||||
" loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])\n",
|
|
||||||
" # loss : torch.Tensor = cross_entropy(logits, target_logits)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
|
" loss = cross_entropy(logits, target_logits[:,i])\n",
|
||||||
" last_loss = loss\n",
|
" last_loss = loss\n",
|
||||||
" loss.backward()\n",
|
|
||||||
" optimizer.step()\n",
|
" optimizer.step()\n",
|
||||||
" scheduler.step()\n",
|
" scheduler.step()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" if i < SENTENCE_LENGTH - 1:\n",
|
" if i < SENTENCE_LENGTH - 1:\n",
|
||||||
" decoder_list[:,i+1] = target_logits[:,i]\n",
|
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
" current_epoch += 1\n",
|
" current_epoch += 1\n",
|
||||||
@@ -184,14 +165,6 @@
|
|||||||
" if current_epoch % 1 == 0:\n",
|
" if current_epoch % 1 == 0:\n",
|
||||||
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
|
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
" for encoded_sentence, expected_sentence in zip(\n",
|
|
||||||
" Transformer.tensor2token(last_prediction[:,:], END_TOKEN), # type: ignore\n",
|
|
||||||
" Transformer.tensor2token(target_logits[:,:], END_TOKEN)\n",
|
|
||||||
" ):\n",
|
|
||||||
" decoded_sentence = TOKENANO.decode(encoded_sentence)\n",
|
|
||||||
" decoded_target = TOKENANO.decode(expected_sentence)\n",
|
|
||||||
" print(f\"\\tACTUAL:\\n\\t\\t{decoded_sentence}\\n\\tEXPECTED:\\n\\t\\t{decoded_target}\\n\")\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|||||||
@@ -1,509 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "adbef43f",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [
|
|
||||||
{
|
|
||||||
"name": "stderr",
|
|
||||||
"output_type": "stream",
|
|
||||||
"text": [
|
|
||||||
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
|
|
||||||
" return func(*args, **kwargs)\n",
|
|
||||||
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\optim\\lr_scheduler.py:192: UserWarning: Detected call of `lr_scheduler.step()` before `optimizer.step()`. In PyTorch 1.1.0 and later, you should call them in the opposite order: `optimizer.step()` before `lr_scheduler.step()`. Failure to do this will result in PyTorch skipping the first value of the learning rate schedule. See more details at https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate\n",
|
|
||||||
" warnings.warn(\n"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"ename": "IndexError",
|
|
||||||
"evalue": "list index out of range",
|
|
||||||
"output_type": "error",
|
|
||||||
"traceback": [
|
|
||||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
|
||||||
"\u001b[31mIndexError\u001b[39m Traceback (most recent call last)",
|
|
||||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 383\u001b[39m\n\u001b[32m 381\u001b[39m txt_min_train_losses = text_batch_losses[:][\u001b[32m0\u001b[39m]\n\u001b[32m 382\u001b[39m txt_avg_train_losses = text_batch_losses[:][\u001b[32m1\u001b[39m]\n\u001b[32m--> \u001b[39m\u001b[32m383\u001b[39m txt_max_train_losses = \u001b[43mtext_batch_losses\u001b[49m\u001b[43m[\u001b[49m\u001b[43m:\u001b[49m\u001b[43m]\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m2\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[32m 385\u001b[39m txt_min_loss = \u001b[38;5;28mmin\u001b[39m(txt_min_train_losses)\n\u001b[32m 386\u001b[39m txt_avg_min_loss = \u001b[38;5;28msum\u001b[39m(txt_min_train_losses) / \u001b[38;5;28mlen\u001b[39m(txt_min_train_losses)\n",
|
|
||||||
"\u001b[31mIndexError\u001b[39m: list index out of range"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"source": [
|
|
||||||
"import random\n",
|
|
||||||
"import sys\n",
|
|
||||||
"import torch\n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"from pathlib import Path\n",
|
|
||||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
|
||||||
"import Project_Model.Libs.BPE as BPE\n",
|
|
||||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
|
||||||
"import Project_Model.Libs.TransformerUtils as TUtils\n",
|
|
||||||
"import Project_Model.Libs.TorchShims as torch_shims\n",
|
|
||||||
"import Project_Model.Libs.Batch as Batch\n",
|
|
||||||
"\n",
|
|
||||||
"# set a fixed seed\n",
|
|
||||||
"torch.manual_seed(0)\n",
|
|
||||||
"random.seed(0)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# set a default device\n",
|
|
||||||
"DEVICE = torch_shims.get_default_device()\n",
|
|
||||||
"torch.set_default_device(DEVICE)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# Get paths\n",
|
|
||||||
"VOCABULARY_PATH = Path(\"Assets/Model/small/bpe-small-16.json\")\n",
|
|
||||||
"TRAIN_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
|
||||||
"VALIDATION_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
|
||||||
"TEST_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
|
||||||
"CHECKPOINT_PATH = Path(\"Assets/Dataset/Tmp/NanoSocrates.zip\")\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# BPE Init\n",
|
|
||||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
|
||||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
|
||||||
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# Constants\n",
|
|
||||||
"MASK_EXTRA_SPACE = 25\n",
|
|
||||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE\n",
|
|
||||||
"EMBEDDED_SIZE = 256\n",
|
|
||||||
"FEED_FORWARD_MULTIPLIER = 4\n",
|
|
||||||
"ATTENTION_HEADS = 8\n",
|
|
||||||
"SENTENCE_LENGTH = 256\n",
|
|
||||||
"NUMBER_OF_BLOCKS = 4\n",
|
|
||||||
"MAX_EPOCHS = int(1e3)\n",
|
|
||||||
"PRETRAIN_EPOCHS = int(2)\n",
|
|
||||||
"WARMUP_EPOCHS = int(4e3)\n",
|
|
||||||
"MINI_BATCH_SIZE = 10\n",
|
|
||||||
"VALIDATION_STEPS = 1\n",
|
|
||||||
"CHECKPOINT_STEPS = VALIDATION_STEPS * 4\n",
|
|
||||||
"PATIENCE = 4\n",
|
|
||||||
"CURRENT_EPOCH = 0\n",
|
|
||||||
"\n",
|
|
||||||
"SOS_TOKEN = TOKENANO.encode(\"<SOS>\")[0]\n",
|
|
||||||
"\n",
|
|
||||||
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
|
||||||
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
|
||||||
"SUBJ_TOKEN = TOKENANO.encode(\"<SUBJ>\")[0]\n",
|
|
||||||
"REL_TOKEN = TOKENANO.encode(\"<PRED>\")[0]\n",
|
|
||||||
"OBJ_TOKEN = TOKENANO.encode(\"<OBJ>\")[0]\n",
|
|
||||||
"\n",
|
|
||||||
"SPECIAL_TOKENS: set[int] = set(TOKENANO.encode(\"\".join(BPE.default_special_tokens())))\n",
|
|
||||||
"ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])\n",
|
|
||||||
"FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# Spanned_Masker\n",
|
|
||||||
"MASKER = Transformer.SpannedMasker(\n",
|
|
||||||
" TOKEN_SPACE_SIZE,\n",
|
|
||||||
" FORBIDDEN_TOKENS\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"TRAIN_BATCHER = Batch.Batcher(\n",
|
|
||||||
" TRAIN_DATASET_PATH,\n",
|
|
||||||
" SENTENCE_LENGTH,\n",
|
|
||||||
" TOKENANO,\n",
|
|
||||||
" MASKER\n",
|
|
||||||
")\n",
|
|
||||||
"VALIDATION_BATCHER = Batch.Batcher(\n",
|
|
||||||
" VALIDATION_DATASET_PATH,\n",
|
|
||||||
" SENTENCE_LENGTH,\n",
|
|
||||||
" TOKENANO,\n",
|
|
||||||
" MASKER\n",
|
|
||||||
")\n",
|
|
||||||
"TEST_BATCHER = Batch.Batcher(\n",
|
|
||||||
" TEST_DATASET_PATH,\n",
|
|
||||||
" SENTENCE_LENGTH,\n",
|
|
||||||
" TOKENANO,\n",
|
|
||||||
" MASKER\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# Model\n",
|
|
||||||
"NANOSOCRATES = Transformer.TrainingModel(\n",
|
|
||||||
" TOKEN_SPACE_SIZE,\n",
|
|
||||||
" EMBEDDED_SIZE,\n",
|
|
||||||
" FEED_FORWARD_MULTIPLIER,\n",
|
|
||||||
" ATTENTION_HEADS,\n",
|
|
||||||
" NUMBER_OF_BLOCKS\n",
|
|
||||||
")\n",
|
|
||||||
"_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(\n",
|
|
||||||
" NANOSOCRATES,\n",
|
|
||||||
" TOKEN_SPACE_SIZE,\n",
|
|
||||||
" EMBEDDED_SIZE\n",
|
|
||||||
")\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"# Training constants\n",
|
|
||||||
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
|
||||||
"nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
|
|
||||||
"encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())\n",
|
|
||||||
"decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())\n",
|
|
||||||
"\n",
|
|
||||||
"nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
|
|
||||||
"encoder_only_scheduler = Transformer.WarmupLR(encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
|
|
||||||
"decoder_only_scheduler = Transformer.WarmupLR(decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)\n",
|
|
||||||
"\n",
|
|
||||||
"current_epoch = CURRENT_EPOCH\n",
|
|
||||||
"patience = 0\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"average_loss_validation = {\n",
|
|
||||||
" \"txt\": float(\"inf\"),\n",
|
|
||||||
" \"encoder_only\": float(\"inf\"),\n",
|
|
||||||
" \"decoder_only\": float(\"inf\")\n",
|
|
||||||
"}\n",
|
|
||||||
"\n",
|
|
||||||
"while current_epoch < MAX_EPOCHS:\n",
|
|
||||||
"\n",
|
|
||||||
" text_batch_losses = []\n",
|
|
||||||
" encoder_batch_losses = []\n",
|
|
||||||
" decoder_batch_losses = []\n",
|
|
||||||
"\n",
|
|
||||||
" for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):\n",
|
|
||||||
"\n",
|
|
||||||
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
|
|
||||||
"\n",
|
|
||||||
" enc_x = torch.tensor(src_x)\n",
|
|
||||||
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
|
|
||||||
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
|
|
||||||
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
|
|
||||||
" tgt = torch.tensor(tgt_y)\n",
|
|
||||||
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
|
|
||||||
"\n",
|
|
||||||
" # Task 1 and Task 2\n",
|
|
||||||
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS = []\n",
|
|
||||||
"\n",
|
|
||||||
" for token_idx in range(0, SENTENCE_LENGTH):\n",
|
|
||||||
"\n",
|
|
||||||
" nano_optim.zero_grad()\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = NANOSOCRATES((\n",
|
|
||||||
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
|
|
||||||
" ))\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = pred_logits[:, token_idx, :]\n",
|
|
||||||
"\n",
|
|
||||||
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
|
|
||||||
"\n",
|
|
||||||
" loss.backward()\n",
|
|
||||||
" nano_optim.step()\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS.append(\n",
|
|
||||||
" loss.item()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" if token_idx < SENTENCE_LENGTH - 1:\n",
|
|
||||||
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
|
|
||||||
"\n",
|
|
||||||
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
|
|
||||||
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
|
|
||||||
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
|
|
||||||
"\n",
|
|
||||||
" text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Pretrain first\n",
|
|
||||||
" if current_epoch < PRETRAIN_EPOCHS:\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Task 3\n",
|
|
||||||
" if tasktype == Batch.TaskType.MASKING:\n",
|
|
||||||
"\n",
|
|
||||||
" encoder_only_optim.zero_grad()\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = ENCODER_ONLY((\n",
|
|
||||||
" enc_x, enc_x_pad\n",
|
|
||||||
" ))\n",
|
|
||||||
"\n",
|
|
||||||
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
|
|
||||||
"\n",
|
|
||||||
" loss.backward()\n",
|
|
||||||
" encoder_only_optim.step()\n",
|
|
||||||
"\n",
|
|
||||||
" encoder_batch_losses.append(\n",
|
|
||||||
" loss.item()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Task 4\n",
|
|
||||||
" if tasktype == Batch.TaskType.COMPLETATION:\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS = []\n",
|
|
||||||
"\n",
|
|
||||||
" for token_idx in range(0, SENTENCE_LENGTH):\n",
|
|
||||||
"\n",
|
|
||||||
" decoder_only_optim.zero_grad()\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = DECODER_ONLY((\n",
|
|
||||||
" enc_x, enc_x_pad\n",
|
|
||||||
" ))\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = pred_logits[:, token_idx, :]\n",
|
|
||||||
"\n",
|
|
||||||
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
|
|
||||||
"\n",
|
|
||||||
" loss.backward()\n",
|
|
||||||
" decoder_only_optim.step()\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS.append(\n",
|
|
||||||
" loss.item()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" if token_idx < SENTENCE_LENGTH - 1:\n",
|
|
||||||
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" MIN_BATCH_LOSS = min(BATCH_LOSS)\n",
|
|
||||||
" MAX_BATCH_LOSS = max(BATCH_LOSS)\n",
|
|
||||||
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
|
|
||||||
"\n",
|
|
||||||
" decoder_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])\n",
|
|
||||||
"\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" nano_scheduler.step()\n",
|
|
||||||
" encoder_only_scheduler.step()\n",
|
|
||||||
" decoder_only_scheduler.step()\n",
|
|
||||||
"\n",
|
|
||||||
" current_epoch += 1\n",
|
|
||||||
"\n",
|
|
||||||
" if current_epoch % VALIDATION_STEPS == 0:\n",
|
|
||||||
"\n",
|
|
||||||
" txt_avg_batch_losses = []\n",
|
|
||||||
" enc_avg_batch_losses = []\n",
|
|
||||||
" dec_avg_batch_losses = []\n",
|
|
||||||
"\n",
|
|
||||||
" for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):\n",
|
|
||||||
"\n",
|
|
||||||
" src_x, tgt_y, pad_x, pad_y, tasktype = batch\n",
|
|
||||||
"\n",
|
|
||||||
" enc_x = torch.tensor(src_x)\n",
|
|
||||||
" enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)\n",
|
|
||||||
" dec_x = Transformer.get_decoder_input(MINI_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH)\n",
|
|
||||||
" dec_x_pad = dec_x.eq(PAD_TOKEN)\n",
|
|
||||||
" tgt = torch.tensor(tgt_y)\n",
|
|
||||||
" tgt_pad = torch.tensor(pad_y, dtype=torch.bool)\n",
|
|
||||||
"\n",
|
|
||||||
" # Task 1 and Task 2\n",
|
|
||||||
" if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS = []\n",
|
|
||||||
"\n",
|
|
||||||
" for token_idx in range(0, SENTENCE_LENGTH):\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = NANOSOCRATES((\n",
|
|
||||||
" enc_x, enc_x_pad, dec_x, dec_x_pad\n",
|
|
||||||
" ))\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = pred_logits[:, token_idx, :]\n",
|
|
||||||
"\n",
|
|
||||||
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS.append(\n",
|
|
||||||
" loss.item()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" if token_idx < SENTENCE_LENGTH - 1:\n",
|
|
||||||
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
|
|
||||||
" txt_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
|
|
||||||
"\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Pretrain first\n",
|
|
||||||
" if current_epoch < PRETRAIN_EPOCHS:\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Task 3\n",
|
|
||||||
" if tasktype == Batch.TaskType.MASKING:\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = ENCODER_ONLY((\n",
|
|
||||||
" enc_x, enc_x_pad\n",
|
|
||||||
" ))\n",
|
|
||||||
"\n",
|
|
||||||
" loss: torch.Tensor= cross_entropy(pred_logits, tgt)\n",
|
|
||||||
"\n",
|
|
||||||
" enc_avg_batch_losses.append(\n",
|
|
||||||
" loss.item()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Task 4\n",
|
|
||||||
" if tasktype == Batch.TaskType.COMPLETATION:\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS = []\n",
|
|
||||||
"\n",
|
|
||||||
" for token_idx in range(0, SENTENCE_LENGTH):\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = DECODER_ONLY((\n",
|
|
||||||
" enc_x, enc_x_pad\n",
|
|
||||||
" ))\n",
|
|
||||||
"\n",
|
|
||||||
" pred_logits = pred_logits[:, token_idx, :]\n",
|
|
||||||
"\n",
|
|
||||||
" loss: torch.Tensor= cross_entropy(pred_logits, tgt[:, token_idx])\n",
|
|
||||||
"\n",
|
|
||||||
" BATCH_LOSS.append(\n",
|
|
||||||
" loss.item()\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" if token_idx < SENTENCE_LENGTH - 1:\n",
|
|
||||||
" dec_x[:,token_idx + 1] = tgt[:, token_idx]\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE\n",
|
|
||||||
"\n",
|
|
||||||
" dec_avg_batch_losses.append(AVG_BATCH_LOSS)\n",
|
|
||||||
"\n",
|
|
||||||
" continue\n",
|
|
||||||
"\n",
|
|
||||||
" txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)\n",
|
|
||||||
" enc_avg_loss = float(\"inf\")\n",
|
|
||||||
" dec_avg_loss = float(\"inf\")\n",
|
|
||||||
"\n",
|
|
||||||
" if current_epoch >= PRETRAIN_EPOCHS:\n",
|
|
||||||
" enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)\n",
|
|
||||||
" dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)\n",
|
|
||||||
"\n",
|
|
||||||
" if current_epoch < PRETRAIN_EPOCHS:\n",
|
|
||||||
"\n",
|
|
||||||
" if txt_avg_loss < average_loss_validation[\"txt\"]:\n",
|
|
||||||
" average_loss_validation[\"txt\"] = txt_avg_loss\n",
|
|
||||||
" else:\n",
|
|
||||||
" patience += 1\n",
|
|
||||||
" else:\n",
|
|
||||||
"\n",
|
|
||||||
" counter = 0\n",
|
|
||||||
"\n",
|
|
||||||
" if txt_avg_loss > average_loss_validation[\"txt\"]:\n",
|
|
||||||
" counter += 1\n",
|
|
||||||
"\n",
|
|
||||||
" if txt_avg_loss > average_loss_validation[\"encoder_only\"]:\n",
|
|
||||||
" counter += 1\n",
|
|
||||||
"\n",
|
|
||||||
" if txt_avg_loss > average_loss_validation[\"decoder_only\"]:\n",
|
|
||||||
" counter += 1\n",
|
|
||||||
"\n",
|
|
||||||
" if counter > 1:\n",
|
|
||||||
" patience += 1\n",
|
|
||||||
"\n",
|
|
||||||
" txt_min_train_losses = text_batch_losses[:][0]\n",
|
|
||||||
" txt_avg_train_losses = text_batch_losses[:][1]\n",
|
|
||||||
" txt_max_train_losses = text_batch_losses[:][2]\n",
|
|
||||||
"\n",
|
|
||||||
" txt_min_loss = min(txt_min_train_losses)\n",
|
|
||||||
" txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)\n",
|
|
||||||
" txt_max_loss = max(txt_max_train_losses)\n",
|
|
||||||
" txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)\n",
|
|
||||||
" txt_avg_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)\n",
|
|
||||||
"\n",
|
|
||||||
" enc_avg_train_loss = float(\"inf\")\n",
|
|
||||||
"\n",
|
|
||||||
" dec_min_loss = float(\"inf\")\n",
|
|
||||||
" dec_avg_min_loss = float(\"inf\")\n",
|
|
||||||
" dec_max_loss = float(\"inf\")\n",
|
|
||||||
" dec_avg_max_loss = float(\"inf\")\n",
|
|
||||||
" dec_avg_loss = float(\"inf\")\n",
|
|
||||||
"\n",
|
|
||||||
" if current_epoch >= PRETRAIN_EPOCHS:\n",
|
|
||||||
" enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)\n",
|
|
||||||
"\n",
|
|
||||||
" dec_min_train_losses = decoder_batch_losses[:][0]\n",
|
|
||||||
" dec_avg_train_losses = decoder_batch_losses[:][1]\n",
|
|
||||||
" dec_max_train_losses = decoder_batch_losses[:][2]\n",
|
|
||||||
"\n",
|
|
||||||
" dec_min_loss = min(dec_min_train_losses)\n",
|
|
||||||
" dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)\n",
|
|
||||||
" dec_max_loss = max(dec_max_train_losses)\n",
|
|
||||||
" dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)\n",
|
|
||||||
" dec_avg_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" SEPARATOR = \"===========================================================================================\"\n",
|
|
||||||
" DEBUG_TEXT = \"\".join([\n",
|
|
||||||
" f\"{SEPARATOR}\\n\",\n",
|
|
||||||
" f\"EPOCH {current_epoch}\"\n",
|
|
||||||
" f\"{SEPARATOR}\\n\",\n",
|
|
||||||
" f\"Train Losses:\\n\"\n",
|
|
||||||
" f\"\\tMin Losses:\\n\"\n",
|
|
||||||
" f\"\\t\\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\\n\"\n",
|
|
||||||
" f\"\\t\\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\\n\"\n",
|
|
||||||
" f\"\\tMax Losses:\\n\"\n",
|
|
||||||
" f\"\\t\\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\\n\"\n",
|
|
||||||
" f\"\\t\\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\\n\"\n",
|
|
||||||
" f\"\\tAvg Losses:\\n\"\n",
|
|
||||||
" f\"\\t\\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\\n\"\n",
|
|
||||||
" f\"{SEPARATOR}\\n\",\n",
|
|
||||||
" f\"Validation Losses:\\n\"\n",
|
|
||||||
" f\"\\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\"\n",
|
|
||||||
" f\"{SEPARATOR}\\n\",\n",
|
|
||||||
" ])\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" # Warn about patience\n",
|
|
||||||
" if patience == PATIENCE:\n",
|
|
||||||
" print(\n",
|
|
||||||
" \"Model is likely overfitting, so let's stop here\"\n",
|
|
||||||
" )\n",
|
|
||||||
"\n",
|
|
||||||
" # SAVE MODEL\n",
|
|
||||||
" if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:\n",
|
|
||||||
" print(f\"Saving model at {CHECKPOINT_PATH.as_posix()}\")\n",
|
|
||||||
" torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
"\n"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "deep_learning",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.13.7"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
||||||
@@ -1,433 +0,0 @@
|
|||||||
import random
|
|
||||||
import sys
|
|
||||||
import torch
|
|
||||||
import pandas as pd
|
|
||||||
from pathlib import Path
|
|
||||||
import Project_Model.Libs.Embedder as Embedder
|
|
||||||
import Project_Model.Libs.BPE as BPE
|
|
||||||
import Project_Model.Libs.Transformer as Transformer
|
|
||||||
import Project_Model.Libs.TransformerUtils as TUtils
|
|
||||||
import Project_Model.Libs.TorchShims as torch_shims
|
|
||||||
import Project_Model.Libs.Batch as Batch
|
|
||||||
|
|
||||||
# set a fixed seed
|
|
||||||
torch.manual_seed(0)
|
|
||||||
random.seed(0)
|
|
||||||
|
|
||||||
|
|
||||||
# set a default device
|
|
||||||
DEVICE = torch_shims.get_default_device()
|
|
||||||
torch.set_default_device(DEVICE)
|
|
||||||
|
|
||||||
|
|
||||||
# Get paths
|
|
||||||
VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
|
|
||||||
TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
|
|
||||||
VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
|
|
||||||
TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
|
|
||||||
CHECKPOINT_PATH = Path("Assets/Dataset/Tmp/NanoSocrates.zip")
|
|
||||||
|
|
||||||
|
|
||||||
# BPE Init
|
|
||||||
SPECIAL_VOC = BPE.default_special_tokens()
|
|
||||||
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
|
|
||||||
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
|
|
||||||
|
|
||||||
|
|
||||||
# Constants
|
|
||||||
MASK_EXTRA_SPACE = 100
|
|
||||||
REAL_TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
|
|
||||||
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + MASK_EXTRA_SPACE
|
|
||||||
EMBEDDED_SIZE = 256
|
|
||||||
FEED_FORWARD_MULTIPLIER = 4
|
|
||||||
ATTENTION_HEADS = 8
|
|
||||||
SENTENCE_LENGTH = 256
|
|
||||||
NUMBER_OF_BLOCKS = 4
|
|
||||||
MAX_EPOCHS = int(1e3)
|
|
||||||
PRETRAIN_EPOCHS = int(10)
|
|
||||||
WARMUP_EPOCHS = int(4e3)
|
|
||||||
MINI_BATCH_SIZE = 100
|
|
||||||
VALIDATION_STEPS = 5
|
|
||||||
CHECKPOINT_STEPS = VALIDATION_STEPS * 4
|
|
||||||
PATIENCE = 4
|
|
||||||
CURRENT_EPOCH = 0
|
|
||||||
|
|
||||||
SOS_TOKEN = TOKENANO.encode("<SOS>")[0]
|
|
||||||
|
|
||||||
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
|
|
||||||
END_TOKEN = TOKENANO.encode("<END>")[0]
|
|
||||||
SUBJ_TOKEN = TOKENANO.encode("<SUBJ>")[0]
|
|
||||||
REL_TOKEN = TOKENANO.encode("<PRED>")[0]
|
|
||||||
OBJ_TOKEN = TOKENANO.encode("<OBJ>")[0]
|
|
||||||
|
|
||||||
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(BPE.default_special_tokens())))
|
|
||||||
ALLOWED_TOKENS = set([SUBJ_TOKEN, REL_TOKEN, OBJ_TOKEN])
|
|
||||||
FORBIDDEN_TOKENS = SPECIAL_TOKENS - ALLOWED_TOKENS
|
|
||||||
|
|
||||||
|
|
||||||
# Spanned_Masker
|
|
||||||
MASKER = Transformer.SpannedMasker(REAL_TOKEN_SPACE_SIZE, FORBIDDEN_TOKENS)
|
|
||||||
|
|
||||||
TRAIN_BATCHER = Batch.Batcher(TRAIN_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
|
|
||||||
VALIDATION_BATCHER = Batch.Batcher(
|
|
||||||
VALIDATION_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER
|
|
||||||
)
|
|
||||||
TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKER)
|
|
||||||
|
|
||||||
|
|
||||||
# Model
|
|
||||||
NANOSOCRATES = Transformer.TrainingModel(
|
|
||||||
TOKEN_SPACE_SIZE,
|
|
||||||
EMBEDDED_SIZE,
|
|
||||||
FEED_FORWARD_MULTIPLIER,
|
|
||||||
ATTENTION_HEADS,
|
|
||||||
NUMBER_OF_BLOCKS,
|
|
||||||
)
|
|
||||||
_, ENCODER_ONLY, DECODER_ONLY = TUtils.decompose_nano_socrates(
|
|
||||||
NANOSOCRATES, TOKEN_SPACE_SIZE, EMBEDDED_SIZE
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Training constants
|
|
||||||
nano_cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
|
|
||||||
encoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
|
|
||||||
decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
|
|
||||||
nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters())
|
|
||||||
encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters())
|
|
||||||
decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters())
|
|
||||||
|
|
||||||
nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)
|
|
||||||
encoder_only_scheduler = Transformer.WarmupLR(
|
|
||||||
encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
|
|
||||||
)
|
|
||||||
decoder_only_scheduler = Transformer.WarmupLR(
|
|
||||||
decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
|
|
||||||
)
|
|
||||||
|
|
||||||
current_epoch = CURRENT_EPOCH
|
|
||||||
patience = 0
|
|
||||||
|
|
||||||
|
|
||||||
average_loss_validation = {
|
|
||||||
"txt": float("inf"),
|
|
||||||
"encoder_only": float("inf"),
|
|
||||||
"decoder_only": float("inf"),
|
|
||||||
}
|
|
||||||
|
|
||||||
while current_epoch < MAX_EPOCHS:
|
|
||||||
|
|
||||||
NANOSOCRATES.train()
|
|
||||||
ENCODER_ONLY.train()
|
|
||||||
DECODER_ONLY.train()
|
|
||||||
|
|
||||||
text_batch_losses = []
|
|
||||||
encoder_batch_losses = []
|
|
||||||
decoder_batch_losses = []
|
|
||||||
|
|
||||||
batch_counter = 0
|
|
||||||
|
|
||||||
print(f"EPOCH {current_epoch} STARTING")
|
|
||||||
|
|
||||||
for batch in TRAIN_BATCHER.batch(MINI_BATCH_SIZE):
|
|
||||||
|
|
||||||
batch_counter += 1
|
|
||||||
|
|
||||||
src_x, tgt_y, pad_x, pad_y, tasktype = batch
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
enc_x = torch.tensor(src_x)
|
|
||||||
|
|
||||||
ACTUAL_BATCH_SIZE, _ = enc_x.shape
|
|
||||||
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
|
|
||||||
dec_x = Transformer.get_decoder_input(
|
|
||||||
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
|
|
||||||
)
|
|
||||||
dec_x_pad = dec_x.eq(PAD_TOKEN)
|
|
||||||
tgt = torch.tensor(tgt_y)
|
|
||||||
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
|
|
||||||
|
|
||||||
print(f"\tBATCH {batch_counter} Starting")
|
|
||||||
|
|
||||||
# Task 1 and Task 2
|
|
||||||
if tasktype == Batch.TaskType.RDF2TXT or tasktype == Batch.TaskType.TEXT2RDF:
|
|
||||||
|
|
||||||
print(f"\tExecuting TASK 1 or 2 - BATCH {batch_counter}")
|
|
||||||
|
|
||||||
BATCH_LOSS = []
|
|
||||||
|
|
||||||
|
|
||||||
for token_idx in range(0, SENTENCE_LENGTH):
|
|
||||||
|
|
||||||
nano_optim.zero_grad()
|
|
||||||
|
|
||||||
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
|
|
||||||
|
|
||||||
pred_logits = pred_logits[:, token_idx, :]
|
|
||||||
|
|
||||||
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt[:, token_idx])
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
nano_optim.step()
|
|
||||||
|
|
||||||
BATCH_LOSS.append(loss.item())
|
|
||||||
|
|
||||||
if token_idx < SENTENCE_LENGTH - 1:
|
|
||||||
dec_x[:, token_idx + 1] = tgt[:, token_idx]
|
|
||||||
|
|
||||||
MIN_BATCH_LOSS = min(BATCH_LOSS)
|
|
||||||
MAX_BATCH_LOSS = max(BATCH_LOSS)
|
|
||||||
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
|
|
||||||
|
|
||||||
text_batch_losses.append([MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS])
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Pretrain first
|
|
||||||
if current_epoch < PRETRAIN_EPOCHS:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 3
|
|
||||||
if tasktype == Batch.TaskType.MASKING:
|
|
||||||
|
|
||||||
print(f"\tExecuting TASK 3 - BATCH {batch_counter}")
|
|
||||||
|
|
||||||
encoder_only_optim.zero_grad()
|
|
||||||
|
|
||||||
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
print(torch.max(tgt))
|
|
||||||
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
encoder_only_optim.step()
|
|
||||||
|
|
||||||
encoder_batch_losses.append(loss.item())
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 4
|
|
||||||
if tasktype == Batch.TaskType.COMPLETATION:
|
|
||||||
|
|
||||||
print(f"\tExecuting TASK 4 - BATCH {batch_counter}")
|
|
||||||
|
|
||||||
BATCH_LOSS = []
|
|
||||||
|
|
||||||
for token_idx in range(0, SENTENCE_LENGTH):
|
|
||||||
|
|
||||||
decoder_only_optim.zero_grad()
|
|
||||||
|
|
||||||
pred_logits = DECODER_ONLY((enc_x, enc_x_pad))
|
|
||||||
|
|
||||||
pred_logits = pred_logits[:, token_idx, :]
|
|
||||||
|
|
||||||
loss: torch.Tensor = decoder_ce(pred_logits, tgt[:, token_idx])
|
|
||||||
|
|
||||||
loss.backward()
|
|
||||||
decoder_only_optim.step()
|
|
||||||
|
|
||||||
BATCH_LOSS.append(loss.item())
|
|
||||||
|
|
||||||
if token_idx < SENTENCE_LENGTH - 1:
|
|
||||||
dec_x[:, token_idx + 1] = tgt[:, token_idx]
|
|
||||||
|
|
||||||
MIN_BATCH_LOSS = min(BATCH_LOSS)
|
|
||||||
MAX_BATCH_LOSS = max(BATCH_LOSS)
|
|
||||||
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
|
|
||||||
|
|
||||||
decoder_batch_losses.append(
|
|
||||||
[MIN_BATCH_LOSS, AVG_BATCH_LOSS, MAX_BATCH_LOSS]
|
|
||||||
)
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
nano_scheduler.step()
|
|
||||||
encoder_only_scheduler.step()
|
|
||||||
decoder_only_scheduler.step()
|
|
||||||
|
|
||||||
current_epoch += 1
|
|
||||||
|
|
||||||
if current_epoch % VALIDATION_STEPS == 0:
|
|
||||||
|
|
||||||
NANOSOCRATES.eval()
|
|
||||||
ENCODER_ONLY.eval()
|
|
||||||
DECODER_ONLY.eval()
|
|
||||||
|
|
||||||
txt_avg_batch_losses = []
|
|
||||||
enc_avg_batch_losses = []
|
|
||||||
dec_avg_batch_losses = []
|
|
||||||
|
|
||||||
for batch in VALIDATION_BATCHER.batch(MINI_BATCH_SIZE):
|
|
||||||
|
|
||||||
src_x, tgt_y, pad_x, pad_y, tasktype = batch
|
|
||||||
|
|
||||||
enc_x = torch.tensor(src_x)
|
|
||||||
|
|
||||||
ACTUAL_BATCH_SIZE, _, _ = enc_x.shape
|
|
||||||
enc_x_pad = torch.tensor(pad_x, dtype=torch.bool)
|
|
||||||
dec_x = Transformer.get_decoder_input(
|
|
||||||
ACTUAL_BATCH_SIZE, SOS_TOKEN, PAD_TOKEN, SENTENCE_LENGTH
|
|
||||||
)
|
|
||||||
|
|
||||||
dec_x_pad = dec_x.eq(PAD_TOKEN)
|
|
||||||
tgt = torch.tensor(tgt_y)
|
|
||||||
tgt_pad = torch.tensor(pad_y, dtype=torch.bool)
|
|
||||||
|
|
||||||
# Task 1 and Task 2
|
|
||||||
if (
|
|
||||||
tasktype == Batch.TaskType.RDF2TXT
|
|
||||||
or tasktype == Batch.TaskType.TEXT2RDF
|
|
||||||
):
|
|
||||||
|
|
||||||
BATCH_LOSS = []
|
|
||||||
|
|
||||||
for token_idx in range(0, SENTENCE_LENGTH):
|
|
||||||
|
|
||||||
pred_logits = NANOSOCRATES((enc_x, enc_x_pad, dec_x, dec_x_pad))
|
|
||||||
|
|
||||||
pred_logits = pred_logits[:, token_idx, :]
|
|
||||||
|
|
||||||
loss: torch.Tensor = nano_cross_entropy(pred_logits, tgt[:, token_idx])
|
|
||||||
|
|
||||||
BATCH_LOSS.append(loss.item())
|
|
||||||
|
|
||||||
if token_idx < SENTENCE_LENGTH - 1:
|
|
||||||
dec_x[:, token_idx + 1] = tgt[:, token_idx]
|
|
||||||
|
|
||||||
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
|
|
||||||
txt_avg_batch_losses.append(AVG_BATCH_LOSS)
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Pretrain first
|
|
||||||
if current_epoch < PRETRAIN_EPOCHS:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 3
|
|
||||||
if tasktype == Batch.TaskType.MASKING:
|
|
||||||
|
|
||||||
pred_logits = ENCODER_ONLY((enc_x, enc_x_pad))
|
|
||||||
pred_logits = pred_logits.permute(0, 2, 1)
|
|
||||||
|
|
||||||
loss: torch.Tensor = encoder_ce(pred_logits, tgt)
|
|
||||||
|
|
||||||
enc_avg_batch_losses.append(loss.item())
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Task 4
|
|
||||||
if tasktype == Batch.TaskType.COMPLETATION:
|
|
||||||
|
|
||||||
BATCH_LOSS = []
|
|
||||||
|
|
||||||
for token_idx in range(0, SENTENCE_LENGTH):
|
|
||||||
|
|
||||||
pred_logits = DECODER_ONLY((enc_x, enc_x_pad))
|
|
||||||
|
|
||||||
pred_logits = pred_logits[:, token_idx, :]
|
|
||||||
|
|
||||||
loss: torch.Tensor = decoder_ce(pred_logits, tgt[:, token_idx])
|
|
||||||
|
|
||||||
BATCH_LOSS.append(loss.item())
|
|
||||||
|
|
||||||
if token_idx < SENTENCE_LENGTH - 1:
|
|
||||||
dec_x[:, token_idx + 1] = tgt[:, token_idx]
|
|
||||||
|
|
||||||
AVG_BATCH_LOSS = sum(BATCH_LOSS) / MINI_BATCH_SIZE
|
|
||||||
|
|
||||||
dec_avg_batch_losses.append(AVG_BATCH_LOSS)
|
|
||||||
|
|
||||||
continue
|
|
||||||
|
|
||||||
txt_avg_loss = sum(txt_avg_batch_losses) / len(txt_avg_batch_losses)
|
|
||||||
enc_avg_loss = float("inf")
|
|
||||||
dec_avg_loss = float("inf")
|
|
||||||
|
|
||||||
if current_epoch >= PRETRAIN_EPOCHS:
|
|
||||||
enc_avg_loss = sum(enc_avg_batch_losses) / len(enc_avg_batch_losses)
|
|
||||||
dec_avg_loss = sum(dec_avg_batch_losses) / len(dec_avg_batch_losses)
|
|
||||||
|
|
||||||
if current_epoch < PRETRAIN_EPOCHS:
|
|
||||||
|
|
||||||
if txt_avg_loss < average_loss_validation["txt"]:
|
|
||||||
average_loss_validation["txt"] = txt_avg_loss
|
|
||||||
else:
|
|
||||||
patience += 1
|
|
||||||
else:
|
|
||||||
|
|
||||||
counter = 0
|
|
||||||
|
|
||||||
if txt_avg_loss > average_loss_validation["txt"]:
|
|
||||||
counter += 1
|
|
||||||
|
|
||||||
if txt_avg_loss > average_loss_validation["encoder_only"]:
|
|
||||||
counter += 1
|
|
||||||
|
|
||||||
if txt_avg_loss > average_loss_validation["decoder_only"]:
|
|
||||||
counter += 1
|
|
||||||
|
|
||||||
if counter > 1:
|
|
||||||
patience += 1
|
|
||||||
|
|
||||||
txt_min_train_losses = [row[0] for row in text_batch_losses]
|
|
||||||
txt_avg_train_losses = [row[1] for row in text_batch_losses]
|
|
||||||
txt_max_train_losses = [row[2] for row in text_batch_losses]
|
|
||||||
|
|
||||||
txt_min_loss = min(txt_min_train_losses)
|
|
||||||
txt_avg_min_loss = sum(txt_min_train_losses) / len(txt_min_train_losses)
|
|
||||||
txt_max_loss = max(txt_max_train_losses)
|
|
||||||
txt_avg_max_loss = sum(txt_max_train_losses) / len(txt_max_train_losses)
|
|
||||||
txt_avg_loss = sum(txt_avg_train_losses) / len(txt_avg_train_losses)
|
|
||||||
|
|
||||||
enc_avg_train_loss = float("inf")
|
|
||||||
|
|
||||||
dec_min_loss = float("inf")
|
|
||||||
dec_avg_min_loss = float("inf")
|
|
||||||
dec_max_loss = float("inf")
|
|
||||||
dec_avg_max_loss = float("inf")
|
|
||||||
dec_avg_loss = float("inf")
|
|
||||||
|
|
||||||
if current_epoch >= PRETRAIN_EPOCHS:
|
|
||||||
enc_avg_train_loss = sum(encoder_batch_losses) / len(encoder_batch_losses)
|
|
||||||
|
|
||||||
dec_min_train_losses = [row[0] for row in decoder_batch_losses]
|
|
||||||
dec_avg_train_losses = [row[1] for row in decoder_batch_losses]
|
|
||||||
dec_max_train_losses = [row[2] for row in decoder_batch_losses]
|
|
||||||
|
|
||||||
dec_min_loss = min(dec_min_train_losses)
|
|
||||||
dec_avg_min_loss = sum(dec_min_train_losses) / len(dec_min_train_losses)
|
|
||||||
dec_max_loss = max(dec_max_train_losses)
|
|
||||||
dec_avg_max_loss = sum(dec_max_train_losses) / len(dec_max_train_losses)
|
|
||||||
dec_avg_loss = sum(dec_avg_train_losses) / len(dec_avg_train_losses)
|
|
||||||
|
|
||||||
SEPARATOR = "================================================================================================================"
|
|
||||||
DEBUG_TEXT = "".join(
|
|
||||||
[
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
f"EPOCH {current_epoch}\n",
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
f"Train Losses:\n",
|
|
||||||
f"\tMin Losses:\n",
|
|
||||||
f"\t\tmin_txt: {txt_min_loss} - avg_txt: {txt_avg_min_loss}\n",
|
|
||||||
f"\t\tmin_dec: {dec_min_loss} - avg_dec: {dec_avg_min_loss}\n",
|
|
||||||
f"\tMax Losses:\n",
|
|
||||||
f"\t\tmax_txt: {txt_max_loss} - avg_txt: {txt_avg_max_loss}\n",
|
|
||||||
f"\t\tmax_dec: {dec_min_loss} - avg_dec: {dec_avg_max_loss}\n",
|
|
||||||
f"\tAvg Losses:\n",
|
|
||||||
f"\t\tavg_txt: {txt_avg_loss} - avg_enc: {enc_avg_loss} - avg_dec: {dec_avg_loss}\n",
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
f"Validation Losses:\n",
|
|
||||||
f"\ttxt_loss: {txt_avg_loss} - masking_loss: {enc_avg_loss} - prediction: {dec_avg_loss}\n",
|
|
||||||
f"{SEPARATOR}\n",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
print(DEBUG_TEXT)
|
|
||||||
|
|
||||||
# Warn about patience
|
|
||||||
if patience == PATIENCE:
|
|
||||||
print("Model is likely overfitting, so let's stop here")
|
|
||||||
|
|
||||||
# SAVE MODEL
|
|
||||||
if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
|
|
||||||
print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
|
|
||||||
torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
|
|
||||||
@@ -1,177 +0,0 @@
|
|||||||
import random
|
|
||||||
import time
|
|
||||||
import torch
|
|
||||||
import pandas as pd
|
|
||||||
from pathlib import Path
|
|
||||||
import Project_Model.Libs.Embedder as Embedder
|
|
||||||
import Project_Model.Libs.BPE as BPE
|
|
||||||
import Project_Model.Libs.Transformer as Transformer
|
|
||||||
import Project_Model.Libs.TorchShims as torch_shims
|
|
||||||
|
|
||||||
# set a fixed seed
|
|
||||||
torch.manual_seed(0)
|
|
||||||
random.seed(0)
|
|
||||||
DEVICE = torch_shims.get_default_device()
|
|
||||||
torch.set_default_device(DEVICE)
|
|
||||||
|
|
||||||
# set a default device
|
|
||||||
|
|
||||||
# BPE Init
|
|
||||||
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
|
|
||||||
SPECIAL_VOC = BPE.default_special_tokens()
|
|
||||||
|
|
||||||
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
|
|
||||||
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
|
|
||||||
|
|
||||||
|
|
||||||
# Constants
|
|
||||||
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
|
|
||||||
EMBEDDED_SIZE = 256
|
|
||||||
FEED_FORWARD_MULTIPLIER = 4
|
|
||||||
ATTENTION_HEADS = 8
|
|
||||||
SENTENCE_LENGTH = 256
|
|
||||||
NUMBER_OF_BLOCKS = 4
|
|
||||||
MAX_EPOCHS = int(1e3)
|
|
||||||
|
|
||||||
|
|
||||||
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
|
|
||||||
END_TOKEN = TOKENANO.encode("<END>")[0]
|
|
||||||
|
|
||||||
|
|
||||||
# Load CSV
|
|
||||||
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
|
|
||||||
|
|
||||||
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)
|
|
||||||
|
|
||||||
TOY_BATCH_INPUT_LIST: list[list[int]] = []
|
|
||||||
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
|
|
||||||
TOY_BATCH_TARGET_LIST: list[list[int]] = []
|
|
||||||
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []
|
|
||||||
|
|
||||||
|
|
||||||
for index, row in TOY_DATASET.iterrows():
|
|
||||||
|
|
||||||
RDFs: str = row["RDFs"]
|
|
||||||
Abstract: str = row["Abstract"]
|
|
||||||
|
|
||||||
input_tokens = TOKENANO.encode(RDFs)
|
|
||||||
output_tokens = TOKENANO.encode(Abstract)[1:]
|
|
||||||
decoder_default_tokens = TOKENANO.encode("<SOS>")
|
|
||||||
|
|
||||||
input_tokens, padding = Transformer.normalize_sequence(
|
|
||||||
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
|
|
||||||
)
|
|
||||||
output_tokens, _ = Transformer.normalize_sequence(
|
|
||||||
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
|
|
||||||
)
|
|
||||||
decoder_default_tokens, _ = Transformer.normalize_sequence(
|
|
||||||
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
|
|
||||||
)
|
|
||||||
|
|
||||||
TOY_BATCH_INPUT_LIST.append(input_tokens)
|
|
||||||
TOY_BATCH_PADDING_LIST.append(padding)
|
|
||||||
TOY_BATCH_TARGET_LIST.append(output_tokens)
|
|
||||||
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
|
|
||||||
|
|
||||||
output_tokens = TOKENANO.encode(RDFs)
|
|
||||||
input_tokens = TOKENANO.encode(Abstract)[1:]
|
|
||||||
decoder_default_tokens = TOKENANO.encode("<SOS>")
|
|
||||||
|
|
||||||
input_tokens, padding = Transformer.normalize_sequence(
|
|
||||||
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
|
|
||||||
)
|
|
||||||
output_tokens, _ = Transformer.normalize_sequence(
|
|
||||||
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
|
|
||||||
)
|
|
||||||
decoder_default_tokens, _ = Transformer.normalize_sequence(
|
|
||||||
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN, False
|
|
||||||
)
|
|
||||||
|
|
||||||
TOY_BATCH_INPUT_LIST.append(input_tokens)
|
|
||||||
TOY_BATCH_PADDING_LIST.append(padding)
|
|
||||||
TOY_BATCH_TARGET_LIST.append(output_tokens)
|
|
||||||
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
|
|
||||||
|
|
||||||
# Training loop
LOSS_HISTORY = []

NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Transformer.WarmupLR(optimizer, 4000, EMBEDDED_SIZE)
last_loss = 0
current_epoch = 0

while current_epoch < MAX_EPOCHS:

    optimizer.zero_grad()

    encoder_list = torch.tensor(TOY_BATCH_INPUT_LIST[:])
    decoder_list = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:])
    src_padding = torch.tensor(TOY_BATCH_PADDING_LIST[:], dtype=torch.bool)

    # Transform target into logits
    target_logits = torch.tensor(TOY_BATCH_TARGET_LIST[:])

    last_loss = 0
    last_prediction: torch.Tensor

    LOSS_HISTORY = []

    start = time.time_ns()  # NOTE: needs `import time` at the top of the script

    # Teacher-forced, position-by-position pass over the whole sentence
    for i in range(0, SENTENCE_LENGTH):

        optimizer.zero_grad()
        tgt_padding = decoder_list.eq(PAD_TOKEN)

        logits: torch.Tensor = NANOSOCRATES(
            (encoder_list, src_padding, decoder_list, tgt_padding)
        )
        prob = torch.softmax(logits, 2)

        most_probable_tokens = torch.argmax(prob, 2)
        last_prediction = most_probable_tokens

        logits = logits[:, i, :]
        # logits = logits.permute(0, 2, 1)

        loss: torch.Tensor = cross_entropy(logits, target_logits[:, i])
        LOSS_HISTORY.append(loss.item())
        # loss : torch.Tensor = cross_entropy(logits, target_logits[:, 0:i])
        # loss : torch.Tensor = cross_entropy(logits, target_logits)

        last_loss = loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Feed the ground-truth token into the next decoder slot (teacher forcing)
        if i < SENTENCE_LENGTH - 1:
            decoder_list[:, i + 1] = target_logits[:, i]

    current_epoch += 1

    end = time.time_ns()

    if current_epoch % 1 == 0:  # log every epoch
        MIN_LOSS = min(LOSS_HISTORY)
        MAX_LOSS = max(LOSS_HISTORY)
        AVERAGE_LOSS = sum(LOSS_HISTORY) / len(LOSS_HISTORY)
        print(f"EPOCH {current_epoch}\n\tTime: {(end-start)/1E9}s\n\tLoss: {last_loss}")
        print(f"\tMin Loss: {MIN_LOSS}\tAvg Loss: {AVERAGE_LOSS}\tMax Loss: {MAX_LOSS}\n")

        # for encoded_sentence, expected_sentence in zip(
        #     Transformer.tensor2token(last_prediction[:, :], END_TOKEN),  # type: ignore
        #     Transformer.tensor2token(target_logits[:, :], END_TOKEN),
        # ):
        #     decoded_sentence = TOKENANO.decode(encoded_sentence)
        #     decoded_target = TOKENANO.decode(expected_sentence)
        #     print(
        #         f"\tACTUAL:\n\t\t{decoded_sentence}\n\tEXPECTED:\n\t\t{decoded_target}\n"
        #     )
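The per-position loss above relies on CrossEntropyLoss with ignore_index set to the PAD id, so padded target positions contribute no gradient. A minimal, self-contained sketch of that behaviour with toy tensors (values and vocabulary size are made up for illustration):

import torch

PAD = 0
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD)

logits = torch.randn(3, 5)            # batch of 3, toy vocabulary of 5
targets = torch.tensor([2, PAD, 4])   # the PAD target is skipped entirely

print(loss_fn(logits, targets))       # averaged only over the non-PAD targets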
@@ -189,7 +189,7 @@ class NanoSocratesBPE(Encoder):
            token_stack.appendleft(right_token)
            token_stack.appendleft(left_token)

-       return UTF_8_STRING_ARR.decode("utf-8", errors="ignore")
+       return UTF_8_STRING_ARR.decode("utf-8")

    def __token_decode(self, token_id: int) -> tuple[int, int]:
@@ -31,7 +31,7 @@ class TokeNanoCore:
    def vocabulary_size(self):
        BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
        SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
-       return BPE_VOC_SIZE + SPECIAL_VOC_SIZE + 1
+       return BPE_VOC_SIZE + SPECIAL_VOC_SIZE

    def encode(self, corpus: str) -> list[int]:
        output: list[int] = []
|||||||
11 Project_Model/Libs/Batch/Classes/BatchEmbedder.py Normal file
@@ -0,0 +1,11 @@
from ....Libs.Embedder.Classes.NanoSocratesEmbedder import NanoSocratesEmbedder
import torch


class BatchEmbedder(torch.nn.Module):

    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        super().__init__()
        self.__embedder = NanoSocratesEmbedder(vocabulary_size, embedding_size)


    def forward(self, )
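The forward signature above is left unfinished in this commit. A minimal sketch of what a batched forward might look like, assuming the wrapper only pushes a [batch, seq_len] tensor of token ids through the wrapped embedder (names and the stand-in Embedding layer are assumptions, not part of the diff):

import torch

class BatchEmbedderSketch(torch.nn.Module):
    # Hypothetical completion; torch.nn.Embedding stands in for NanoSocratesEmbedder.
    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        super().__init__()
        self._embedder = torch.nn.Embedding(vocabulary_size, embedding_size)

    def forward(self, token_batch: torch.Tensor) -> torch.Tensor:
        # token_batch: [batch_size, seq_len] -> [batch_size, seq_len, embedding_size]
        return self._embedder(token_batch)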
@@ -1,107 +1,50 @@
|
|||||||
import random
|
import random
|
||||||
import sys
|
from typing import Generator
|
||||||
from typing import Any, Generator
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pathlib import Path
|
|
||||||
from ..Enums import TaskType
|
|
||||||
import Project_Model.Libs.BPE as BPE
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
# from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
|
||||||
from Project_Model.Libs.Transformer import (
|
from TokenCompletation import TokenCompletationTransformer
|
||||||
SpannedMasker,
|
from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
|
||||||
truncate_rdf_list,
|
|
||||||
normalize_sequence,
|
|
||||||
)
|
|
||||||
|
|
||||||
from Project_Model.Libs.BPE import SpecialToken
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class Batcher:
|
class Batcher:
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
|
||||||
self,
|
|
||||||
dataset_path: Path,
|
|
||||||
max_length: int,
|
|
||||||
tokenizer: BPE.TokeNanoCore,
|
|
||||||
masker: SpannedMasker,
|
|
||||||
seed: int = 0,
|
|
||||||
debug = False
|
|
||||||
) -> None:
|
|
||||||
# ABSTRACT, TRIPLE
|
# ABSTRACT, TRIPLE
|
||||||
# tasks:
|
# tasks:
|
||||||
# rdf2text: X: TRIPLE, Y: ABSTRACT
|
# rdf2text: X: TRIPLE, Y: ABSTRACT
|
||||||
# text2rdf: X: ABSTRACT, X:TRIPLE
|
# text2rdf: X: ABSTRACT, X:TRIPLE
|
||||||
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
|
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
|
||||||
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
|
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
|
||||||
# it will truncate
|
|
||||||
# it will instantiate spanmaskter and truncator
|
|
||||||
self._dataset_path = dataset_path
|
self._dataset_path = dataset_path
|
||||||
|
self._batch_size = batch_size
|
||||||
self._tokenizer = tokenizer
|
self._tokenizer = tokenizer
|
||||||
self._masker = masker
|
self._masker = masker
|
||||||
self.__max_length = max_length
|
|
||||||
self._seed = seed
|
|
||||||
# self._token_completation = TokenCompletationTransformer(sotl,eos)
|
|
||||||
self._completation_task_token_truncator = truncate_rdf_list
|
|
||||||
self.__debug = debug
|
|
||||||
|
|
||||||
def batch(self, batch_size) -> Generator[
|
sotl = self._tokenizer.encode(SpecialToken.START_TRIPLE_LIST.value)
|
||||||
tuple[
|
eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
|
||||||
list[list[int]],
|
self._token_completation = TokenCompletationTransformer(sotl,eos)
|
||||||
list[list[int]],
|
|
||||||
list[list[int]],
|
|
||||||
list[list[int]],
|
|
||||||
TaskType
|
|
||||||
],
|
|
||||||
Any,
|
|
||||||
Any,
|
|
||||||
]:
|
|
||||||
"""
|
|
||||||
Yields: X,Y,padding_X
|
|
||||||
"""
|
|
||||||
RNG = random.Random(self._seed)
|
|
||||||
self._masker.reseed(self._seed)
|
|
||||||
|
|
||||||
for batch in pd.read_csv(self._dataset_path, chunksize=batch_size):
|
|
||||||
|
def get_batch(self)-> Generator[pd.DataFrame]:
|
||||||
|
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
|
||||||
|
|
||||||
tokenized_batch = pd.DataFrame()
|
tokenized_batch = pd.DataFrame()
|
||||||
# encode
|
tokenized_batch[["Abstract","RDFs"]] = (
|
||||||
tokenized_batch[["Abstract", "RDFs"]] = batch[["Abstract", "RDFs"]].map(
|
batch[["Abstract","RDFs"]]
|
||||||
lambda t: self._tokenizer.encode(t)
|
.map(lambda t: self._tokenizer.encode(t))
|
||||||
)
|
)
|
||||||
|
|
||||||
X, Y, padding_X, padding_Y = self.__rdf2txt_transformation(tokenized_batch)
|
rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
|
||||||
yield X, Y, padding_X, padding_Y, TaskType.RDF2TXT
|
txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
|
||||||
(
|
mask_batch = self.__masking_trasformation(tokenized_batch)
|
||||||
X,
|
completation_batch = self.__token_completation_task(tokenized_batch)
|
||||||
Y,
|
|
||||||
padding_X,
|
output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True)
|
||||||
padding_Y,
|
output = output.sample(frac=1).reset_index(drop=True)
|
||||||
) = self.__txt2rdf_transformation(tokenized_batch)
|
yield output
|
||||||
yield X, Y, padding_X, padding_Y, TaskType.TEXT2RDF
|
|
||||||
(
|
|
||||||
X,
|
|
||||||
Y,
|
|
||||||
padding_X,
|
|
||||||
padding_Y,
|
|
||||||
) = self.__masking_trasformation(tokenized_batch)
|
|
||||||
yield X, Y, padding_X, padding_Y, TaskType.MASKING
|
|
||||||
(
|
|
||||||
X,
|
|
||||||
Y,
|
|
||||||
padding_X,
|
|
||||||
padding_Y,
|
|
||||||
) = self.__token_completation_task(
|
|
||||||
tokenized_batch, RNG.randint(0, sys.maxsize)
|
|
||||||
)
|
|
||||||
yield X, Y, padding_X, padding_Y, TaskType.COMPLETATION
|
|
||||||
|
|
||||||
# output = pd.concat([rdf2txt_batch,txt2rdf_batch,completation_batch],ignore_index=True)
|
|
||||||
# output = output.sample(frac=1).reset_index(drop=True)
|
|
||||||
# self.decode_debug(output)
|
|
||||||
# yield output
|
|
||||||
|
|
||||||
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
|
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
|
||||||
# WIP
|
# WIP
|
||||||
@@ -110,125 +53,42 @@ class Batcher:
|
|||||||
def to_list(x):
|
def to_list(x):
|
||||||
return x.split(SpecialToken.START_TRIPLE.value)[1:]
|
return x.split(SpecialToken.START_TRIPLE.value)[1:]
|
||||||
|
|
||||||
batch["RDFs"] = batch["RDFs"].map(to_list)
|
batch["RDFs"] = batch["RDFs"].map(
|
||||||
|
to_list
|
||||||
def decode_debug(self, batch: pd.DataFrame):
|
|
||||||
decoded = pd.DataFrame()
|
|
||||||
decoded[["X", "Y"]] = batch[["X", "Y"]].map(lambda t: self._tokenizer.decode(t))
|
|
||||||
print(decoded)
|
|
||||||
|
|
||||||
def __normalization(
|
|
||||||
self, X: list[list[int]], Y: list[list[int]]
|
|
||||||
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
|
|
||||||
pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
|
|
||||||
end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
|
|
||||||
out_X = []
|
|
||||||
padding_X = []
|
|
||||||
out_Y = []
|
|
||||||
padding_Y = []
|
|
||||||
|
|
||||||
for x in X:
|
|
||||||
out_x, padding_x = normalize_sequence(
|
|
||||||
x, self.__max_length, pad_token, end_token, True
|
|
||||||
)
|
)
|
||||||
out_X.append(out_x)
|
|
||||||
padding_X.append(padding_x)
|
|
||||||
|
|
||||||
for y in Y:
|
|
||||||
out_y, padding_y = normalize_sequence(
|
|
||||||
y, self.__max_length, pad_token, end_token, True
|
|
||||||
)
|
|
||||||
out_Y.append(out_y)
|
|
||||||
padding_Y.append(padding_y)
|
|
||||||
|
|
||||||
return out_X, out_Y, padding_X, padding_Y
|
|
||||||
|
|
||||||
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
def __rdf2txt_transformation(self, batch: pd.DataFrame):
|
||||||
X: list[list[int]]
|
batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
|
||||||
task_token = self._tokenizer.encode(SpecialToken.RDF_TO_TEXT.value)
|
return batch[["X", "Y"]]
|
||||||
out = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]
|
|
||||||
out["X"] = [task_token + x for x in out["X"]]
|
|
||||||
return self.__normalization(out["X"].to_list(), out["Y"].to_list())
|
|
||||||
|
|
||||||
def __txt2rdf_transformation(self, batch: pd.DataFrame):
|
def __txt2rdf_transformation(self, batch: pd.DataFrame):
|
||||||
task_token = self._tokenizer.encode(SpecialToken.TEXT_TO_RDF.value)
|
batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
|
||||||
out = batch.rename(columns={"Abstract": "X", "RDFs": "Y"})[["X", "Y"]]
|
return batch[["X", "Y"]]
|
||||||
out["X"] = [task_token + x for x in out["X"]]
|
|
||||||
return self.__normalization(out["X"].to_list(), out["Y"].to_list())
|
|
||||||
|
|
||||||
def __masking_trasformation(self, batch: pd.DataFrame):
|
def __masking_trasformation(self, batch: pd.DataFrame):
|
||||||
X = []
|
# mask_sequence: List[int] -> Tuple[List[int], List[int]]
|
||||||
Y = []
|
xy_tuples = batch["RDFs"].apply(self._masker.mask_sequence) # Series of (X, Y)
|
||||||
for rdf in batch["RDFs"]:
|
|
||||||
x, y = self._masker.mask_sequence(rdf[:self.__max_length])
|
|
||||||
X.append(x)
|
|
||||||
Y.append(y)
|
|
||||||
return self.__normalization(X, Y)
|
|
||||||
|
|
||||||
def __token_completation_task(self, batch: pd.DataFrame, minibatch_seed: int):
|
output = batch.copy()
|
||||||
continue_triple_token = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[
|
# Expand into two columns preserving the original index
|
||||||
0
|
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
|
||||||
]
|
return output[["X", "Y"]]
|
||||||
eot = self._tokenizer.encode(SpecialToken.END_TRIPLE.value)[0]
|
|
||||||
X = []
|
|
||||||
Y = []
|
|
||||||
for rdf in batch["RDFs"]:
|
|
||||||
# here first truncate to max_lenght
|
|
||||||
rdf = rdf[: self.__max_length] # truncator that uses "eot" so no problem
|
|
||||||
x, y = self._completation_task_token_truncator(
|
|
||||||
rdf, 0.5, continue_triple_token, eot, minibatch_seed
|
|
||||||
)
|
|
||||||
X.append(x)
|
|
||||||
Y.append(y)
|
|
||||||
return self.__token_cmpletation_task_special_normalization(X, Y)
|
|
||||||
|
|
||||||
def __token_cmpletation_task_special_normalization(self, X: list[list[int]], Y: list[list[int]]
|
|
||||||
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
|
|
||||||
|
|
||||||
def continue_rdf_padding(sequence: list[int], pad_token: int):
|
def __token_completation_task(self, batch: pd.DataFrame):
|
||||||
for i, x in enumerate(sequence):
|
xy_tuples = batch["RDFs"].apply(self._token_completation.get_completation_tuple)
|
||||||
if x == pad_token:
|
output = batch.copy()
|
||||||
i = i+1 # continueRDF will be excluded by the mask
|
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
|
||||||
# fill the tail with True and stop
|
return output[["X", "Y"]]
|
||||||
return [False] * i + [True] * (len(sequence) - i)
|
|
||||||
return [False] * len(sequence) # no pad token found
|
|
||||||
|
|
||||||
pad_token = self._tokenizer.encode(SpecialToken.PAD.value)[0]
|
|
||||||
end_token = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)[0]
|
|
||||||
continue_rdf = self._tokenizer.encode(SpecialToken.CONTINUE_RDF.value)[0]
|
|
||||||
out_X = []
|
|
||||||
padding_X = []
|
|
||||||
out_Y = []
|
|
||||||
padding_Y = []
|
|
||||||
|
|
||||||
for x in X:
|
|
||||||
out_x, _ = normalize_sequence(
|
|
||||||
x, self.__max_length, pad_token, end_token, True
|
|
||||||
)
|
|
||||||
out_X.append(out_x)
|
|
||||||
# padding_X.append(padding_x)
|
|
||||||
special_padding = continue_rdf_padding(out_x,continue_rdf)
|
|
||||||
padding_X.append(special_padding)
|
|
||||||
|
|
||||||
for y in Y:
|
"""
|
||||||
out_y, padding_y = normalize_sequence(
|
DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
|
||||||
y, self.__max_length, pad_token, end_token, True
|
|
||||||
)
|
|
||||||
out_Y.append(out_y)
|
|
||||||
# special padding
|
|
||||||
# special_padding = continue_rdf_padding(out_y,continue_rdf)
|
|
||||||
# padding_Y.append(special_padding)
|
|
||||||
padding_Y.append(padding_y)
|
|
||||||
|
|
||||||
return out_X, out_Y, padding_X, padding_Y
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
|
|
||||||
DATASET_PATH = Path("Assets/Dataset/Tmp/rdf_text.csv")
|
|
||||||
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
|
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
|
||||||
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
|
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
|
||||||
SPECIAL_LIST = BPE.default_special_tokens()
|
SPECIAL_LIST = BPE.default_special_tokens()
|
||||||
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
|
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
|
||||||
@@ -238,6 +98,7 @@ if __name__ == "__main__":

    prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
    print(TOKENANO.encode(prova))
-   batcher = Batcher(DATASET_PATH, 256, TOKENANO, MASKER)
-   for batch in batcher.batch(8):
+   batcher = Batcher(DATASET_PATH, 8, TOKENANO, MASKER)
+   for batch in batcher.get_batch():
        print(batch)
+   """
@@ -1,2 +0,0 @@
from .Batcher import Batcher
from .TokenCompletation import TokenCompletationTransformer
@@ -1,5 +0,0 @@
from .TaskType import TaskType

__all__ = [
    "TaskType"
]
@@ -1,5 +0,0 @@
from .Classes import *
from .Enums import *

from . import Classes
from . import Enums
@@ -1,70 +0,0 @@
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


class Evaluator():

    def __init__(self) -> None:
        # text-based evaluators
        self.__rouge = evaluate.load("rouge")
        self.__rouge_types = ["rougeLsum", "rouge1", "rouge2"]  # rougeLsum will behave poorly: it expects sentences separated by \n
        self._bleu = evaluate.load("bleu")
        self._meteor = evaluate.load("meteor")
        # token-based evaluators
        self.__acc_m = evaluate.load("accuracy")
        self.__prec_m = evaluate.load("precision")
        self.__rec_m = evaluate.load("recall")
        self.__f1_m = evaluate.load("f1")

    def rdf2txt_rouge_evaluation(self, preds: list[str], refs: list[str]):

        results = self.__rouge.compute(
            predictions=preds, references=refs,
            rouge_types=self.__rouge_types,
            use_stemmer=True,
            use_aggregator=True  # F1
        )

        return {k: float(results[k]) for k in self.__rouge_types}

    def rdf2txt_bleu_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # sacreBLEU via evaluate; expects references as list-of-lists
        # each prediction can be evaluated against a list of references, hence [[ref]]
        results = self._bleu.compute(predictions=preds, references=[[r] for r in refs])
        return float(results["bleu"])  # native sacreBLEU scale

    def rdf2txt_meteor_evaluation(self, preds: list[str], refs: list[str]) -> float:
        # same reference layout as BLEU
        res = self._meteor.compute(predictions=preds, references=[[r] for r in refs])
        return float(res["meteor"])

    def __my_accuracy(self, preds: list[list[int]], refs: list[list[int]]):
        # computed on whole token sequences, not single tokens
        total = len(preds)
        correct = 0
        for p, r in zip(preds, refs):
            correct += int(p == r)
        return correct / total

    def __accuracy(self, preds, refs):
        return accuracy_score(preds, refs)

    def __clean_batch_by_pad(self, preds: list[list[int]], refs: list[list[int]]):
        output_preds = []
        output_refs = []
        # TODO
        pad_token: int = 7000  # percolate
        for pred, ref in zip(preds, refs):
            try:
                i = ref.index(pad_token)  # first position where the pad token appears
            except ValueError:
                i = len(ref)
            output_preds.append(pred[:i])
            output_refs.append(ref[:i])

        return output_preds, output_refs

    def __precision_recall(self, preds: list[list[int]], refs: list[list[int]]):
        # TODO
        p, r, f1, _ = precision_recall_fscore_support(
            preds, refs, average="binary", zero_division=0
        )  # watch later
        return {"precision": float(p), "recall": float(r), "f1": float(f1)}
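For reference, a minimal usage sketch of the text metrics this class wraps (the strings are toy examples; evaluate.load downloads the metric implementations on first use):

import evaluate

rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

preds = ["Cactus Flower is a 1969 comedy film."]
refs = ["Cactus Flower is a 1969 American comedy film."]

print(rouge.compute(predictions=preds, references=refs, rouge_types=["rouge1", "rouge2"]))
print(bleu.compute(predictions=preds, references=[[r] for r in refs]))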
41 Project_Model/Libs/Training/learning_rade_shedulers.py Normal file
@@ -0,0 +1,41 @@
import numpy as np


# custom LR from "Attention Is All You Need"
class Custom_lr():

    def __init__(self, d_model: int, warmup_step: int) -> None:
        self.__d_model = d_model
        self.__warmup_step = warmup_step
        self.__epoch = 0

    def step(self) -> float:
        self.__epoch += 1
        return (self.__d_model ** -0.5) * min(self.__epoch ** -0.5,
                                              self.__epoch * (self.__warmup_step ** -1.5))


# OTHER LR

# Learning rate schedules (matching visualization parameters)
def step_lr(epoch, lr):
    # StepLR: step_size=20, gamma=0.5 (from visualization)
    return lr * 0.5 if epoch % 20 == 0 and epoch > 0 else lr

def exp_lr(epoch, lr):
    # ExponentialLR: gamma=0.95 (from visualization)
    return lr * 0.95

def cosine_lr(epoch, lr):
    # CosineAnnealingLR: lr_min=0.001, lr_max=0.1, max_epochs=100 (from visualization)
    lr_min, lr_max = 0.001, 0.1
    max_epochs = 100
    return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch * np.pi / max_epochs))

def cyclical_lr(epoch, lr):
    # CyclicalLR: base_lr=0.001, max_lr=0.1, step_size=20 (from visualization)
    base_lr = 0.001
    max_lr = 0.1
    step_size = 20

    cycle = np.floor(1 + epoch / (2 * step_size))
    x = np.abs(epoch / step_size - 2 * cycle + 1)
    return base_lr + (max_lr - base_lr) * max(0, (1 - x))
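A possible way to wire a schedule like Custom_lr into a PyTorch optimizer is through LambdaLR, which multiplies the base learning rate by whatever the lambda returns. This is a sketch only; the inline lambda mirrors the formula in Custom_lr.step and the Linear module is just a stand-in:

import torch

model = torch.nn.Linear(8, 8)                       # stand-in module
optimizer = torch.optim.AdamW(model.parameters(), lr=1.0)

d_model, warmup = 256, 4000
noam = lambda step: (d_model ** -0.5) * min(max(step, 1) ** -0.5, max(step, 1) * warmup ** -1.5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=noam)

for _ in range(5):
    optimizer.step()
    scheduler.step()                                # lr rises during warmup, then decays as 1/sqrt(step)
print(optimizer.param_groups[0]["lr"])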
@@ -25,8 +25,8 @@ class LogitsCollector:
        for row in ids.tolist():
            seq: list[int] = []
            for tok in row:
-               # if tok == self.__end_token:  # stop on END
-               #     break
+               if tok == self.__end_token:  # stop on END
+                   break
                if tok == self.__pad_token:  # skip PAD
                    continue
                seq.append(tok)
@@ -36,7 +36,6 @@ class LogitsCollector:
    def print_decoded(self) -> None:
        for i, seq in enumerate(self.tokens()):
            try:
-               # text = text + self.__end_token
                text = self.__tokenizer.decode(seq)  # decode tokens to string
            except Exception:
                text = str(seq)  # fallback to ids
@@ -1,20 +0,0 @@
import os
from pathlib import Path


class Log:

    def __init__(self, path):
        self.path = path
        header = ["epoch", "avg_txt", "avg_enc", "avg_dec", "txt_loss", "masking_loss", "prediction_loss"]

        if Path(path).is_file():
            return

        with open(self.path, "w", encoding="utf-8", newline="") as f:
            f.write(",".join(header) + "\n")

    def write(self, loss: list[float]):
        line = ",".join(str(float(x)) for x in loss) + "\n"
        with open(self.path, "a", encoding="utf-8", newline="") as f:
            f.write(line)
            f.flush()
            os.fsync(f.fileno())  # extra durability per write, protects against sudden crashes

0 Project_Model/Libs/Training/training.py Normal file
@@ -1,9 +1,8 @@
-from typing import Optional
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
-from ..Utils.attention_mask import get_causal_attention_mask, get_prefix_causal_mask_from_padding_mask
+from ..Utils.attention_mask import get_causal_attention_mask

# B, L(T), E_D
@@ -16,10 +15,8 @@ class Decoder(nn.Module):
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
-       self.__attention_heads = number_of_attention_heads
        super().__init__()

        self.__masked_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
@@ -45,52 +42,43 @@ class Decoder(nn.Module):
|
|||||||
torch.Tensor,
|
torch.Tensor,
|
||||||
torch.Tensor,
|
torch.Tensor,
|
||||||
torch.Tensor,
|
torch.Tensor,
|
||||||
torch.Tensor,
|
torch.Tensor
|
||||||
Optional[bool]
|
|
||||||
]
|
]
|
||||||
): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
|
): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
|
||||||
# WARNING: args is needed to have sequential
|
# WARNING: args is needed to have sequential
|
||||||
if len(args) < 6:
|
x, k_x, v_x, padding_mask,encoder_padding_mask = args
|
||||||
args = args + (False)
|
|
||||||
x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only = args
|
|
||||||
|
|
||||||
# build of attention mask
|
# build of attention mask
|
||||||
# TODO: create a prefix causal mask if needed
|
|
||||||
if decoder_only:
|
|
||||||
attention_mask = get_prefix_causal_mask_from_padding_mask(x.size(1),src_padding_mask,self.__attention_heads) # the correct is tgt however ...
|
|
||||||
else:
|
|
||||||
attention_mask = get_causal_attention_mask(x.size(1))
|
attention_mask = get_causal_attention_mask(x.size(1))
|
||||||
|
|
||||||
# 1) Masked Attention
|
# 1) Masked Attention
|
||||||
MASKED_ATTENTION = self.__masked_attention(
|
MASKED_ATTENTION = self.__masked_attention(
|
||||||
x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=attention_mask
|
x, x, x, key_padding_mask=padding_mask, attention_mask=attention_mask
|
||||||
)
|
)
|
||||||
|
|
||||||
# 2) Dropout
|
# 2) Dropout
|
||||||
DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
|
# DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
|
||||||
del MASKED_ATTENTION
|
# del MASKED_ATTENTION
|
||||||
|
|
||||||
# 3) Residual Connection
|
# 3) Residual Connection
|
||||||
x = x + DROPPED_MASKED_ATTENTION
|
x = x + MASKED_ATTENTION
|
||||||
del DROPPED_MASKED_ATTENTION
|
del MASKED_ATTENTION
|
||||||
|
|
||||||
# 4) Layer Normalization
|
# 4) Layer Normalization
|
||||||
x = self.__layer_norm_1(x)
|
x = self.__layer_norm_1(x)
|
||||||
|
|
||||||
|
|
||||||
if not decoder_only:
|
|
||||||
# 5) Encoder–decoder (cross) attention
|
# 5) Encoder–decoder (cross) attention
|
||||||
CROSS_ATTENTION = self.__cross_attention(
|
CROSS_ATTENTION = self.__cross_attention(
|
||||||
x, k_x, v_x, key_padding_mask=src_padding_mask
|
x, k_x, v_x, key_padding_mask=encoder_padding_mask
|
||||||
)
|
)
|
||||||
|
|
||||||
# 6) Dropout
|
# 6) Dropout
|
||||||
DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
|
# DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
|
||||||
del CROSS_ATTENTION
|
# del CROSS_ATTENTION
|
||||||
|
|
||||||
# 7) Residual Connection
|
# 7) Residual Connection
|
||||||
x = x + DROPPED_CROSS_ATTENTION
|
x = x + CROSS_ATTENTION
|
||||||
del DROPPED_CROSS_ATTENTION
|
del CROSS_ATTENTION
|
||||||
|
|
||||||
# 8) Layer Normalization
|
# 8) Layer Normalization
|
||||||
x = self.__layer_norm_2(x)
|
x = self.__layer_norm_2(x)
|
||||||
@@ -99,17 +87,17 @@ class Decoder(nn.Module):
|
|||||||
FEED_FORWARD = self.__feed_forward_network(x)
|
FEED_FORWARD = self.__feed_forward_network(x)
|
||||||
|
|
||||||
# 10) Dropout
|
# 10) Dropout
|
||||||
DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
|
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
|
||||||
del FEED_FORWARD
|
# del FEED_FORWARD
|
||||||
|
|
||||||
# 11) Residual Connection
|
# 11) Residual Connection
|
||||||
x = x + DROPPED_FEED_FORWARD
|
x = x + FEED_FORWARD
|
||||||
del DROPPED_FEED_FORWARD
|
del FEED_FORWARD
|
||||||
|
|
||||||
# 12) Layer Normalization
|
# 12) Layer Normalization
|
||||||
x = self.__layer_norm_3(x)
|
x = self.__layer_norm_3(x)
|
||||||
|
|
||||||
return (x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only)
|
return (x, k_x, v_x, padding_mask, encoder_padding_mask)
|
||||||
|
|
||||||
|
|
||||||
# use eval to disable dropout ecc
|
# use eval to disable dropout ecc
|
||||||
|
|||||||
@@ -43,12 +43,12 @@ class Encoder(
        ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)

        # 2) Dropout
-       DROPPED_ATTENTION = self.__dropout(ATTENTION)
-       del ATTENTION
+       # DROPPED_ATTENTION = self.__dropout(ATTENTION)
+       # del ATTENTION

        # 3) Residual Connection
-       x = x + DROPPED_ATTENTION
-       del DROPPED_ATTENTION
+       x = x + ATTENTION
+       del ATTENTION

        # 4) Layer Normalization
        x = self.__layer_norm_1(x)
@@ -57,12 +57,12 @@ class Encoder(
        FEED_FORWARD = self.__feed_forward(x)

        # 6) Dropout
-       DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
-       del FEED_FORWARD
+       # DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
+       # del FEED_FORWARD

        # 7) Residual Connection
-       x = x + DROPPED_FEED_FORWARD
-       del DROPPED_FEED_FORWARD
+       x = x + FEED_FORWARD
+       del FEED_FORWARD

        # 8) Layer Normalization
        x = self.__layer_norm_2(x)
23 Project_Model/Libs/Transformer/Classes/NanoSocrates.py Normal file
@@ -0,0 +1,23 @@
import torch
from NanoSocratesCore import NanoSocratesCore


class NanoSocrates(torch.nn.Module):

    def __init__(self,
                 embedded_size: int,
                 feed_forward_dim: int,
                 encoder_layers: int,
                 decoder_layers: int,
                 attention_heads: int,
                 vocab_size: int) -> None:

        super().__init__()

        self._model = NanoSocratesCore(
            embedded_size,
            feed_forward_dim,
            encoder_layers,
            decoder_layers,
            attention_heads,
            vocab_size)
@@ -16,8 +16,11 @@ class NanoSocratesCore(torch.nn.Module):
        num_encoder_layers: int = 2,
        num_decoder_layers: int = 2,
        num_attention_heads: int = 4,
+       pad_token: int = 0,
    ) -> None:

+       super().__init__()
+       self.__pad_token = pad_token
        feed_forward_dim = embedding_size * feed_forward_multiplier

        self.__sentence_length = sentence_length
@@ -43,69 +46,64 @@ class NanoSocratesCore(torch.nn.Module):
|
|||||||
self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
|
self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
|
||||||
self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)
|
self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)
|
||||||
|
|
||||||
|
@torch.no_grad() # inference only
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
encoder_input: list[list[int]],
|
encoder_input: list[list[int]],
|
||||||
decoder_input: list[list[int]],
|
decoder_input: list[list[int]], # must start with <SOS> and PAD elsewhere
|
||||||
encoder_padding_mask: list[list[int]],
|
encoder_padding_mask: list[list[bool]], # True where encoder is PAD
|
||||||
):
|
):
|
||||||
|
|
||||||
if len(encoder_padding_mask) != len(encoder_input):
|
|
||||||
raise Exception("Mismatch in received_dimensions")
|
|
||||||
|
|
||||||
# TODO: check for tensor in input to embedder
|
|
||||||
# 1) Embed User-Input for encoders
|
# 1) Embed User-Input for encoders
|
||||||
ENCODER_INPUT = self.__input_embeder(encoder_input)
|
ENCODER_INPUT = self.__input_embeder(encoder_input) # [B,S,E]
|
||||||
|
|
||||||
# 2) Encode User-Input
|
# 2) Encode User-Input
|
||||||
ENCODER_OUTPUT, _ = self.__encoder_sequence(ENCODER_INPUT, encoder_padding_mask)
|
ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence(
|
||||||
|
(ENCODER_INPUT, encoder_padding_mask) # as tuple
|
||||||
|
) # [B,S,E], [B,S]
|
||||||
del ENCODER_INPUT
|
del ENCODER_INPUT
|
||||||
|
|
||||||
exit_loop = False
|
# 3) Autoregressive Output (greedy)
|
||||||
decoder_token_list = decoder_input[:]
|
LOGITS_HISTORY: list[torch.Tensor] = [] # keep per-step distributions
|
||||||
|
decoder_token_list = [row[:] for row in decoder_input] # copy tokens
|
||||||
decoder_phase = 0
|
decoder_phase = 0
|
||||||
|
exit_loop = False
|
||||||
|
|
||||||
LOGITS_HISTORY: list[torch.Tensor] = []
|
|
||||||
|
|
||||||
# 3) Autoregressive Output
|
|
||||||
while not exit_loop:
|
while not exit_loop:
|
||||||
|
decoder_phase += 1 # move to next position
|
||||||
|
|
||||||
# 3.0) Increment Counter
|
# 3.1) Build decoder key padding mask from current tokens (True where PAD)
|
||||||
decoder_phase += 1
|
DECODER_KEY_PADDING_MASK: list[list[bool]] = [
|
||||||
|
[tok == self.__pad_token for tok in row] for row in decoder_token_list
|
||||||
|
] # [B,T]
|
||||||
|
|
||||||
# 3.1) Embed Decoder Input
|
# 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside)
|
||||||
decoder_input = self.__output_embedder(decoder_token_list)
|
DECODER_INPUT = self.__output_embedder(decoder_token_list) # [B,T,E]
|
||||||
|
|
||||||
# 3.2) Decode Decoder Input
|
# 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks)
|
||||||
DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
|
DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
|
||||||
decoder_input, ENCODER_OUTPUT, ENCODER_OUTPUT
|
(DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT,
|
||||||
)
|
DECODER_KEY_PADDING_MASK, encoder_padding_mask)
|
||||||
|
) # [B,T,E]
|
||||||
|
del DECODER_INPUT
|
||||||
|
|
||||||
# 3.3) Go back to Token space
|
# 3.4) Project to token space
|
||||||
# TODO: change name
|
LOGITS = self.__linear(DECODER_OUTPUT) # [B,T,V]
|
||||||
LOGITS = self.__linear(DECODER_OUTPUT)
|
|
||||||
del DECODER_OUTPUT
|
del DECODER_OUTPUT
|
||||||
|
|
||||||
# 3.4) Transform in probabilities
|
# 3.5) Probabilities and greedy pick at current step
|
||||||
# TODO: change name
|
TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1) # [B,T,V]
|
||||||
TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)
|
LOGITS_HISTORY.append(TOKEN_PROBABILITIES) # store for this step
|
||||||
del LOGITS
|
|
||||||
|
|
||||||
LOGITS_HISTORY.append(TOKEN_PROBABILITIES)
|
step_idx = decoder_phase - 1 # 0-based
|
||||||
|
TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist() # [B] -> list[int]
|
||||||
|
|
||||||
# 3.5) Take most probable tokens
|
# 3.6) Write prediction into next slot (the slot is PAD)
|
||||||
TOKEN_IDS = torch.argmax(TOKEN_PROBABILITIES, -1)
|
if step_idx + 1 < self.__sentence_length:
|
||||||
|
for b, tok in enumerate(TOKEN_IDS):
|
||||||
|
decoder_token_list[b][step_idx + 1] = tok # feed next position
|
||||||
|
|
||||||
# TODO: check for dimensions and for efficiency
|
# 3.7) Stop when we filled the sequence
|
||||||
DECODER_TOKEN_TENSOR = torch.tensor(decoder_token_list)
|
|
||||||
DECODER_TOKEN_TENSOR[:, decoder_phase] = TOKEN_IDS
|
|
||||||
decoder_token_list = DECODER_TOKEN_TENSOR.tolist()
|
|
||||||
|
|
||||||
del TOKEN_IDS
|
|
||||||
del DECODER_TOKEN_TENSOR
|
|
||||||
|
|
||||||
# 3.6) Check if we generated all tokens
|
|
||||||
if decoder_phase == self.__sentence_length - 1:
|
if decoder_phase == self.__sentence_length - 1:
|
||||||
exit_loop = True
|
exit_loop = True
|
||||||
|
|
||||||
return LOGITS_HISTORY
|
return LOGITS_HISTORY # list of [B,T,V] (per step)
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ class SpannedMasker:
        max_vocabulary: int,
        forbidden_tokens: set[int],
        change_token_probability: float = 0.15,
-       average_span: int = 2,
+       average_span: int = 1,
        seed: int = random.randint(0, sys.maxsize),
    ) -> None:
@@ -25,11 +25,6 @@ class SpannedMasker:
        self.__forbidden_tokens = forbidden_tokens

-   def reseed(self, seed: int):
-       self.__rng = random.Random(seed)

    def mask_sequence(
        self,
        token_sequence: list[int],
@@ -1,47 +0,0 @@
from typing import override
import torch


# custom LR from "Attention Is All You Need"
class WarmupLR(torch.optim.lr_scheduler.LRScheduler):

    def __init__(
        self,
        optimizer: torch.optim.Optimizer,
        warmup_steps: int,
        embedding_size: int,
        warming_multiplier: float = -1.5,
        decaying_multiplier: float = -0.5,
        multiplicative_factor: float = 1.0,
        last_epoch: int = -1,
    ) -> None:
        self.__warmup_steps = warmup_steps
        self.__embedding_size = embedding_size
        self.__warming_multiplier = warming_multiplier
        self.__decaying_multiplier = decaying_multiplier
        self.__multiplicative_factor = multiplicative_factor
        super().__init__(optimizer, last_epoch)

    def __scale_at(self, step: int) -> float:
        step = max(step, 1)
        return (
            self.__multiplicative_factor
            * (self.__embedding_size**self.__decaying_multiplier)
            * min(
                step**self.__decaying_multiplier,
                step * (self.__warmup_steps**self.__warming_multiplier),
            )
        )

    @override
    def get_lr(self) -> list[float]:
        torch.optim.lr_scheduler._warn_get_lr_called_within_step(self)

        step = max(self.last_epoch, 1)
        scale = self.__scale_at(step)
        return [base_lr * scale for base_lr in self.base_lrs]

    def _get_closed_form_lr(self):
        step = max(self.last_epoch, 1)
        scale = self.__scale_at(step)
        return [base_lr * scale for base_lr in self.base_lrs]
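This scheduler applies the warmup curve from "Attention Is All You Need": the scale grows roughly linearly until warmup_steps, then decays as the inverse square root of the step. A minimal usage sketch with the same parameters the training playground passes (4000 warmup steps, embedding size 256); the Linear module is only a placeholder:

import torch

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.AdamW(model.parameters())
scheduler = WarmupLR(optimizer, 4000, 256)   # as in the training playground

for _ in range(10):
    optimizer.step()
    scheduler.step()                         # call once per optimizer step
print(optimizer.param_groups[0]["lr"])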
@@ -5,7 +5,6 @@ from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention
from .SpannedMasker import SpannedMasker
from .DeToken import DeToken
-from .WarmupLR import WarmupLR

__all__ = [
    "Decoder",
@@ -13,6 +12,5 @@ __all__ = [
    "FeedForwardNetwork",
    "TorchMultiHeadAttention",
    "SpannedMasker",
-   "DeToken",
-   "WarmupLR"
+   "DeToken"
]
@@ -1,33 +0,0 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import DeToken


class NanoSocraDecoder(torch.nn.Module):

    def __init__(
        self,
        decoder_embedder: Embedder.NanoSocratesEmbedder,
        decoder_layers: torch.nn.Sequential,
        detokener: DeToken
    ) -> None:
        super().__init__()

        self.__decoder_embedder = decoder_embedder
        self.__decoder = decoder_layers
        self.__detokener = detokener

    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]):

        decoder_embedder_input, prefix_mask, tgt_padding = args

        decoder_tensor = self.__decoder_embedder(decoder_embedder_input)

        decoder_output, _, _, _, _, _ = self.__decoder(
            (decoder_tensor, decoder_tensor, decoder_tensor, prefix_mask, tgt_padding, True)
        )

        logits: torch.Tensor = self.__detokener(decoder_output)

        return logits
@@ -1,29 +0,0 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import DeToken


class NanoSocratEncoder(torch.nn.Module):

    def __init__(
        self,
        encoder_embedder: Embedder.NanoSocratesEmbedder,
        encoder_layers: torch.nn.Sequential,
        detokener: DeToken
    ) -> None:
        super().__init__()

        self.__encoder_embedder = encoder_embedder
        self.__encoder = encoder_layers
        self.__detokener = detokener

    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):

        encoder_embedder_input, src_padding = args

        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)

        encoder_output, _ = self.__encoder((encoder_tensor, src_padding))

        logits: torch.Tensor = self.__detokener(encoder_output)

        return logits
@@ -1,219 +0,0 @@
|
|||||||
import torch
|
|
||||||
import Project_Model.Libs.Embedder as Embedder
|
|
||||||
from ..Classes import Encoder, Decoder, DeToken
|
|
||||||
from ..Utils import get_decoder_input
|
|
||||||
from Project_Model.Libs.Batch import TaskType
|
|
||||||
|
|
||||||
|
|
||||||
class NanoSocratesCore(torch.nn.Module):
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
vocabulary_size: int,
|
|
||||||
sentence_max_length: int,
|
|
||||||
sos: int,
|
|
||||||
pad: int,
|
|
||||||
eos: int,
|
|
||||||
continuerdf: int,
|
|
||||||
latent_space: int = 256,
|
|
||||||
feed_forward_multiplier: int = 4,
|
|
||||||
attention_heads: int = 4,
|
|
||||||
layer_number: int = 2,
|
|
||||||
) -> None:
|
|
||||||
super().__init__()
|
|
||||||
|
|
||||||
self.__sos = sos
|
|
||||||
self.__pad = pad
|
|
||||||
self.__eos = eos
|
|
||||||
self.__continuerdf = continuerdf
|
|
||||||
self.__sentence_len = sentence_max_length
|
|
||||||
|
|
||||||
feed_forward_latent_space = latent_space * feed_forward_multiplier
|
|
||||||
|
|
||||||
self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
|
|
||||||
vocabulary_size, latent_space
|
|
||||||
)
|
|
||||||
self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
|
|
||||||
vocabulary_size, latent_space
|
|
||||||
)
|
|
||||||
|
|
||||||
TMP_ENCODERS = [
|
|
||||||
Encoder(latent_space, feed_forward_latent_space, attention_heads)
|
|
||||||
] * layer_number
|
|
||||||
|
|
||||||
TMP_DECODERS = [
|
|
||||||
Decoder(latent_space, feed_forward_latent_space, attention_heads)
|
|
||||||
] * layer_number
|
|
||||||
|
|
||||||
self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
|
|
||||||
self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
|
|
||||||
|
|
||||||
self.__detokener = DeToken(latent_space, vocabulary_size)
|
|
||||||
self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
|
|
||||||
|
|
||||||
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
|
|
||||||
|
|
||||||
encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
|
|
||||||
|
|
||||||
encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
|
|
||||||
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
|
|
||||||
|
|
||||||
encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
|
|
||||||
|
|
||||||
decoder_output, _, _, _, _, _ = self.__decoder(
|
|
||||||
(decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
|
|
||||||
)
|
|
||||||
|
|
||||||
logits: torch.Tensor = self.__detokener(decoder_output)
|
|
||||||
|
|
||||||
return logits
|
|
||||||
|
|
||||||
def inference(self, input: tuple[torch.Tensor, torch.Tensor], task_type: TaskType) -> torch.Tensor:
|
|
||||||
|
|
||||||
if task_type == TaskType.MASKING:
|
|
||||||
return self.__masking(input)
|
|
||||||
|
|
||||||
if task_type == TaskType.COMPLETATION:
|
|
||||||
return self.__continue_rdf(input)
|
|
||||||
|
|
||||||
return self.__text_generation(input)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def __text_generation(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
|
||||||
|
|
||||||
x, padding = args
|
|
||||||
|
|
||||||
encoder_tensor = self.__encoder_embedder(x)
|
|
||||||
|
|
||||||
BATCH: int
|
|
||||||
|
|
||||||
if len(x.shape) > 2:
|
|
||||||
BATCH, SEQ_LEN, _ = x.shape
|
|
||||||
else:
|
|
||||||
_, SEQ_LEN = x.shape
|
|
||||||
BATCH = 1
|
|
||||||
|
|
||||||
encoder_output, _ = self.__encoder((encoder_tensor, padding))
|
|
||||||
|
|
||||||
decoder_in = get_decoder_input(BATCH, self.__sos, self.__pad, SEQ_LEN)
|
|
||||||
decoder_in_pad_mask = decoder_in.eq(self.__pad)
|
|
||||||
|
|
||||||
continue_generating = True
|
|
||||||
token_idx = 0
|
|
||||||
|
|
||||||
while continue_generating:
|
|
||||||
|
|
||||||
decoder_in_x = self.__decoder_embedder(decoder_in)
|
|
||||||
|
|
||||||
decoder_output, _, _, _, _, _ = self.__decoder(
|
|
||||||
(decoder_in_x, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
|
|
||||||
)
|
|
||||||
|
|
||||||
logits: torch.Tensor = self.__detokener(decoder_output)
|
|
||||||
|
|
||||||
logits = torch.softmax(logits, 2)
|
|
||||||
|
|
||||||
tokens = torch.argmax(logits, 2)
|
|
||||||
|
|
||||||
if token_idx < self.__sentence_len - 1:
|
|
||||||
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
|
|
||||||
decoder_in_pad_mask = decoder_in.eq(self.__pad)
|
|
||||||
|
|
||||||
if token_idx == self.__sentence_len - 1:
|
|
||||||
continue_generating = False
|
|
||||||
continue
|
|
||||||
|
|
||||||
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
|
|
||||||
continue_generating = False
|
|
||||||
continue
|
|
||||||
|
|
||||||
|
|
||||||
token_idx += 1
|
|
||||||
|
|
||||||
return decoder_in
|
|
||||||
|
|
||||||
|
|
||||||
def __masking(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
|
||||||
|
|
||||||
x, padding = args
|
|
||||||
|
|
||||||
encoder_tensor = self.__encoder_embedder(x)
|
|
||||||
x, _ = self.__encoder((encoder_tensor, padding))
|
|
||||||
|
|
||||||
logits: torch.Tensor = self.__encoder_detokener(x)
|
|
||||||
del x
|
|
||||||
|
|
||||||
logits = torch.softmax(logits, 2)
|
|
||||||
|
|
||||||
tokens = torch.argmax(logits, 2)
|
|
||||||
|
|
||||||
return tokens
|
|
||||||
|
|
||||||
|
|
||||||
def __continue_rdf(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
|
||||||
|
|
||||||
decoder_in, _ = args
|
|
||||||
decoder_in_prefix_mask = decoder_in.eq(self.__pad)
|
|
||||||
decoder_in_pad_mask = decoder_in.eq(self.__pad)
|
|
||||||
|
|
||||||
continue_generating = True
|
|
||||||
token_idx: int= int((decoder_in[0] == self.__continuerdf).nonzero()[0].item()) + 1
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
while continue_generating:
|
|
||||||
|
|
||||||
decoder_x = self.__decoder_embedder(decoder_in)
|
|
||||||
|
|
||||||
decoder_output, _, _, _, _, _ = self.__decoder(
|
|
||||||
(decoder_x, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, True)
|
|
||||||
)
|
|
||||||
|
|
||||||
logits: torch.Tensor = self.__detokener(decoder_output)
|
|
||||||
|
|
||||||
logits = torch.softmax(logits, 2)
|
|
||||||
|
|
||||||
tokens = torch.argmax(logits, 2)
|
|
||||||
|
|
||||||
if token_idx < self.__sentence_len - 1:
|
|
||||||
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
|
|
||||||
decoder_in_pad_mask = decoder_in.eq(self.__pad)
|
|
||||||
|
|
||||||
if token_idx == self.__sentence_len - 1:
|
|
||||||
continue_generating = False
|
|
||||||
continue
|
|
||||||
|
|
||||||
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
|
|
||||||
continue_generating = False
|
|
||||||
continue
|
|
||||||
|
|
||||||
token_idx += 1
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return decoder_in
|
|
||||||
|
|
||||||
def take_pieces(self):
|
|
||||||
|
|
||||||
return (
|
|
||||||
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
|
|
||||||
(self.__decoder_embedder, self.__decoder, self.__detokener)
|
|
||||||
)
|
|
||||||
|
|
||||||
def load_pieces(
|
|
||||||
self,
|
|
||||||
encoder_embedder: Embedder.NanoSocratesEmbedder,
|
|
||||||
decoder_embedder: Embedder.NanoSocratesEmbedder,
|
|
||||||
encoder: torch.nn.Sequential,
|
|
||||||
decoder: torch.nn.Sequential,
|
|
||||||
encoder_detokener: DeToken,
|
|
||||||
decoder_detokener: DeToken
|
|
||||||
):
|
|
||||||
self.__encoder_embedder = encoder_embedder
|
|
||||||
self.__decoder_embedder = decoder_embedder
|
|
||||||
self.__encoder = encoder
|
|
||||||
self.__decoder = decoder
|
|
||||||
self.__encoder_detokener = encoder_detokener
|
|
||||||
self.__detokener = decoder_detokener
|
|
||||||
@@ -24,40 +24,49 @@ class TrainingModel(torch.nn.Module):
|
|||||||
vocabulary_size, latent_space
|
vocabulary_size, latent_space
|
||||||
)
|
)
|
||||||
|
|
||||||
TMP_ENCODERS = [
|
# do NOT share layer weights
|
||||||
|
enc_layers = [
|
||||||
Encoder(latent_space, feed_forward_latent_space, attention_heads)
|
Encoder(latent_space, feed_forward_latent_space, attention_heads)
|
||||||
] * layer_number
|
for _ in range(layer_number)
|
||||||
|
]
|
||||||
TMP_DECODERS = [
|
dec_layers = [
|
||||||
Decoder(latent_space, feed_forward_latent_space, attention_heads)
|
Decoder(latent_space, feed_forward_latent_space, attention_heads)
|
||||||
] * layer_number
|
for _ in range(layer_number)
|
||||||
|
]
|
||||||
|
|
||||||
self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
|
self.__encoder = torch.nn.Sequential(*enc_layers)
|
||||||
self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
|
self.__decoder = torch.nn.Sequential(*dec_layers)
|
||||||
|
|
||||||
self.__detokener = DeToken(latent_space, vocabulary_size)
|
self.__detokener = DeToken(latent_space, vocabulary_size)
|
||||||
self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
|
|
||||||
|
|
||||||
def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
|
def forward(
|
||||||
|
self,
|
||||||
|
args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
|
||||||
|
):
|
||||||
|
# returns logits for the LAST decoder position only -> [B, V]
|
||||||
|
(
|
||||||
|
encoder_embedder_input, # [B,S] encoder tokens
|
||||||
|
encoder_padding_mask, # [B,S] True where encoder is PAD
|
||||||
|
decoder_embedder_prefix, # [B,Tp] decoder prefix (e.g., <SOS> + tokens so far)
|
||||||
|
decoder_padding_mask, # [B,Tp] True where decoder prefix has PAD
|
||||||
|
) = args
|
||||||
|
|
||||||
encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
|
# 1) embeddings
|
||||||
|
encoder_tensor = self.__encoder_embedder(encoder_embedder_input) # [B,S,E]
|
||||||
|
decoder_tensor = self.__decoder_embedder(decoder_embedder_prefix) # [B,Tp,E]
|
||||||
|
|
||||||
encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
|
# 2) encode
|
||||||
decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
|
encoder_output, _ = self.__encoder((encoder_tensor, encoder_padding_mask)) # [B,S,E], [B,S]
|
||||||
|
|
||||||
encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
|
# 3) decode (causal mask is built inside the decoder)
|
||||||
|
decoder_output, _, _, _, _ = self.__decoder(
|
||||||
|
(decoder_tensor, encoder_output, encoder_output,
|
||||||
|
decoder_padding_mask, encoder_padding_mask)
|
||||||
|
) # [B,Tp,E], ...
|
||||||
|
|
||||||
decoder_output, _, _, _, _, _ = self.__decoder(
|
# 4) project only the last time step
|
||||||
(decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
|
last_hidden = decoder_output[:, -1:, :] # [B,1,E]
|
||||||
)
|
step_logits = self.__detokener(last_hidden) # [B,1,V]
|
||||||
|
step_logits = step_logits[:, -1, :] # [B,V]
|
||||||
|
|
||||||
logits: torch.Tensor = self.__detokener(decoder_output)
|
return step_logits # logits for one token
|
||||||
|
|
||||||
return logits
|
|
||||||
|
|
||||||
def take_pieces(self):
|
|
||||||
|
|
||||||
return (
|
|
||||||
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
|
|
||||||
(self.__decoder_embedder, self.__decoder, self.__detokener)
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -1,11 +1,5 @@
from .TrainingModel import TrainingModel
-from .NanoSocratEncoder import NanoSocratEncoder
-from .NanoSocraDecoder import NanoSocraDecoder
-from .NanoSocrates import NanoSocratesCore

__all__ = [
-   "TrainingModel",
-   "NanoSocratEncoder",
-   "NanoSocraDecoder",
-   "NanoSocratesCore"
+   "TrainingModel"
]
@@ -3,9 +3,6 @@ from .task_type import TaskType
from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask
from .inference_masking import inference_masking
from .truncate_rdf_list import truncate_rdf_list
-from .decode_out import tensor2token
-from .decoder_input import get_decoder_input

__all__ = [
    "TaskType",
@@ -16,7 +13,5 @@ __all__ = [
    "create_padding_mask",
    "normalize_sequence",
    "inference_masking",
-   "truncate_rdf_list",
-   "tensor2token",
-   "get_decoder_input"
+   "truncate_rdf_list"
]
@@ -9,22 +9,3 @@ def get_causal_attention_mask_batched(seq_len: int, batch_size: int ) -> torch.T
    base_mask = get_causal_attention_mask(seq_len)
    return base_mask.unsqueeze(0).expand(batch_size, -1, -1)  # add another dimension at the beginning, big as batch_size
    # the result is that z,x,y where x,y are repeated along z

-def get_causal_attention_mask_with_prefix(seq_len, prefix):
-    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
-    mask[:, :prefix] = False
-    return mask
-
-def get_prefix_causal_mask_from_padding_mask(seq_len: int, prefix_mask, att_heads: int = 1):
-    expanded_padding_mask = prefix_mask.unsqueeze(-1).repeat(1, 1, seq_len)  # B,T,T
-    expanded_padding_mask = expanded_padding_mask.permute(0, 2, 1)  # B,T,T
-    mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)  # T,T
-    tri_batched = mask.unsqueeze(0)  # 1,T,T will broadcast over B
-    prefix_causal_mask = expanded_padding_mask & tri_batched
-    prefix_causal_mask = prefix_causal_mask.repeat_interleave(att_heads, dim=0)  # B*H,T,T
-    return prefix_causal_mask
-
-# def get_prefix_causal_mask():
-#     continue_rdf =
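For intuition, the removed prefix-causal idea keeps the prefix columns always visible while the remaining positions stay causal. A tiny standalone illustration for a 4-token sequence with a 2-token prefix (True means "masked"; this only mirrors the comments above and is not the project's API):

import torch

seq_len, prefix_len = 4, 2
causal = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
causal[:, :prefix_len] = False           # prefix columns are never masked
print(causal)
# tensor([[False, False,  True,  True],
#         [False, False,  True,  True],
#         [False, False, False,  True],
#         [False, False, False, False]])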
|||||||
@@ -1,27 +0,0 @@
-from typing import Generator
-
-import torch
-
-
-def tensor2token(tensor: torch.Tensor, end_token: int) -> Generator[list[int]]:
-
-    if len(tensor.shape) < 1 or len(tensor.shape) > 2:
-        raise ValueError("Shape is not correct")
-
-    if len(tensor.shape) == 1:
-        token_list: list[int] = tensor.tolist()
-        token_list.append(end_token)
-        yield token_list
-        return
-
-    batch_len: int
-    batch_len, _ = tensor.shape
-
-    for i in range(batch_len):
-
-        smaller_tensor = tensor[i, :]
-        token_list: list[int] = smaller_tensor.tolist()
-        token_list.append(end_token)
-        yield token_list
-
-
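A lightly adapted copy of the removed tensor2token generator (the Generator annotation is expanded to three type arguments so it also runs on Python versions before 3.13), followed by a hypothetical usage; the token ids and the end-token id 2 are made up for illustration.

from typing import Generator

import torch

def tensor2token(tensor: torch.Tensor, end_token: int) -> Generator[list[int], None, None]:

    if len(tensor.shape) < 1 or len(tensor.shape) > 2:
        raise ValueError("Shape is not correct")

    if len(tensor.shape) == 1:
        token_list: list[int] = tensor.tolist()
        token_list.append(end_token)
        yield token_list
        return

    batch_len, _ = tensor.shape

    for i in range(batch_len):
        smaller_tensor = tensor[i, :]  # one row of the batch
        token_list = smaller_tensor.tolist()
        token_list.append(end_token)
        yield token_list


single = torch.tensor([5, 6, 7])
batch = torch.tensor([[5, 6], [7, 8]])

print(list(tensor2token(single, end_token=2)))  # [[5, 6, 7, 2]]
print(list(tensor2token(batch, end_token=2)))   # [[5, 6, 2], [7, 8, 2]]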
@@ -1,14 +0,0 @@
-import torch
-from ..Utils import normalize_sequence
-# from Project_Model.Libs.Embedder import NanoSocratesEmbedder as Embedder
-
-
-def get_decoder_input(batch_size, sos_token,pad_token, seq_len):
-
-    single_decoder_input, _ = normalize_sequence([sos_token],seq_len,pad_token, end_token=0, add_ending=False)
-    tensor_decoder_input = torch.tensor(single_decoder_input[:])
-    # embedded_decoder_intput = embedder(tensor_decoder_input)
-
-    batch_decoder_input = tensor_decoder_input.unsqueeze(0).repeat(batch_size, 1)
-    return batch_decoder_input
-
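The removed get_decoder_input helper depends on normalize_sequence, which is not reproduced here; the sketch below only shows its torch half, namely how a single padded decoder sequence is tiled into a batch with the same unsqueeze/repeat calls as the hunk above. The ids (1 for <SOS>, 0 for <PAD>) and the batch size of 3 are assumptions.

import torch

single_decoder_input = [1, 0, 0, 0]                # hypothetical [<SOS>, <PAD>, <PAD>, <PAD>]
tensor_decoder_input = torch.tensor(single_decoder_input)

batch_decoder_input = tensor_decoder_input.unsqueeze(0).repeat(3, 1)  # batch_size = 3
print(batch_decoder_input.shape)  # torch.Size([3, 4])
print(batch_decoder_input)        # each row starts with <SOS> followed by padding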
@@ -1,19 +1,16 @@
 def truncate_sequence(
-    sequence: list[int], truncate_at: int, end_token: int, add_ending: bool
+    sequence: list[int], truncate_at: int, end_token: int
 ) -> list[int]:
 
     if len(sequence) < truncate_at - 1:
-        if add_ending:
         sequence.append(end_token)
         return sequence
 
     if len(sequence) < truncate_at:
-        if add_ending:
         sequence[-1] = end_token
         return sequence
 
     TRUNCATED_SEQUENCE = sequence[:truncate_at]
-    if add_ending:
     TRUNCATED_SEQUENCE[-1] = end_token
 
     return TRUNCATED_SEQUENCE
@@ -51,9 +48,8 @@ def normalize_sequence(
     max_length: int,
     pad_token: int,
     end_token: int,
-    add_ending: bool = True
 ) -> tuple[list[int], list[bool]]:
-    new_sequence = truncate_sequence(sequence, max_length, end_token, add_ending)
+    new_sequence = truncate_sequence(sequence, max_length, end_token)
     new_sequence = pad_sequence(new_sequence, max_length, pad_token)
     PADDING_MASK = create_padding_mask(new_sequence, pad_token)
 
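A small worked example of the simplified truncate_sequence, with the function body copied from the new side of the hunk above; the token ids and the end-token id 99 are hypothetical.

def truncate_sequence(sequence: list[int], truncate_at: int, end_token: int) -> list[int]:

    if len(sequence) < truncate_at - 1:
        sequence.append(end_token)
        return sequence

    if len(sequence) < truncate_at:
        sequence[-1] = end_token
        return sequence

    TRUNCATED_SEQUENCE = sequence[:truncate_at]
    TRUNCATED_SEQUENCE[-1] = end_token

    return TRUNCATED_SEQUENCE


print(truncate_sequence([1, 2], 4, 99))           # [1, 2, 99]      short: end token appended
print(truncate_sequence([1, 2, 3], 4, 99))        # [1, 2, 99]      exactly max - 1: last id overwritten
print(truncate_sequence([1, 2, 3, 4, 5], 4, 99))  # [1, 2, 3, 99]   long: cut to 4, end token forced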
@@ -1,7 +1,6 @@
 from enum import Enum, auto
 
 class TaskType(Enum):
-    TEXT2RDF = auto()
     RDF2TEXT = auto()
     MASK = auto()
     COMPLETATION = auto()
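A tiny sketch of dispatching on the trimmed TaskType enum above; the describe helper and its strings are hypothetical and only illustrate how the remaining members might be branched on.

from enum import Enum, auto

class TaskType(Enum):
    RDF2TEXT = auto()
    MASK = auto()
    COMPLETATION = auto()

def describe(task: TaskType) -> str:  # hypothetical helper, not part of the repository
    if task is TaskType.RDF2TEXT:
        return "RDF tokens in, text out"
    if task is TaskType.MASK:
        return "recover masked tokens"
    return "complete a partial RDF sequence"

print(describe(TaskType.RDF2TEXT))  # RDF tokens in, text out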
@@ -27,6 +27,7 @@ def truncate_rdf_list(
         END_OF_TRIPLES.append(i + 1)
 
     TRIPLES_TOKENS: list[int] = []
+    TARGET_TRIPLES: list[int] = []
 
     start_of_triple = 0
     exit_loop = False
@@ -55,10 +56,10 @@ def truncate_rdf_list(
         EOT = END_OF_TRIPLES.popleft()
 
         TRIPLE = sequence[start_of_triple:EOT]
-        TRIPLES_TOKENS.extend(TRIPLE)
+        TARGET_TRIPLES.extend(TRIPLE)
 
         start_of_triple = EOT
 
 
-    return (TRIPLES_TOKENS, TRIPLES_TOKENS)
+    return (TRIPLES_TOKENS, TARGET_TRIPLES)
 
@@ -1,9 +1,7 @@
 from .Classes import *
-from .Enums import *
 from .Utils import *
 from .Models import *
 
 from . import Classes
-from . import Enums
 from . import Utils
 from . import Models
@@ -1,6 +0,0 @@
-from enum import Enum, auto
-
-class ModelType(Enum):
-
-    ENCODER_ONLY = auto()
-    DECODER_ONLY = auto()
@@ -1,14 +0,0 @@
-from .model_utils import decompose_nano_socrates, create_standalone_model, train2inference
-from .ModelType import ModelType
-from .decode_batch import decode_batch
-from .metrics import precision, recall, accuracy, f1, meteor, bleu, rouge, average, rdf2txt, txt2rdf, rdf_completion_1, rdf_completion_2, remove_padding, balance_paddings
-
-__all__ = [
-    "ModelType",
-    "decompose_nano_socrates",
-    "create_standalone_model",
-    "decode_batch",
-    "train2inference",
-    "precision", "recall", "accuracy", "f1", "meteor", "bleu", "rouge", "average",
-    "rdf2txt", "txt2rdf", "rdf_completion_1", "rdf_completion_2", "remove_padding", "balance_paddings"
-]
@@ -1,16 +0,0 @@
-import torch
-import Project_Model.Libs.BPE as BPE
-
-def decode_batch(batch: torch.Tensor, tokenizer: BPE.TokeNanoCore ,uknonw_token: int) -> list[str]:
-
-    strings = []
-
-    BATCH, _ = batch.shape
-
-    for i in range(0, BATCH):
-
-        tokens: list[int] = batch.tolist()[i]
-        tokens = list(map(lambda x: uknonw_token if x > tokenizer.vocabulary_size else x, tokens))
-        strings.append(tokenizer.decode(tokens))
-
-    return strings
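A sketch of the removed decode_batch logic with a stand-in tokenizer, since BPE.TokeNanoCore is not redefined here; the stub vocabulary, the token ids, and the unknown-token id 9 are all hypothetical, and the out-of-vocabulary clamp mirrors the map/lambda line above.

import torch

class StubTokenizer:  # hypothetical stand-in for BPE.TokeNanoCore
    vocabulary_size = 5

    def decode(self, tokens: list[int]) -> str:
        table = {0: "<PAD>", 1: "<SOS>", 2: "<END>", 3: "hello", 4: "world", 9: "<UNK>"}
        return " ".join(table.get(t, "<UNK>") for t in tokens)

def decode_batch(batch: torch.Tensor, tokenizer, uknonw_token: int) -> list[str]:
    strings = []
    BATCH, _ = batch.shape
    for i in range(0, BATCH):
        tokens = batch.tolist()[i]
        # clamp out-of-vocabulary ids to the unknown token before decoding
        tokens = [uknonw_token if x > tokenizer.vocabulary_size else x for x in tokens]
        strings.append(tokenizer.decode(tokens))
    return strings

batch = torch.tensor([[1, 3, 4, 2], [1, 3, 7, 2]])  # id 7 is out of vocabulary
print(decode_batch(batch, StubTokenizer(), uknonw_token=9))
# ['<SOS> hello world <END>', '<SOS> hello <UNK> <END>']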
@@ -1,100 +0,0 @@
-import evaluate as eval
-
-BLEU = eval.load("bleu")
-ROUGE = eval.load("rouge")
-METEOR = eval.load("meteor")
-
-def precision(ref: list[int], pred: list[int]):
-    metric = eval.load("precision")
-    return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)
-
-
-def recall(ref: list[int], pred: list[int]):
-    metric = eval.load("recall")
-    return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)
-
-
-def accuracy(ref: list[int], pred: list[int]):
-    metric = eval.load("accuracy")
-    return metric.compute(predictions=pred, references=ref)
-
-
-def meteor(ref: list[str], pred: list[str]):
-    metric = METEOR
-    return metric.compute(predictions=pred, references=ref)
-
-
-def bleu(ref: list[str], pred: list[str]):
-    metric = BLEU
-    return metric.compute(predictions=pred, references=ref)
-
-
-def rouge(ref: list[str], pred: list[str]):
-    metric = ROUGE
-    return metric.compute(predictions=pred, references=ref)
-
-
-def f1(precision: float, recall: float):
-    divisor = max((precision + recall), 1E-5)
-    return (2 * recall * precision) / divisor
-
-
-def average(array: list[float]):
-    return sum(array) / len(array)
-
-
-def rdf2txt(ref: list[str], pred: list[str]):
-
-    b_m = bleu(ref, pred)
-    r_m = rouge(ref, pred)
-    m_m = meteor(ref, pred)
-
-    return (b_m, r_m, m_m)
-
-def txt2rdf(ref: list[int], pred: list[int]):
-
-    p_m = precision(ref, pred)
-    r_m = recall(ref, pred)
-
-    return (p_m, r_m)
-
-def rdf_completion_1(ref: list[int], pred: list[int]):
-
-    a_m = accuracy(ref, pred)
-
-    return a_m
-
-
-def rdf_completion_2(ref: list[int], pred: list[int]):
-
-    p_m = precision(ref, pred)
-    r_m = recall(ref, pred)
-
-    return (p_m, r_m)
-
-
-def remove_padding(seq: list[int], pad_token: int, end_token: int):
-    clean_seq = list(filter(lambda x: x != pad_token, seq))
-
-    if clean_seq[-1] == end_token:
-        return clean_seq
-
-    clean_seq.append(
-        end_token
-    )
-
-    return clean_seq
-
-
-def balance_paddings(seq_1: list[int], seq_2: list[int], pad_token: int):
-    SEQ_1_LEN = len(seq_1)
-    SEQ_2_LEN = len(seq_2)
-
-    if SEQ_1_LEN > SEQ_2_LEN:
-        PAD = [pad_token] * (SEQ_1_LEN - SEQ_2_LEN)
-        seq_2.extend(PAD)
-
-    if SEQ_2_LEN > SEQ_1_LEN:
-        seq_2 = seq_2[:SEQ_1_LEN]
-
-    return (seq_1, seq_2)
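A worked example for the two pure-Python helpers of the removed metrics.py; f1 and average are copied verbatim, and the precision/recall numbers are made up.

def f1(precision: float, recall: float):
    divisor = max((precision + recall), 1E-5)
    return (2 * recall * precision) / divisor

def average(array: list[float]):
    return sum(array) / len(array)

print(f1(0.5, 0.25))               # 2 * 0.25 * 0.5 / 0.75 = 0.333...
print(f1(0.0, 0.0))                # divisor is clamped to 1e-5, so the result stays 0.0
print(average([0.5, 0.25, 0.75]))  # 0.5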
@@ -1,72 +0,0 @@
-import torch
-from Project_Model.Libs.Embedder import NanoSocratesEmbedder
-from Project_Model.Libs.Transformer import TrainingModel,NanoSocratesCore, NanoSocraDecoder, NanoSocratEncoder, DeToken, Encoder, Decoder
-from .ModelType import ModelType
-
-
-
-def decompose_nano_socrates(
-    model: TrainingModel | NanoSocratesCore , vocabulary_size: int, embedding_size: int
-) -> tuple[TrainingModel | NanoSocratesCore, NanoSocratEncoder, NanoSocraDecoder]:
-
-    encoder_pieces, decoder_pieces = model.take_pieces()
-    encoder_embedder, encoder, encoder_detokener = encoder_pieces
-    decoder_embedder, decoder, decoder_detokener = decoder_pieces
-
-    return (
-        model,
-        NanoSocratEncoder(encoder_embedder, encoder, encoder_detokener),
-        NanoSocraDecoder(decoder_embedder, decoder, decoder_detokener),
-    )
-
-def train2inference(
-    train_model: TrainingModel,
-    inference_model: NanoSocratesCore
-) -> NanoSocratesCore:
-
-    encoder_pieces, decoder_pieces = train_model.take_pieces()
-    enc_emb, encoder, enc_det = encoder_pieces
-    dec_emb, decoder, dec_det = decoder_pieces
-    inference_model.load_pieces(
-        enc_emb,
-        dec_emb,
-        encoder,
-        decoder,
-        enc_det,
-        dec_det
-    )
-
-    return inference_model
-
-
-
-def create_standalone_model(
-    model_type: ModelType,
-    vocabulary_size: int,
-    latent_space: int = 256,
-    feed_forward_multiplier: int = 4,
-    attention_heads: int = 4,
-    layer_number: int = 2,
-) -> NanoSocratEncoder | NanoSocraDecoder:
-
-    feed_forward_latent_space = latent_space * feed_forward_multiplier
-
-    embedder = NanoSocratesEmbedder(vocabulary_size, latent_space)
-    detokener = DeToken(latent_space, vocabulary_size)
-
-    if model_type == ModelType.ENCODER_ONLY:
-        TMP_ENCODERS = [
-            Encoder(latent_space, feed_forward_latent_space, attention_heads)
-        ] * layer_number
-
-        encoder = torch.nn.Sequential(*TMP_ENCODERS)
-
-        return NanoSocratEncoder(embedder, encoder, detokener)
-
-    TMP_DECODERS = [
-        Decoder(latent_space, feed_forward_latent_space, attention_heads)
-    ] * layer_number
-
-    decoder = torch.nn.Sequential(*TMP_DECODERS)
-
-    return NanoSocraDecoder(embedder, decoder, detokener)
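A side note on the list-replication pattern used by create_standalone_model above: in Python, [module] * layer_number repeats the same module object, so the resulting torch.nn.Sequential layers share one set of weights, whereas layers built in a comprehension are independent. The sketch uses torch.nn.Linear as a stand-in for the project's Encoder blocks.

import torch

shared = [torch.nn.Linear(4, 4)] * 2                     # the same module object, twice
independent = [torch.nn.Linear(4, 4) for _ in range(2)]  # two distinct modules

shared_seq = torch.nn.Sequential(*shared)
independent_seq = torch.nn.Sequential(*independent)

print(shared_seq[0] is shared_seq[1])            # True  -> weights are shared between "layers"
print(independent_seq[0] is independent_seq[1])  # False -> each layer has its own weights
print(sum(p.numel() for p in shared_seq.parameters()))       # 20 (the duplicate is counted once)
print(sum(p.numel() for p in independent_seq.parameters()))  # 40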
@@ -2,4 +2,3 @@ from . import BPE
 from . import Embedder
 from . import Transformer
 from . import TorchShims
-from . import TransformerUtils
6019
Project_Model/UML/model.excalidraw.json
Normal file
File diff suppressed because it is too large
BIN
environment.yaml
Binary file not shown.
BIN
requirements.txt
Binary file not shown.