In [None]:
import random
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims

# Reproducibility: give Python's and PyTorch's random generators the same
# fixed seed.
random.seed(0)
torch.manual_seed(0)

# Set a default device: use whatever torch_shims.get_default_device() returns
# as the default for tensors created from here on.
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# BPE init: build the tokenizer from the toy vocabulary file plus the
# special tokens returned by BPE.default_special_tokens().
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)

SPECIAL_VOC = BPE.default_special_tokens()
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)


# Constants
# Sizes handed to the model constructor and to sequence normalisation.
EMBEDDED_SIZE = 256
SENTENCE_LENGTH = 256
ATTENTION_HEADS = 4
FEED_FORWARD_MULTIPLIER = 4
NUMBER_OF_BLOCKS = 2

# Upper bound on the number of training epochs.
MAX_EPOCHS = 1_000

# One more than the vocabulary size reported by the tokenizer.
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1


# Token ids used for padding and for closing a sequence: the first id the
# tokenizer returns for each literal.
# NOTE(review): both literals are empty strings, so PAD_TOKEN and END_TOKEN
# come out identical. This looks like special-token markup that was lost
# when the notebook was exported -- confirm the intended literals.
PAD_TOKEN = TOKENANO.encode("")[0]
END_TOKEN = TOKENANO.encode("")[0]


# Load CSV
# Each row supplies an "RDFs" text (encoder side) and an "Abstract" text
# (the sequence to predict).
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)

# Parallel per-row lists, filled by the loop below.
TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []


for _, record in TOY_DATASET.iterrows():
    # Tokenise the three sequences for this row. The encoded abstract loses
    # its first token before it is used as the target.
    # NOTE(review): the decoder start literal is an empty string; this looks
    # like a special token lost in export -- confirm the intended literal.
    encoded_rdf = TOKENANO.encode(record["RDFs"])
    encoded_abstract = TOKENANO.encode(record["Abstract"])[1:]
    encoded_decoder_start = TOKENANO.encode("")

    # Bring every sequence to SENTENCE_LENGTH with the same pad/end ids.
    normalized = [
        Transformer.normalize_sequence(
            token_ids, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
        )
        for token_ids in (encoded_rdf, encoded_abstract, encoded_decoder_start)
    ]
    (encoder_ids, encoder_padding), (target_ids, _), (decoder_start_ids, _) = (
        normalized
    )

    # Only the encoder input keeps its padding mask.
    TOY_BATCH_INPUT_LIST.append(encoder_ids)
    TOY_BATCH_PADDING_LIST.append(encoder_padding)
    TOY_BATCH_TARGET_LIST.append(target_ids)
    TOY_BATCH_DECODER_DEFAULT.append(decoder_start_ids)

# Training loop
#
# Trains the model on the first example of the toy dataset only. Within an
# epoch the decoder input starts from the default sequence and is extended
# one position at a time with the model's own most probable token; each
# position contributes one optimisation step on that position's target.
LOSS_HISTORY = []  # one entry per epoch: loss of the last optimised position
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

NANOSOCRATES.train()  # training mode (activates dropout etc.)
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
# NOTE(review): StepLR's gamma defaults to 0.1, so even when stepped once per
# epoch the learning rate drops tenfold every 4 epochs -- confirm this decay
# is intended for a run of MAX_EPOCHS epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)

# These tensors never change between epochs, so build them once.
encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])
padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)
target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]])

last_loss = 0.0
current_epoch = 0

for current_epoch in range(1, MAX_EPOCHS + 1):

    # The decoder input is overwritten in place below, so it has to be
    # rebuilt from the default sequence at the start of every epoch.
    decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])
    last_loss = 0.0

    for i in range(SENTENCE_LENGTH):

        # forward
        logits: torch.Tensor = NANOSOCRATES(
            (encoder_list, padding_list, decoder_list)
        )

        step_logits = logits[:, i, :]  # [B, V]
        step_target = target_logits[:, i]  # [B]

        # With ignore_index and mean reduction, a step whose targets are all
        # PAD_TOKEN has no element to average over, so it is skipped instead
        # of being optimised on.
        if bool((step_target != PAD_TOKEN).any()):
            optimizer.zero_grad()
            loss = cross_entropy(step_logits, step_target)  # takes raw logits
            loss.backward()
            optimizer.step()
            # Detach so the reported loss does not keep the graph alive.
            last_loss = loss.detach()

        # Feed the most probable token back as the next decoder input.
        # Softmax is monotonic, so argmax over the raw logits picks the same
        # token as argmax over the probabilities.
        if i < SENTENCE_LENGTH - 1:
            decoder_list[:, i + 1] = step_logits.argmax(dim=-1)

    # Advance the schedule once per epoch. Stepping it after every token
    # position shrank the learning rate to effectively zero within the first
    # epoch, which stalled training.
    scheduler.step()

    last_loss = float(last_loss)
    LOSS_HISTORY.append(last_loss)
    print(f"EPOCH {current_epoch}\n\tLoss: {last_loss}")








EPOCH 1
	Loss: 9.174470901489258
EPOCH 2
	Loss: 9.20919132232666
EPOCH 3
	Loss: 9.227106094360352
EPOCH 4
	Loss: 9.172086715698242
EPOCH 5
	Loss: 9.180150985717773


KeyboardInterrupt: 