In [6]:
import random
import torch
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer

# Seed both random number generators so re-running this cell is reproducible.
random.seed(0)
torch.manual_seed(0)

# --- Demo configuration ------------------------------------------------------
# Source sentence (encoder side) and the decoder's initial prompt.
TEXT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>"
OUT_TEXT = "<START>"
# Number of identical rows used to fake a batch.
BATCH_LEN = 3

# --- Tokenizer ---------------------------------------------------------------
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Each special marker is encoded on its own and the first id of the result is kept.
PAD_TOKEN, END_TOKEN = (
    TOKENANO.encode(marker)[0] for marker in ("<PAD>", "<END>")
)

# --- Token sequences ---------------------------------------------------------
ENCODER_INPUT = TOKENANO.encode(TEXT)
DECODER_INPUT = TOKENANO.encode(OUT_TEXT)
# One slot more than the encoded source sentence
# (presumably room for the END token -- confirm against normalize_sequence).
MAX_LEN = 1 + len(ENCODER_INPUT)

# normalize_sequence yields a pair; for the encoder the second element is kept
# as the padding mask, for the decoder it is discarded.
EN_IN, PAD_MASK = Transformer.normalize_sequence(
    ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN
)
DEC_IN, _ = Transformer.normalize_sequence(
    DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN
)

# Every batch row is the very same sequence object, repeated BATCH_LEN times.
INPUT_TOKENIZATION = [EN_IN for _ in range(BATCH_LEN)]
OUTPUT_TOKENIZATION = [DEC_IN for _ in range(BATCH_LEN)]

print(INPUT_TOKENIZATION)

# --- Model hyperparameters ---------------------------------------------------
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size
EMBEDDED_SIZE = 256
FEED_FORWARD_DIM = EMBEDDED_SIZE * 4
# Depth of both the encoder stack and the decoder stack.
NUM_LAYERS = 6
# Third constructor argument of every Encoder/Decoder layer
# (presumably the number of attention heads -- TODO confirm against Transformer).
NUM_HEADS = 4

# --- Model construction ------------------------------------------------------
# NOTE: the statement order below (embedder, encoder layers, decoder layers) is
# kept exactly as before so that the seeded weight initialisation is unchanged.
EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)

# Embed the batched encoder-side token ids.
encoder_tensor: torch.Tensor = EMBEDDER(INPUT_TOKENIZATION)
ENCODER = torch.nn.Sequential(
    *(
        Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, NUM_HEADS)
        for _ in range(NUM_LAYERS)
    )
)

# The same embedder instance is reused for the decoder-side token ids.
decoder_tensor: torch.Tensor = EMBEDDER(OUTPUT_TOKENIZATION)
DECODER = torch.nn.Sequential(
    *(
        Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, NUM_HEADS)
        for _ in range(NUM_LAYERS)
    )
)

# --- Inspect the embedded encoder input --------------------------------------
print(len(INPUT_TOKENIZATION))
print(f"Embedder Tensor: {encoder_tensor.shape}")
print(f"Values:\n{encoder_tensor}")

# Unpacking into three names also fails fast if the embedder output is not 3-D.
BATCH_SIZE, TOKENS, DIMENSIONS = encoder_tensor.shape
# Repeat the single-sequence padding mask once per batch row and turn it into
# a tensor. This rebinds PAD_MASK from the per-sequence value to the batched one.
PAD_MASK = torch.tensor([PAD_MASK] * BATCH_LEN)

# --- Forward pass ------------------------------------------------------------
# The encoder stack takes and returns a tuple; only its first element is kept.
encoder_out, _ = ENCODER((encoder_tensor, PAD_MASK))
tensor: torch.Tensor
# The encoder output is passed twice to the decoder stack; the last slot is None
# (presumably an optional mask -- confirm against Transformer.Decoder).
tensor, _, _, _ = DECODER((decoder_tensor, encoder_out, encoder_out, None))

# `tensor` is the output of the DECODER stack, so it is labelled as such
# (the label previously read "ENCODER Tensor").
print(f"DECODER Tensor: {tensor.shape}")
print(f"Values:\n{tensor}")

[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712]]
3
Embedder Tensor: torch.Size([3, 16, 256])
Values:
tensor([[[-0.6981,  0.0804, -2.1672,  ...,  0.3919,  0.3341,  1.0794],
         [ 2.5818, -0.2308,  0.6001,  ..., -0.0500, -0.0408, -0.9852],
         [-0.6967,  0.8109,  1.3108,  ...,  2.1693,  1.4143, -0.1236],
         ...,
         [ 2.1226,  2.5695, -1.6178,  ..., -0.0652, -0.0802,  0.1103],
         [ 0.8770, -2.4782,  0.8536,  ...,  2.0471, -1.5702,  0.7387],
         [ 1.4284, -0.4654,  0.1394,  ...,  1.6520,  0.6728,  1.3851]],

        [[-0.6981,  0.0804, -2.1672,  ...,  0.3919,  0.3341,  1.0794],
         [ 2.5818, -0.2308,  0.6001,  ..., -0.0500, -0.0408, -0.9852],
         [-0.6967,  0.8109,  1.3108,  ...,  2.1693,  1.4143, -0.1236],
         ...,
     