NanoSocrates/Playgrounds/nanosocrates-sanity-check.ipynb

158 lines
4.9 KiB
Plaintext
Raw Normal View History

2025-10-07 16:36:26 +02:00
{
"cells": [
{
"cell_type": "code",
2025-10-07 20:45:04 +02:00
"execution_count": null,
2025-10-07 16:36:26 +02:00
"id": "f5762da9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([3, 17, 7714])\n",
"torch.Size([3, 17])\n",
"tensor([[2034, 6523, 5406, 3985, 5406, 6523, 2034, 2034, 5745, 643, 5406, 7405,\n",
" 6523, 6230, 6419, 5745, 657],\n",
" [2458, 830, 5745, 5745, 5406, 3741, 2034, 5745, 6302, 6419, 5406, 2411,\n",
" 719, 830, 5745, 3189, 2775],\n",
" [2034, 5745, 5327, 4696, 6523, 643, 6419, 1671, 6302, 4406, 5745, 643,\n",
" 643, 1901, 1914, 1914, 719]])\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"\n",
"# Model Init\n",
"ENCODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"DECODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"DECODER = torch.nn.Sequential(\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"DETOKENER = Transformer.DeToken(\n",
" EMBEDDED_SIZE,\n",
" TOKEN_SPACE_SIZE\n",
")\n",
"\n",
"\n",
"# Data\n",
"TEXT = (\n",
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
")\n",
"OUT_TEXT = \"<START>\"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
"\n",
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"\n",
"BATCH_LEN = 3\n",
"\n",
"INPUT_TOKENIZATION = [\n",
" EN_IN\n",
"] * BATCH_LEN\n",
"OUTPUT_TOKENIZATION = [\n",
" DEC_IN\n",
"] * BATCH_LEN\n",
"\n",
"encoder_tensor_input = ENCODER_EMBEDDER(INPUT_TOKENIZATION)\n",
"encoder_padding_mask = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
"\n",
"encoder_output, _ = ENCODER((encoder_tensor_input, encoder_padding_mask))\n",
"\n",
"decoder_tensor_input = DECODER_EMBEDDER(OUTPUT_TOKENIZATION)\n",
"decoder_padding_mask = torch.tensor([[False] * MAX_LEN] * BATCH_LEN)\n",
"\n",
"decoder_output, _, _, _ = DECODER((decoder_tensor_input, encoder_output, encoder_output, None))\n",
"\n",
"logits: torch.Tensor = DETOKENER(decoder_output)\n",
"\n",
"print(logits.shape)\n",
"\n",
"# print(logits)\n",
"\n",
"most_probable_tokens = torch.argmax(logits, 2)\n",
"\n",
"print(most_probable_tokens.shape)\n",
"print(most_probable_tokens)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
2025-10-07 20:45:04 +02:00
"\n",
"\n",
2025-10-07 16:36:26 +02:00
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}