doctor and model test

commit 1de2cc59db (parent b805dc538e)
Author: GassiGiuseppe
Date: 2025-10-08 22:51:36 +02:00

13 changed files with 902 additions and 63 deletions


@@ -14,6 +14,6 @@ class DeToken(torch.nn.Module):
         x = self.__linear(x)
         # 2) Go to logits
-        x = torch.softmax(x, 2)
+        # x = torch.softmax(x, 2)
         return x
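
Note: with the softmax commented out, DeToken now returns raw logits. The usual reason for this (the training loss itself is not part of this hunk) is that torch.nn.CrossEntropyLoss applies log-softmax internally, so pre-normalized probabilities would be normalized twice; an explicit softmax is only needed when probabilities are actually wanted, e.g. at inference. A minimal sketch with made-up shapes:

import torch

logits = torch.randn(2, 3, 5)              # [B, T, V] raw scores, as DeToken now returns
targets = torch.randint(0, 5, (2, 3))      # [B, T] gold token ids

# CrossEntropyLoss expects raw logits, with the class dim moved to dim 1
loss = torch.nn.CrossEntropyLoss()(logits.permute(0, 2, 1), targets)

# softmax is still fine when probabilities are needed (inference)
probs = torch.softmax(logits, dim=-1)      # [B, T, V], rows sum to 1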


@@ -41,11 +41,12 @@ class Decoder(nn.Module):
             torch.Tensor,
             torch.Tensor,
             torch.Tensor,
+            torch.Tensor,
             torch.Tensor
         ]
     ):  # -> list[torch.Tensor]:  # k_x = v_x . While x_q = x
         # WARNING: args is needed to have sequential
-        x, k_x, v_x, padding_mask = args
+        x, k_x, v_x, padding_mask, encoder_padding_mask = args

         # build of attention mask
         attention_mask = get_causal_attention_mask(x.size(1))
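
Note: the decoder still builds its causal mask via get_causal_attention_mask, which is not part of this diff. A typical helper of this kind, assuming the PyTorch boolean convention where True means "do not attend" (an assumption, since the helper's body is not shown here):

import torch

def causal_attention_mask(size: int) -> torch.Tensor:
    # strictly upper-triangular True entries block attention to future positions
    return torch.triu(torch.ones(size, size, dtype=torch.bool), diagonal=1)

print(causal_attention_mask(4))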
@@ -68,7 +69,7 @@ class Decoder(nn.Module):
         # 5) Encoder-decoder (cross) attention
         CROSS_ATTENTION = self.__cross_attention(
-            x, k_x, v_x, key_padding_mask=padding_mask
+            x, k_x, v_x, key_padding_mask=encoder_padding_mask
         )
         # 6) Dropout

@@ -96,7 +97,7 @@ class Decoder(nn.Module):
         # 12) Layer Normalization
         x = self.__layer_norm_3(x)
-        return (x, k_x, v_x, padding_mask)
+        return (x, k_x, v_x, padding_mask, encoder_padding_mask)
         # use eval to disable dropout etc.
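
Note: the fix above passes the encoder's padding mask to cross-attention instead of the decoder's, which is the correct choice because in cross-attention the keys and values come from the encoder output. A sketch with torch.nn.MultiheadAttention (assuming __cross_attention wraps something with this interface, which the diff does not show; all sizes are made up):

import torch

B, S, T, E, H = 2, 6, 4, 16, 4                   # batch, encoder len, decoder len, embed, heads
cross_attn = torch.nn.MultiheadAttention(E, H, batch_first=True)

decoder_states = torch.randn(B, T, E)            # queries: decoder side
encoder_states = torch.randn(B, S, E)            # keys/values: encoder side
encoder_padding_mask = torch.zeros(B, S, dtype=torch.bool)
encoder_padding_mask[:, 4:] = True               # True = PAD position on the encoder side

# key_padding_mask must match the KEY sequence (the encoder), which is why
# the decoder's own padding mask was the wrong one to pass here
out, _ = cross_attn(
    decoder_states, encoder_states, encoder_states,
    key_padding_mask=encoder_padding_mask,
)                                                # out: [B, T, E]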


@@ -0,0 +1,23 @@
+import torch
+from NanoSocratesCore import NanoSocratesCore
+
+
+class NanoSocrates(torch.nn.Module):
+    def __init__(self,
+                 embedded_size: int,
+                 feed_forward_dim: int,
+                 encoder_layers: int,
+                 decoder_layers: int,
+                 attention_heads: int,
+                 vocab_size: int) -> None:
+        super().__init__()
+        self._model = NanoSocratesCore(
+            embedded_size,
+            feed_forward_dim,
+            encoder_layers,
+            decoder_layers,
+            attention_heads,
+            vocab_size)
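
Note: only the constructor is shown in this hunk; the rest of the new 23-line file is not visible here. For orientation, a hypothetical instantiation (all hyperparameter values are made up, not taken from the repository):

# assumes NanoSocrates is importable from the new module
model = NanoSocrates(
    embedded_size=128,
    feed_forward_dim=512,
    encoder_layers=2,
    decoder_layers=2,
    attention_heads=4,
    vocab_size=32000,
)
model.eval()  # inference wrapper; the core's forward is decorated with torch.no_grad()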


@@ -16,8 +16,11 @@ class NanoSocratesCore(torch.nn.Module):
         num_encoder_layers: int = 2,
         num_decoder_layers: int = 2,
         num_attention_heads: int = 4,
+        pad_token: int = 0,
     ) -> None:
         super().__init__()
+        self.__pad_token = pad_token
         feed_forward_dim = embedding_size * feed_forward_multiplier
         self.__sentence_length = sentence_length
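
Note: the new pad_token argument lets the model derive boolean padding masks directly from token ids, following the "True where PAD" convention used below. A small standalone sketch (hypothetical token ids):

import torch

pad_token = 0                                   # matches the new default
token_ids = torch.tensor([[5, 7, 2, 0, 0],
                          [3, 9, 0, 0, 0]])     # [B, S]
padding_mask = token_ids.eq(pad_token)          # [B, S] bool, True where PAD
print(padding_mask)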
@@ -43,69 +46,64 @@ class NanoSocratesCore(torch.nn.Module):
         self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
         self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)

+    @torch.no_grad()  # inference only
     def forward(
         self,
         encoder_input: list[list[int]],
-        decoder_input: list[list[int]],
-        encoder_padding_mask: list[list[int]],
+        decoder_input: list[list[int]],  # must start with <SOS> and PAD elsewhere
+        encoder_padding_mask: list[list[bool]],  # True where encoder is PAD
     ):
         if len(encoder_padding_mask) != len(encoder_input):
             raise Exception("Mismatch in received_dimensions")
         # TODO: check for tensor in input to embedder

         # 1) Embed User-Input for encoders
-        ENCODER_INPUT = self.__input_embeder(encoder_input)
+        ENCODER_INPUT = self.__input_embeder(encoder_input)  # [B,S,E]

         # 2) Encode User-Input
-        ENCODER_OUTPUT, _ = self.__encoder_sequence(ENCODER_INPUT, encoder_padding_mask)
+        ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence(
+            (ENCODER_INPUT, encoder_padding_mask)  # as tuple
+        )  # [B,S,E], [B,S]
         del ENCODER_INPUT

-        exit_loop = False
-        decoder_token_list = decoder_input[:]
+        # 3) Autoregressive Output (greedy)
+        LOGITS_HISTORY: list[torch.Tensor] = []  # keep per-step distributions
+        decoder_token_list = [row[:] for row in decoder_input]  # copy tokens
         decoder_phase = 0
-        LOGITS_HISTORY: list[torch.Tensor] = []
-        # 3) Autoregressive Output
+        exit_loop = False
         while not exit_loop:
-            # 3.0) Increment Counter
-            decoder_phase += 1
-            # 3.1) Embed Decoder Input
-            decoder_input = self.__output_embedder(decoder_token_list)
-            # 3.2) Decode Decoder Input
+            decoder_phase += 1  # move to next position
+            # 3.1) Build decoder key padding mask from current tokens (True where PAD)
+            DECODER_KEY_PADDING_MASK: list[list[bool]] = [
+                [tok == self.__pad_token for tok in row] for row in decoder_token_list
+            ]  # [B,T]
+            # 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside)
+            DECODER_INPUT = self.__output_embedder(decoder_token_list)  # [B,T,E]
+            # 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks)
             DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
-                decoder_input, ENCODER_OUTPUT, ENCODER_OUTPUT
-            )
-            # 3.3) Go back to Token space
-            # TODO: change name
-            LOGITS = self.__linear(DECODER_OUTPUT)
+                (DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT,
+                 DECODER_KEY_PADDING_MASK, encoder_padding_mask)
+            )  # [B,T,E]
+            del DECODER_INPUT
+            # 3.4) Project to token space
+            LOGITS = self.__linear(DECODER_OUTPUT)  # [B,T,V]
             del DECODER_OUTPUT
-            # 3.4) Transform in probabilities
-            # TODO: change name
-            TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)
+            # 3.5) Probabilities and greedy pick at current step
+            TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)  # [B,T,V]
             del LOGITS
-            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)
-            # 3.5) Take most probable tokens
-            TOKEN_IDS = torch.argmax(TOKEN_PROBABILITIES, -1)
-            # TODO: check for dimensions and for efficiency
-            DECODER_TOKEN_TENSOR = torch.tensor(decoder_token_list)
-            DECODER_TOKEN_TENSOR[:, decoder_phase] = TOKEN_IDS
-            decoder_token_list = DECODER_TOKEN_TENSOR.tolist()
-            del TOKEN_IDS
-            del DECODER_TOKEN_TENSOR
-            # 3.6) Check if we generated all tokens
+            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)  # store for this step
+            step_idx = decoder_phase - 1  # 0-based
+            TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist()  # [B] -> list[int]
+            # 3.6) Write prediction into next slot (the slot is PAD)
+            if step_idx + 1 < self.__sentence_length:
+                for b, tok in enumerate(TOKEN_IDS):
+                    decoder_token_list[b][step_idx + 1] = tok  # feed next position
+            # 3.7) Stop when we filled the sequence
             if decoder_phase == self.__sentence_length - 1:
                 exit_loop = True
-        return LOGITS_HISTORY
+        return LOGITS_HISTORY  # list of [B,T,V] (per step)
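
Note: the rewritten loop is plain greedy decoding over a fixed-length, PAD-filled buffer: embed the current tokens, run the decoder with causal plus PAD masks, take the argmax at the current position, and write it into the next PAD slot. The same pattern, reduced to a standalone sketch with a generic step function standing in for embed -> decoder stack -> linear -> softmax (everything here is hypothetical, independent of the repository's classes):

import torch

def greedy_decode(step_fn, sos_id: int, pad_id: int, batch: int, max_len: int):
    tokens = torch.full((batch, max_len), pad_id, dtype=torch.long)
    tokens[:, 0] = sos_id                            # position 0 holds <SOS>
    for step in range(max_len - 1):                  # fill positions 1 .. max_len-1
        probs = step_fn(tokens)                      # [B, T, V] per-position distributions
        next_ids = probs[:, step, :].argmax(dim=-1)  # greedy pick at the current step
        tokens[:, step + 1] = next_ids               # feed it into the next PAD slot
    return tokens

# toy step_fn: a uniform random "model" over a vocab of 10
toy = lambda t: torch.softmax(torch.randn(t.size(0), t.size(1), 10), dim=-1)
print(greedy_decode(toy, sos_id=1, pad_id=0, batch=2, max_len=5))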


@@ -24,32 +24,49 @@ class TrainingModel(torch.nn.Module):
             vocabulary_size, latent_space
         )
-        TMP_ENCODERS = [
+        # do NOT share layer weights
+        enc_layers = [
             Encoder(latent_space, feed_forward_latent_space, attention_heads)
-        ] * layer_number
-        TMP_DECODERS = [
+            for _ in range(layer_number)
+        ]
+        dec_layers = [
             Decoder(latent_space, feed_forward_latent_space, attention_heads)
-        ] * layer_number
-        self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
-        self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
+            for _ in range(layer_number)
+        ]
+        self.__encoder = torch.nn.Sequential(*enc_layers)
+        self.__decoder = torch.nn.Sequential(*dec_layers)
         self.__detokener = DeToken(latent_space, vocabulary_size)
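
Note on the layer fix above: [Encoder(...)] * layer_number repeats the same module object, so every "layer" in the old Sequential shared one set of weights; the list comprehension builds independent layers. The difference in miniature, using nn.Linear for illustration:

import torch

shared = torch.nn.Sequential(*([torch.nn.Linear(4, 4)] * 3))
independent = torch.nn.Sequential(*[torch.nn.Linear(4, 4) for _ in range(3)])

# parameters() de-duplicates shared tensors: 2 (one weight + one bias) vs 6
print(len(list(shared.parameters())), len(list(independent.parameters())))
print(shared[0] is shared[1], independent[0] is independent[1])  # True, False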
-    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
-        encoder_embedder_input, padding_tensor, decoder_embedder_input = args
+    def forward(
+        self,
+        args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
+    ):
+        # returns logits for the LAST decoder position only -> [B, V]
+        (
+            encoder_embedder_input,   # [B,S] encoder tokens
+            encoder_padding_mask,     # [B,S] True where encoder is PAD
+            decoder_embedder_prefix,  # [B,Tp] decoder prefix (e.g., <SOS> + tokens so far)
+            decoder_padding_mask,     # [B,Tp] True where decoder prefix has PAD
+        ) = args

-        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
-        decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
+        # 1) embeddings
+        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)   # [B,S,E]
+        decoder_tensor = self.__decoder_embedder(decoder_embedder_prefix)  # [B,Tp,E]

-        encoder_output, _ = self.__encoder((encoder_tensor, padding_tensor))
+        # 2) encode
+        encoder_output, _ = self.__encoder((encoder_tensor, encoder_padding_mask))  # [B,S,E], [B,S]

-        decoder_output, _, _, _ = self.__decoder(
-            (decoder_tensor, encoder_tensor, encoder_tensor, None)
-        )
+        # 3) decode (causal mask is built inside the decoder)
+        decoder_output, _, _, _, _ = self.__decoder(
+            (decoder_tensor, encoder_output, encoder_output,
+             decoder_padding_mask, encoder_padding_mask)
+        )  # [B,Tp,E], ...

-        logits: torch.Tensor = self.__detokener(decoder_output)
-        return logits
+        # 4) project only the last time step
+        last_hidden = decoder_output[:, -1:, :]      # [B,1,E]
+        step_logits = self.__detokener(last_hidden)  # [B,1,V]
+        step_logits = step_logits[:, -1, :]          # [B,V]
+        return step_logits  # logits for one token
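
Note: since forward now returns logits for the last prefix position only ([B, V]), a training step can feed a teacher-forced prefix and score the single next token with cross-entropy. A sketch of that pairing (shapes and names are assumptions; the actual training loop is not part of this commit):

import torch

B, V = 8, 32000                                        # hypothetical batch and vocab sizes
step_logits = torch.randn(B, V, requires_grad=True)    # stand-in for what forward() returns
next_token = torch.randint(0, V, (B,))                 # gold token that follows the prefix

loss = torch.nn.functional.cross_entropy(step_logits, next_token)
loss.backward()                                        # gradients flow through the one-step logits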