doctor and model test
@@ -14,6 +14,6 @@ class DeToken(torch.nn.Module):
         x = self.__linear(x)

         # 2) Go to logits
-        x = torch.softmax(x, 2)
+        # x = torch.softmax(x, 2)

         return x
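
Note on the DeToken change: with the softmax commented out, the module now returns raw logits, which is what losses such as torch.nn.CrossEntropyLoss expect, since they apply log-softmax internally. A minimal sketch with made-up shapes of how those logits would be consumed for training:

import torch

B, S, V = 2, 4, 10                       # assumed batch, sequence, vocab sizes
logits = torch.randn(B, S, V)            # what DeToken now returns (no softmax)
targets = torch.randint(0, V, (B, S))    # gold token ids

# CrossEntropyLoss wants the class dimension second: [B, V, S] against targets [B, S]
loss = torch.nn.CrossEntropyLoss()(logits.transpose(1, 2), targets)
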
@@ -41,11 +41,12 @@ class Decoder(nn.Module):
             torch.Tensor,
             torch.Tensor,
             torch.Tensor,
+            torch.Tensor,
             torch.Tensor
         ]
     ): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
         # WARNING: args is needed to have sequential
-        x, k_x, v_x, padding_mask = args
+        x, k_x, v_x, padding_mask, encoder_padding_mask = args

         # build of attention mask
         attention_mask = get_causal_attention_mask(x.size(1))
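
The WARNING comment refers to torch.nn.Sequential forwarding only one positional argument between modules, so each decoder layer takes and returns a single tuple that now also carries encoder_padding_mask. A sketch of the general pattern (TupleLayer is a stand-in, not the project's Decoder):

import torch
from torch import nn

class TupleLayer(nn.Module):
    def __init__(self, dim: int) -> None:
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, args):
        # unpack, transform x, repack so the next layer in the Sequential gets the same tuple
        x, k_x, v_x, padding_mask, encoder_padding_mask = args
        return (self.proj(x), k_x, v_x, padding_mask, encoder_padding_mask)

stack = nn.Sequential(*[TupleLayer(8) for _ in range(3)])
x, mem = torch.randn(2, 5, 8), torch.randn(2, 7, 8)
pad = torch.zeros(2, 5, dtype=torch.bool)
enc_pad = torch.zeros(2, 7, dtype=torch.bool)
out, *_ = stack((x, mem, mem, pad, enc_pad))
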
@@ -68,7 +69,7 @@ class Decoder(nn.Module):

         # 5) Encoder–decoder (cross) attention
         CROSS_ATTENTION = self.__cross_attention(
-            x, k_x, v_x, key_padding_mask=padding_mask
+            x, k_x, v_x, key_padding_mask=encoder_padding_mask
         )

         # 6) Dropout
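
If __cross_attention wraps torch.nn.MultiheadAttention (an assumption, it is not shown in this diff), key_padding_mask must describe the key/value sequence, i.e. the encoder output, with True marking positions to ignore, which is why the encoder's mask is passed here instead of the decoder's own padding_mask. A hedged sketch, assuming batch_first=True and arbitrary sizes:

import torch
from torch import nn

B, T, S, E = 2, 3, 5, 8                              # assumed sizes
mha = nn.MultiheadAttention(E, num_heads=2, batch_first=True)
q = torch.randn(B, T, E)                             # decoder states (queries)
kv = torch.randn(B, S, E)                            # encoder output (keys/values)

encoder_padding_mask = torch.zeros(B, S, dtype=torch.bool)
encoder_padding_mask[:, -1] = True                   # pretend the last encoder slot is PAD

out, _ = mha(q, kv, kv, key_padding_mask=encoder_padding_mask)
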
@@ -96,7 +97,7 @@ class Decoder(nn.Module):
         # 12) Layer Normalization
         x = self.__layer_norm_3(x)

-        return (x, k_x, v_x, padding_mask)
+        return (x, k_x, v_x, padding_mask, encoder_padding_mask)


     # use eval to disable dropout etc.
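
On the "use eval to disable dropout etc." note: calling model.eval() switches dropout (and normalization statistics, if any) to inference behaviour, and wrapping the call in torch.no_grad() skips autograd bookkeeping. A minimal, generic sketch, not tied to this repository's classes:

import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Dropout(0.1))  # toy model
model.eval()                       # dropout becomes a no-op
with torch.no_grad():              # no gradient graph during inference
    y = model(torch.randn(2, 8))
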
Project_Model/Libs/Transformer/Classes/NanoSocrates.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+import torch
+from NanoSocratesCore import NanoSocratesCore
+
+
+class NanoSocrates(torch.nn.Module):
+
+    def __init__(self,
+                 embedded_size: int,
+                 feed_forward_dim: int,
+                 encoder_layers: int,
+                 decoder_layers: int,
+                 attention_heads: int,
+                 vocab_size: int) -> None:
+
+        super().__init__()
+
+        self._model = NanoSocratesCore(
+            embedded_size,
+            feed_forward_dim,
+            encoder_layers,
+            decoder_layers,
+            attention_heads,
+            vocab_size)
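
For context, the new wrapper would be constructed roughly like this; the hyperparameter values are placeholders and the import path is an assumption that depends on how Project_Model/Libs is exposed:

from NanoSocrates import NanoSocrates  # assumed import path

model = NanoSocrates(
    embedded_size=128,
    feed_forward_dim=512,
    encoder_layers=2,
    decoder_layers=2,
    attention_heads=4,
    vocab_size=1000,
)
model.eval()  # inference-only use; the core forward below is wrapped in @torch.no_grad()
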
@@ -16,8 +16,11 @@ class NanoSocratesCore(torch.nn.Module):
         num_encoder_layers: int = 2,
         num_decoder_layers: int = 2,
         num_attention_heads: int = 4,
+        pad_token: int = 0,
     ) -> None:

         super().__init__()
+        self.__pad_token = pad_token
+        feed_forward_dim = embedding_size * feed_forward_multiplier

         self.__sentence_length = sentence_length
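
The new pad_token default ties into the padding masks the forward pass expects, where True marks PAD positions. A small sketch of how an encoder_padding_mask in the expected list[list[bool]] form could be built from token ids (the ids themselves are illustrative):

PAD_TOKEN = 0  # matches the new default

encoder_input = [
    [5, 9, 2, PAD_TOKEN, PAD_TOKEN],
    [7, 3, PAD_TOKEN, PAD_TOKEN, PAD_TOKEN],
]
encoder_padding_mask = [[tok == PAD_TOKEN for tok in row] for row in encoder_input]
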
@@ -43,69 +46,64 @@ class NanoSocratesCore(torch.nn.Module):
         self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
         self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)

+    @torch.no_grad()  # inference only
     def forward(
         self,
         encoder_input: list[list[int]],
-        decoder_input: list[list[int]],
-        encoder_padding_mask: list[list[int]],
+        decoder_input: list[list[int]],  # must start with <SOS> and PAD elsewhere
+        encoder_padding_mask: list[list[bool]],  # True where encoder is PAD
     ):

         if len(encoder_padding_mask) != len(encoder_input):
             raise Exception("Mismatch in received_dimensions")

-        # TODO: check for tensor in input to embedder
         # 1) Embed User-Input for encoders
-        ENCODER_INPUT = self.__input_embeder(encoder_input)
+        ENCODER_INPUT = self.__input_embeder(encoder_input)  # [B,S,E]

         # 2) Encode User-Input
-        ENCODER_OUTPUT, _ = self.__encoder_sequence(ENCODER_INPUT, encoder_padding_mask)
+        ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence(
+            (ENCODER_INPUT, encoder_padding_mask)  # as tuple
+        )  # [B,S,E], [B,S]
         del ENCODER_INPUT

-        exit_loop = False
-        decoder_token_list = decoder_input[:]
+        # 3) Autoregressive Output (greedy)
+        LOGITS_HISTORY: list[torch.Tensor] = []  # keep per-step distributions
+        decoder_token_list = [row[:] for row in decoder_input]  # copy tokens
         decoder_phase = 0
+        exit_loop = False

-        LOGITS_HISTORY: list[torch.Tensor] = []
-
-        # 3) Autoregressive Output
         while not exit_loop:
+            decoder_phase += 1  # move to next position

-            # 3.0) Increment Counter
-            decoder_phase += 1
+            # 3.1) Build decoder key padding mask from current tokens (True where PAD)
+            DECODER_KEY_PADDING_MASK: list[list[bool]] = [
+                [tok == self.__pad_token for tok in row] for row in decoder_token_list
+            ]  # [B,T]

-            # 3.1) Embed Decoder Input
-            decoder_input = self.__output_embedder(decoder_token_list)
+            # 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside)
+            DECODER_INPUT = self.__output_embedder(decoder_token_list)  # [B,T,E]

-            # 3.2) Decode Decoder Input
+            # 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks)
             DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
-                decoder_input, ENCODER_OUTPUT, ENCODER_OUTPUT
-            )
+                (DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT,
+                 DECODER_KEY_PADDING_MASK, encoder_padding_mask)
+            )  # [B,T,E]
+            del DECODER_INPUT

-            # 3.3) Go back to Token space
-            # TODO: change name
-            LOGITS = self.__linear(DECODER_OUTPUT)
+            # 3.4) Project to token space
+            LOGITS = self.__linear(DECODER_OUTPUT)  # [B,T,V]
             del DECODER_OUTPUT

-            # 3.4) Transform in probabilities
-            # TODO: change name
-            TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)
-            del LOGITS
+            # 3.5) Probabilities and greedy pick at current step
+            TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)  # [B,T,V]
+            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)  # store for this step

-            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)
+            step_idx = decoder_phase - 1  # 0-based
+            TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist()  # [B] -> list[int]

-            # 3.5) Take most probable tokens
-            TOKEN_IDS = torch.argmax(TOKEN_PROBABILITIES, -1)
+            # 3.6) Write prediction into next slot (the slot is PAD)
+            if step_idx + 1 < self.__sentence_length:
+                for b, tok in enumerate(TOKEN_IDS):
+                    decoder_token_list[b][step_idx + 1] = tok  # feed next position

-            # TODO: check for dimensions and for efficiency
-            DECODER_TOKEN_TENSOR = torch.tensor(decoder_token_list)
-            DECODER_TOKEN_TENSOR[:, decoder_phase] = TOKEN_IDS
-            decoder_token_list = DECODER_TOKEN_TENSOR.tolist()
-
-            del TOKEN_IDS
-            del DECODER_TOKEN_TENSOR
-
-            # 3.6) Check if we generated all tokens
+            # 3.7) Stop when we filled the sequence
             if decoder_phase == self.__sentence_length - 1:
                 exit_loop = True

-        return LOGITS_HISTORY
+        return LOGITS_HISTORY  # list of [B,T,V] (per step)
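
To make the greedy loop above easier to follow, here is the per-step pick in isolation, with toy tensors; shapes follow the [B,T,V] comments in the diff and the token ids are illustrative:

import torch

B, T, V = 2, 6, 10
probs = torch.softmax(torch.randn(B, T, V), dim=-1)        # stand-in for TOKEN_PROBABILITIES

SOS, PAD = 1, 0                                            # illustrative token ids
decoder_token_list = [[SOS] + [PAD] * (T - 1) for _ in range(B)]

step_idx = 0                                               # first step reads the <SOS> position
token_ids = probs[:, step_idx, :].argmax(dim=-1).tolist()  # greedy pick, one id per batch row
if step_idx + 1 < T:
    for b, tok in enumerate(token_ids):
        decoder_token_list[b][step_idx + 1] = tok          # write into the next PAD slot
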