added first classes (Encoder, Decoder, Attention) for the model

This commit is contained in:
GassiGiuseppe 2025-10-04 21:07:58 +02:00
parent 9b656e7918
commit 76200d936d
5 changed files with 147 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
import torch
import torch.nn as nn

from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Decoder(nn.Module):
    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, k_x, v_x, attention_mask):
        # k_x and v_x are the encoder output (keys = values); the query is x
        # 1) Masked self-attention with residual connection
        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask=attention_mask))
        x = self.norm1(x)
        # 2) Encoder-decoder (cross) attention with residual connection
        x = x + self.dropout(self.attention(x, k_x, v_x))
        x = self.norm2(x)
        # 3) Position-wise feed-forward with residual connection
        x = x + self.dropout(self.ffn(x))
        x = self.norm3(x)
        return x
# use model.eval() to disable dropout etc. at inference time
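A minimal usage sketch for this Decoder block (not part of the commit; the module path Transformer.decoder and all tensor sizes below are assumptions):

import torch
from Transformer.decoder import Decoder  # assumed module path

d_model, d_ff, heads = 512, 2048, 8                     # illustrative sizes
decoder = Decoder(d_model, d_ff, heads)
enc_out = torch.randn(2, 10, d_model)                   # encoder output: (batch, src_len, d_model)
tgt = torch.randn(2, 7, d_model)                        # target embeddings: (batch, tgt_len, d_model)
causal = torch.triu(torch.ones(7, 7, dtype=torch.bool), diagonal=1)  # True = position is masked
out = decoder(tgt, enc_out, enc_out, attention_mask=causal)          # -> (2, 7, d_model)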

View File

@@ -0,0 +1,32 @@
import torch
import torch.nn as nn

from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Encoder(nn.Module):  # subclassing nn.Module exposes its training primitives (parameters, train/eval, ...)
    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)  # norm of the first "Add & Normalize"
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)  # norm of the second "Add & Normalize"
        self.dropout = nn.Dropout(0.1)      # shared dropout for both residual branches

    def forward(self, x):
        # -> ATTENTION -> dropout -> add & normalize -> FF -> dropout -> add & normalize ->
        # Self-attention with residual connection [input + self-attention]
        x = x + self.dropout(self.attention(x, x, x))
        x = self.norm1(x)
        # Feed-forward with residual connection [normed self-attention + ff]
        x = x + self.dropout(self.ffn(x))
        x = self.norm2(x)
        return x
# use model.eval() to disable dropout etc. at inference time
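A matching sketch for the Encoder (again not part of the commit; the module path Transformer.encoder and the sizes are assumptions):

import torch
from Transformer.encoder import Encoder  # assumed module path

encoder = Encoder(d_model=512, d_ff=2048, attention_heads=8)
src = torch.randn(2, 10, 512)        # (batch, src_len, d_model), batch_first layout
memory = encoder(src)                # same shape as the input: (2, 10, 512)
encoder.eval()                       # disables dropout for inference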

View File

@@ -0,0 +1,18 @@
# It is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
import torch
import torch.nn as nn


class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to a higher dimension
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)        # active during training; model.eval() disables it
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to the model dimension

    def forward(self, x):
        # -> Linear 1 -> ReLU -> (dropout during training) -> Linear 2 ->
        return self.fc2(self.dropout(self.activation(self.fc1(x))))
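A quick shape check illustrating the position-wise behaviour (sizes are illustrative): nn.Linear acts only on the last dimension, so every position in the sequence is transformed independently and the output keeps the input shape.

import torch
from Transformer.feed_forward_nn import FeedForwardNetwork

ffn = FeedForwardNetwork(d_model=512, d_ff=2048)
x = torch.randn(2, 10, 512)          # (batch, seq_len, d_model)
y = ffn(x)                           # each position is transformed independently
assert y.shape == x.shape            # (2, 10, 512)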

View File

@@ -0,0 +1,20 @@
# multi-head attention -> (then to) feed-forward
# attention: q, k, v -> score = q*k^T -> divide by sqrt(d) -> softmax
# multi-head -> Q, K, V are different in each head (built by X*[WQ/WK/WV])
# z = softmax(Q*K'/sqrt(d)) * V
# recombine the heads into Z: 1) concatenate; 2) [z0 z1 z2 z3 z4] * W = Z
# padding tokens will be handled later
########################
# WIP
########################
import torch
import torch.nn as nn

embed_dim = 256
num_heads = 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)


class MultiheadAttention:
    def __init__(self, num_heads: int = 8) -> None:
        pass  # WIP: to be implemented; the model uses TorchMultiHeadAttention in the meantime
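A single-head sketch of the formula in the comments above, z = softmax(Q K^T / sqrt(d)) V (the helper name and shapes are illustrative, not part of the commit):

import math
import torch

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, seq_len, d_k); mask is True where attention is NOT allowed
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))   # (batch, L_q, L_k)
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v                   # (batch, L_q, d_k)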

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn


class TorchMultiHeadAttention(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        attention_mask=None,
        key_padding_mask=None,
    ) -> torch.Tensor:
        # nn.MultiheadAttention applies the projections internally:
        # x_q * Wq -> Q, x_k * Wk -> K, x_v * Wv -> V
        y, _ = self.attention(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
        return y

# batch_first=False (the historical default)
#   format: (L, N, E)
#   L = sequence length (time steps / positions)
#   N = batch size
#   E = d_model (embedding) dimension
# batch_first=True
#   format: (N, L, E) (more natural for many models)
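A usage sketch for the batch_first=True layout and the key_padding_mask argument described above (sizes are illustrative, not part of the commit):

import torch

mha = TorchMultiHeadAttention(embed_dim=256, num_heads=8, dropout=0.1)
x = torch.randn(4, 12, 256)                     # (N, L, E) = (batch, seq_len, d_model)
padding = torch.zeros(4, 12, dtype=torch.bool)  # key_padding_mask: True marks padding keys to ignore
padding[:, 10:] = True                          # e.g. the last two positions are padding
y = mha(x, x, x, key_padding_mask=padding)      # self-attention -> (4, 12, 256)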