added first classes (Encoder, Decoder, Attention) for the model
This commit is contained in:
parent 9b656e7918
commit 76200d936d
44
Project_Model/Libs/Transformer/decoder.py
Normal file
@@ -0,0 +1,44 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Decoder(nn.Module):

    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)

        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(0.1)

        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, k_x, v_x, attention_mask):  # k_x = v_x (encoder output), while x_q = x

        # 1) Masked self-attention
        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask=attention_mask))
        x = self.norm1(x)

        # 2) Encoder-decoder (cross) attention
        x = x + self.dropout(self.attention(x, k_x, v_x))
        x = self.norm2(x)

        # 3) Position-wise feed-forward
        x = x + self.dropout(self.ffn(x))
        x = self.norm3(x)

        return x


# use eval() to disable dropout etc. at inference time
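A minimal shape check for the Decoder block above (not part of the commit; the sizes, the causal mask, and the assumption that Project_Model/Libs is on the import path are illustrative):

import torch
from Transformer.decoder import Decoder

decoder = Decoder(d_model=512, d_ff=2048, attention_heads=8)
tgt = torch.randn(2, 10, 512)                                           # (batch, tgt_len, d_model), batch_first=True
memory = torch.randn(2, 12, 512)                                        # encoder output, (batch, src_len, d_model)
causal = torch.triu(torch.ones(10, 10, dtype=torch.bool), diagonal=1)   # True = do not attend (future positions)
out = decoder(tgt, memory, memory, attention_mask=causal)
print(out.shape)                                                        # torch.Size([2, 10, 512])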
32
Project_Model/Libs/Transformer/encoder.py
Normal file
@@ -0,0 +1,32 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Encoder(nn.Module):  # subclassing nn.Module exposes its primitives (parameters, train/eval, ...) for training

    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)  # norm of the first "Add & Normalize"
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)  # norm of the second "Add & Normalize"
        self.dropout = nn.Dropout(0.1)      # applied to each sublayer output

    def forward(self, x):
        # -> ATTENTION -> dropout -> add & normalize -> FF -> dropout -> add & normalize ->

        # attention with residual connection [input + self-attention]
        x = x + self.dropout(self.attention(x, x, x))
        x = self.norm1(x)

        # feed-forward with residual connection [normed self-attention + ff]
        x = x + self.dropout(self.ffn(x))
        x = self.norm2(x)
        return x


# use eval() to disable dropout etc. at inference time
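A quick smoke test for the Encoder block (not part of the commit; sizes are arbitrary):

import torch
from Transformer.encoder import Encoder

encoder = Encoder(d_model=512, d_ff=2048, attention_heads=8)
src = torch.randn(2, 12, 512)   # (batch, src_len, d_model), batch_first=True
out = encoder(src)
print(out.shape)                # torch.Size([2, 12, 512])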
18
Project_Model/Libs/Transformer/feed_forward_nn.py
Normal file
@@ -0,0 +1,18 @@
# it is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers

import torch
import torch.nn as nn


class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand to a higher dimension
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)       # active during training; eval() deactivates it
        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model dimension

    def forward(self, x):
        # -> NN1 -> ReLU -> (dropout during training) -> NN2 ->
        return self.fc2(self.dropout(self.activation(self.fc1(x))))
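"Position-wise" means the same two linear layers are applied independently at every sequence position; a small check (not part of the commit):

import torch
from Transformer.feed_forward_nn import FeedForwardNetwork

ffn = FeedForwardNetwork(d_model=512, d_ff=2048)
ffn.eval()                                     # disable dropout so outputs are deterministic
x = torch.randn(2, 10, 512)
y = ffn(x)
print(y.shape)                                 # torch.Size([2, 10, 512])
print(torch.allclose(y[0, 3], ffn(x[0, 3])))   # True: each position is transformed independently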
20
Project_Model/Libs/Transformer/multi_head_attention.py
Normal file
@@ -0,0 +1,20 @@
# multi-head attention -> (then to) ff
# attention: q, k, v -> score = q·k -> scale -> softmax
# multi-head -> Q, K, V different in each head (built as X * [WQ / WK / WV])
# z = softmax(Q * K^T / sqrt(d)) * V
# recombine Z: 1) concatenate  2) [z0 z1 z2 z3 z4] * W = Z
# we expect to handle padding tokens later
########################
# WIP
########################
import torch
import torch.nn as nn

embed_dim = 256
num_heads = 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)


class MultiheadAttention():

    def __init__(self, num_heads=8) -> None:
        pass
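The notes above describe scaled dot-product attention, z = softmax(Q·K^T / sqrt(d_k))·V. A minimal sketch of that step (illustrative only; the working code in this commit delegates to nn.MultiheadAttention instead):

import math
import torch

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, heads, seq_len, d_k)
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))  # (batch, heads, L_q, L_k)
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))      # True = position not allowed
    weights = torch.softmax(scores, dim=-1)
    return weights @ v                                        # (batch, heads, L_q, d_k)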
33
Project_Model/Libs/Transformer/pytorch_multi_head_attention.py
Normal file
@@ -0,0 +1,33 @@
import torch
import torch.nn as nn


class TorchMultiHeadAttention(nn.Module):

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        attention_mask=None,
        key_padding_mask=None
    ) -> torch.Tensor:

        # x * Wq -> Q
        # x * Wk -> K
        # x * Wv -> V

        y, _ = self.attention(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
        return y


# batch_first=False (historical default)
#   shape: (L, N, E)
#   L = sequence length (time steps / positions)
#   N = batch size
#   E = d_model (embedding) dimension
# batch_first=True
#   shape: (N, L, E) (more natural for many models)
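Usage sketch for the wrapper (not part of the commit): with batch_first=True the inputs are (N, L, E); attn_mask is (L_q, L_k) with True meaning "do not attend", and key_padding_mask is (N, L_k) with True marking padded key positions.

import torch
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention

mha = TorchMultiHeadAttention(embed_dim=256, num_heads=8, dropout=0.1)
q = torch.randn(2, 10, 256)                 # (N, L_q, E)
kv = torch.randn(2, 12, 256)                # (N, L_k, E)
pad = torch.zeros(2, 12, dtype=torch.bool)  # True would mark padded key positions
out = mha(q, kv, kv, key_padding_mask=pad)
print(out.shape)                            # torch.Size([2, 10, 256])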