added first classes (Encoder, Decoder, Attention) for the model

This commit is contained in:
GassiGiuseppe 2025-10-04 21:07:58 +02:00
parent 9b656e7918
commit 76200d936d
5 changed files with 147 additions and 0 deletions

View File

@@ -0,0 +1,44 @@
import torch
import torch.nn as nn

from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Decoder(nn.Module):
    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(0.1)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, k_x, v_x, attention_mask):
        # k_x and v_x are the encoder output (keys = values); the query is x
        # 1) Masked self-attention with residual connection
        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask=attention_mask))
        x = self.norm1(x)
        # 2) Encoder-decoder (cross) attention with residual connection
        x = x + self.dropout(self.attention(x, k_x, v_x))
        x = self.norm2(x)
        # 3) Position-wise feed-forward with residual connection
        x = x + self.dropout(self.ffn(x))
        x = self.norm3(x)
        return x
# use model.eval() to disable dropout etc. at inference time
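A minimal usage sketch for this Decoder block (not part of the commit; the module path Transformer.decoder and all tensor sizes below are assumptions):

import torch
from Transformer.decoder import Decoder  # assumed module path

d_model, d_ff, heads = 512, 2048, 8                     # illustrative sizes
decoder = Decoder(d_model, d_ff, heads)
enc_out = torch.randn(2, 10, d_model)                   # encoder output: (batch, src_len, d_model)
tgt = torch.randn(2, 7, d_model)                        # target embeddings: (batch, tgt_len, d_model)
causal = torch.triu(torch.ones(7, 7, dtype=torch.bool), diagonal=1)  # True = position is masked
out = decoder(tgt, enc_out, enc_out, attention_mask=causal)          # -> (2, 7, d_model)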

View File

@@ -0,0 +1,32 @@
import torch
import torch.nn as nn

from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Encoder(nn.Module):  # subclassing nn.Module exposes its training primitives (parameters, train/eval, ...)
    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)  # norm of the first "Add & Normalize"
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)  # norm of the second "Add & Normalize"
        self.dropout = nn.Dropout(0.1)      # shared dropout for both residual branches

    def forward(self, x):
        # -> ATTENTION -> dropout -> add & normalize -> FF -> dropout -> add & normalize ->
        # Self-attention with residual connection [input + self-attention]
        x = x + self.dropout(self.attention(x, x, x))
        x = self.norm1(x)
        # Feed-forward with residual connection [normed self-attention + ff]
        x = x + self.dropout(self.ffn(x))
        x = self.norm2(x)
        return x
# use model.eval() to disable dropout etc. at inference time
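A matching sketch for the Encoder (again not part of the commit; the module path Transformer.encoder and the sizes are assumptions):

import torch
from Transformer.encoder import Encoder  # assumed module path

encoder = Encoder(d_model=512, d_ff=2048, attention_heads=8)
src = torch.randn(2, 10, 512)        # (batch, src_len, d_model), batch_first layout
memory = encoder(src)                # same shape as the input: (2, 10, 512)
encoder.eval()                       # disables dropout for inference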

View File

@@ -0,0 +1,18 @@
# It is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
import torch
import torch.nn as nn


class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to a higher dimension
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)        # active during training; model.eval() disables it
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to the model dimension

    def forward(self, x):
        # -> Linear 1 -> ReLU -> (dropout during training) -> Linear 2 ->
        return self.fc2(self.dropout(self.activation(self.fc1(x))))
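A quick shape check illustrating the position-wise behaviour (sizes are illustrative): nn.Linear acts only on the last dimension, so every position in the sequence is transformed independently and the output keeps the input shape.

import torch
from Transformer.feed_forward_nn import FeedForwardNetwork

ffn = FeedForwardNetwork(d_model=512, d_ff=2048)
x = torch.randn(2, 10, 512)          # (batch, seq_len, d_model)
y = ffn(x)                           # each position is transformed independently
assert y.shape == x.shape            # (2, 10, 512)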

View File

@@ -0,0 +1,20 @@
# multi-head attention -> (then to) feed-forward
# attention: q, k, v -> score = q*k^T -> divide by sqrt(d) -> softmax
# multi-head -> Q, K, V are different in each head (built by X*[WQ/WK/WV])
# z = softmax(Q*K'/sqrt(d)) * V
# recombine the heads into Z: 1) concatenate; 2) [z0 z1 z2 z3 z4] * W = Z
# padding tokens will be handled later
########################
# WIP
########################
import torch
import torch.nn as nn

embed_dim = 256
num_heads = 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)


class MultiheadAttention:
    def __init__(self, num_heads: int = 8) -> None:
        pass  # WIP: to be implemented; the model uses TorchMultiHeadAttention in the meantime
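A single-head sketch of the formula in the comments above, z = softmax(Q K^T / sqrt(d)) V (the helper name and shapes are illustrative, not part of the commit):

import math
import torch

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, seq_len, d_k); mask is True where attention is NOT allowed
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))   # (batch, L_q, L_k)
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))
    return torch.softmax(scores, dim=-1) @ v                   # (batch, L_q, d_k)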

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn


class TorchMultiHeadAttention(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        attention_mask=None,
        key_padding_mask=None,
    ) -> torch.Tensor:
        # nn.MultiheadAttention applies the projections internally:
        # x_q * Wq -> Q, x_k * Wk -> K, x_v * Wv -> V
        y, _ = self.attention(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
        return y

# batch_first=False (the historical default)
#   format: (L, N, E)
#   L = sequence length (time steps / positions)
#   N = batch size
#   E = d_model (embedding) dimension
# batch_first=True
#   format: (N, L, E) (more natural for many models)
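A usage sketch for the batch_first=True layout and the key_padding_mask argument described above (sizes are illustrative, not part of the commit):

import torch

mha = TorchMultiHeadAttention(embed_dim=256, num_heads=8, dropout=0.1)
x = torch.randn(4, 12, 256)                     # (N, L, E) = (batch, seq_len, d_model)
padding = torch.zeros(4, 12, dtype=torch.bool)  # key_padding_mask: True marks padding keys to ignore
padding[:, 10:] = True                          # e.g. the last two positions are padding
y = mha(x, x, x, key_padding_mask=padding)      # self-attention -> (4, 12, 256)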