From 76200d936d27bad54ff76601965d2493d99f63f0 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Sat, 4 Oct 2025 21:07:58 +0200
Subject: [PATCH] added first classes (Encoder, Decoder, Attention) for the model

---
 Project_Model/Libs/Transformer/decoder.py     | 44 +++++++++++++++++++
 Project_Model/Libs/Transformer/encoder.py     | 32 ++++++++++++++
 .../Libs/Transformer/feed_forward_nn.py       | 18 ++++++++
 .../Libs/Transformer/multi_head_attention.py  | 20 +++++++++
 .../pytorch_multi_head_attention.py           | 33 ++++++++++++++
 5 files changed, 147 insertions(+)
 create mode 100644 Project_Model/Libs/Transformer/decoder.py
 create mode 100644 Project_Model/Libs/Transformer/encoder.py
 create mode 100644 Project_Model/Libs/Transformer/feed_forward_nn.py
 create mode 100644 Project_Model/Libs/Transformer/multi_head_attention.py
 create mode 100644 Project_Model/Libs/Transformer/pytorch_multi_head_attention.py

diff --git a/Project_Model/Libs/Transformer/decoder.py b/Project_Model/Libs/Transformer/decoder.py
new file mode 100644
index 0000000..4612ea1
--- /dev/null
+++ b/Project_Model/Libs/Transformer/decoder.py
@@ -0,0 +1,44 @@
+
+import torch
+import torch.nn as nn
+from Transformer.feed_forward_nn import FeedForwardNetwork
+from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention
+
+
+class Decoder(nn.Module):
+
+    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
+        super().__init__()
+        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
+        self.norm1 = nn.LayerNorm(d_model)
+
+        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
+        self.norm2 = nn.LayerNorm(d_model)
+
+        self.dropout = nn.Dropout(0.1)
+
+        self.ffn = FeedForwardNetwork(d_model, d_ff)
+        self.norm3 = nn.LayerNorm(d_model)
+        pass
+
+    def forward(self, x, k_x, v_x, attention_mask):  # k_x = v_x, while x_q = x
+
+        # 1) Masked self-attention with residual connection
+        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask=attention_mask))
+        x = self.norm1(x)
+
+        # 2) Encoder-decoder (cross) attention with residual connection
+        x = x + self.dropout(self.attention(x, k_x, v_x))
+        x = self.norm2(x)
+
+        # 3) Position-wise feed-forward with residual connection
+        x = x + self.dropout(self.ffn(x))
+        x = self.norm3(x)
+
+        return x
+
+
+
+
+
+# use eval() to disable dropout etc.
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/encoder.py b/Project_Model/Libs/Transformer/encoder.py
new file mode 100644
index 0000000..81d9cb7
--- /dev/null
+++ b/Project_Model/Libs/Transformer/encoder.py
@@ -0,0 +1,32 @@
+
+import torch
+import torch.nn as nn
+from Transformer.feed_forward_nn import FeedForwardNetwork
+from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention
+
+
+class Encoder(nn.Module):  # subclassing nn.Module exposes its primitives (parameters, train/eval) for training purposes
+
+    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
+        super().__init__()
+        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
+        self.norm1 = nn.LayerNorm(d_model)  # norm of first "Add and Normalize"
+        self.ffn = FeedForwardNetwork(d_model, d_ff)
+        self.norm2 = nn.LayerNorm(d_model)  # norm of second "Add and Normalize"
+        self.dropout = nn.Dropout(0.1)  # ...
+        pass
+
+    def forward(self, x):
+        # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
+        # Attention with residual connection [ input + self-attention ]
+        x = x + self.dropout(self.attention(x, x, x))
+        x = self.norm1(x)
+
+        # Feed-forward with residual connection [ normed self-attention + ff ]
+        x = x + self.dropout(self.ffn(x))
+        x = self.norm2(x)
+        return x
+
+
+
+# use eval() to disable dropout etc.
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/feed_forward_nn.py b/Project_Model/Libs/Transformer/feed_forward_nn.py
new file mode 100644
index 0000000..70d3d8e
--- /dev/null
+++ b/Project_Model/Libs/Transformer/feed_forward_nn.py
@@ -0,0 +1,18 @@
+# it is position-wise!
+# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
+
+import torch
+import torch.nn as nn
+
+class FeedForwardNetwork(nn.Module):
+    def __init__(self, d_model, d_ff):
+        super(FeedForwardNetwork, self).__init__()
+        self.fc1 = nn.Linear(d_model, d_ff)  # expand to a higher dimension
+        self.activation = nn.ReLU()
+        self.dropout = nn.Dropout(0.1)  # during training we drop activations; eval() deactivates it
+        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model dimension
+
+
+    def forward(self, x):
+        # -> NN1 -> ReLU -> (Dropout during training) -> NN2 ->
+        return self.fc2(self.dropout(self.activation(self.fc1(x))))
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/multi_head_attention.py b/Project_Model/Libs/Transformer/multi_head_attention.py
new file mode 100644
index 0000000..cd48b66
--- /dev/null
+++ b/Project_Model/Libs/Transformer/multi_head_attention.py
@@ -0,0 +1,20 @@
+# multi-head attention -> (then to) ff
+# attention: qkv -> score = q*k' -> scale -> softmax
+# multi-head -> QKV different in each head (built by: X*[WQ/WK/WV])
+# z = softmax(Q*K'/sqrt(d))*V
+# recombine Z: 1) concatenate, 2) [z0..z4] * W = Z
+# we expect to handle padding tokens later
+########################
+# WIP
+########################
+import torch
+import torch.nn as nn
+
+embed_dim = 256
+num_heads = 8
+multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
+
+class MultiheadAttention:
+
+    def __init__(self, num_heads: int = 8) -> None:
+        pass
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/pytorch_multi_head_attention.py b/Project_Model/Libs/Transformer/pytorch_multi_head_attention.py
new file mode 100644
index 0000000..0bb6fc4
--- /dev/null
+++ b/Project_Model/Libs/Transformer/pytorch_multi_head_attention.py
@@ -0,0 +1,33 @@
+import torch
+import torch.nn as nn
+
+class TorchMultiHeadAttention(nn.Module):
+
+    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
+
+    def forward(
+        self,
+        x_q: torch.Tensor,
+        x_k: torch.Tensor,
+        x_v: torch.Tensor,
+        attention_mask=None,
+        key_padding_mask=None
+    ) -> torch.Tensor:
+
+        # x * Wq -> Q
+        # x * Wk -> K
+        # x * Wv -> V
+
+        y, _ = self.attention(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
+        return y
+
+
+# batch_first=False (historical default)
+# shape: (L, N, E)
+# L = sequence length (time/positions)
+# N = batch size
+# E = d_model (embedding) dimension
+# batch_first=True
+# shape: (N, L, E) (more natural for many models)
\ No newline at end of file
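
Reviewer note (not part of the patch): below is a minimal smoke-test sketch for the new blocks.
The layer sizes, tensor shapes, and the causal-mask construction are illustrative assumptions,
and the imports assume Project_Model/Libs is on the Python path so that the in-repo
"from Transformer..." imports used by the patch resolve.

import torch

from Transformer.decoder import Decoder
from Transformer.encoder import Encoder

d_model, d_ff, heads = 256, 1024, 8             # illustrative hyperparameters, not mandated by the patch
encoder = Encoder(d_model, d_ff, heads).eval()  # eval() disables dropout, as the code comments note
decoder = Decoder(d_model, d_ff, heads).eval()

src = torch.randn(2, 10, d_model)  # (batch, src_len, d_model); the attention wrapper sets batch_first=True
tgt = torch.randn(2, 7, d_model)   # (batch, tgt_len, d_model)

# Causal mask for the masked self-attention: True above the diagonal marks positions a query
# may not attend to, matching nn.MultiheadAttention's boolean attn_mask convention.
causal_mask = torch.triu(torch.ones(7, 7), diagonal=1).bool()

memory = encoder(src)                            # (2, 10, d_model)
out = decoder(tgt, memory, memory, causal_mask)  # (2, 7, d_model)
print(memory.shape, out.shape)

Because the wrapper uses batch_first=True, both blocks keep the (batch, seq, d_model) layout end to end.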