diff --git a/Project_Model/Libs/Embedder/__init__.py b/Project_Model/Libs/Embedder/__init__.py
index faa89d3..32232f7 100644
--- a/Project_Model/Libs/Embedder/__init__.py
+++ b/Project_Model/Libs/Embedder/__init__.py
@@ -1,10 +1,7 @@
 from .Utils import *
 from .Classes import *
-from .Utils import fixed_positional_encoding
-from .Classes import NanoSocratesEmbedder
+from . import Utils
+from . import Classes
+
 
-__all__ = [
-    "fixed_positional_encoding",
-    "NanoSocratesEmbedder"
-]
 
diff --git a/Project_Model/Libs/Transformer/Classes/Decoder.py b/Project_Model/Libs/Transformer/Classes/Decoder.py
new file mode 100644
index 0000000..d21a9ea
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/Decoder.py
@@ -0,0 +1,86 @@
+import torch
+import torch.nn as nn
+from .FeedForwardNetwork import FeedForwardNetwork
+from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
+
+
+class Decoder(nn.Module):
+
+    def __init__(
+        self,
+        embedding_dimension: int,
+        feed_forward_hidden_layer_dimension: int,
+        number_of_attention_heads: int,
+    ) -> None:
+        super().__init__()
+
+        self.__masked_attention = MultiHeadAttention(
+            embedding_dimension, number_of_attention_heads, dropout=0.1
+        )
+
+        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
+
+        self.__cross_attention = MultiHeadAttention(
+            embedding_dimension, number_of_attention_heads, dropout=0.1
+        )
+        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)
+
+        self.__dropout = nn.Dropout(0.1)
+
+        self.__feed_forward_network = FeedForwardNetwork(
+            embedding_dimension, feed_forward_hidden_layer_dimension
+        )
+        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)
+
+    def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor:  # k_x = v_x (encoder output); the query x_q is the decoder input x
+
+        # 1) Masked Attention
+        MASKED_ATTENTION = self.__masked_attention(
+            x, x, x, attention_mask=attention_mask
+        )
+
+        # 2) Dropout
+        DROPPED_MASKED_ATTENTION = self.__dropout(
+            MASKED_ATTENTION
+        )
+        del MASKED_ATTENTION
+
+        # 3) Residual Connection
+        x = x + DROPPED_MASKED_ATTENTION
+        del DROPPED_MASKED_ATTENTION
+
+        # 4) Layer Normalization
+        x = self.__layer_norm_1(x)
+
+        # 5) Encoder–decoder (cross) attention
+        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x)
+
+        # 6) Dropout
+        DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
+        del CROSS_ATTENTION
+
+        # 7) Residual Connection
+        x = x + DROPPED_CROSS_ATTENTION
+        del DROPPED_CROSS_ATTENTION
+
+        # 8) Layer Normalization
+        x = self.__layer_norm_2(x)
+
+        # 9) Position-wise feed-forward
+        FEED_FORWARD = self.__feed_forward_network(x)
+
+        # 10) Dropout
+        DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
+        del FEED_FORWARD
+
+        # 11) Residual Connection
+        x = x + DROPPED_FEED_FORWARD
+        del DROPPED_FEED_FORWARD
+
+        # 12) Layer Normalization
+        x = self.__layer_norm_3(x)
+
+        return x
+
+
+# use eval() to disable dropout etc.
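# Usage sketch (not part of this diff): how the new Decoder block could be exercised.
# Assumes the package is importable from the repository root, batch-first tensors of
# shape (batch, sequence, embedding_dimension), and the nn.MultiheadAttention mask
# convention (True = position may NOT be attended).
import torch

from Project_Model.Libs.Transformer.Classes.Decoder import Decoder

EMBEDDING_DIMENSION = 256
decoder = Decoder(
    embedding_dimension=EMBEDDING_DIMENSION,
    feed_forward_hidden_layer_dimension=1024,
    number_of_attention_heads=8,
)

target = torch.rand(2, 10, EMBEDDING_DIMENSION)          # decoder input (batch, T, E)
encoder_output = torch.rand(2, 12, EMBEDDING_DIMENSION)  # encoder memory (batch, S, E)

# Causal mask: True above the diagonal, so each position cannot attend to the future.
causal_mask = torch.triu(torch.ones(10, 10, dtype=torch.bool), diagonal=1)

output = decoder(target, encoder_output, encoder_output, causal_mask)
print(output.shape)  # torch.Size([2, 10, 256])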
diff --git a/Project_Model/Libs/Transformer/Classes/Encoder.py b/Project_Model/Libs/Transformer/Classes/Encoder.py
new file mode 100644
index 0000000..8adfc76
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/Encoder.py
@@ -0,0 +1,68 @@
+import torch.nn as nn
+from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
+from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
+    TorchMultiHeadAttention as MultiHeadAttention,
+)
+
+
+class Encoder(
+    nn.Module
+):  # in this way we expose the primitives of nn.Module for training purposes
+
+    def __init__(
+        self,
+        embedding_dimension: int,
+        feed_forward_hidden_layer_dimension: int,
+        number_of_attention_heads: int,
+    ) -> None:
+        super().__init__()
+        self.__attention = MultiHeadAttention(
+            embedding_dimension, number_of_attention_heads, dropout=0.1
+        )
+        self.__layer_norm_1 = nn.LayerNorm(
+            embedding_dimension
+        )  # norm of first "Add and Normalize"
+        self.__feed_forward = FeedForwardNetwork(
+            embedding_dimension, feed_forward_hidden_layer_dimension
+        )
+        self.__layer_norm_2 = nn.LayerNorm(
+            embedding_dimension
+        )  # norm of second "Add and Normalize"
+        self.__dropout = nn.Dropout(0.1)  # dropout applied after each sub-layer
+        pass
+
+    def forward(self, x):
+        # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
+        # Attention with Residual Connection [ input + self-attention ]
+
+        # 1) Multi Head Attention
+        ATTENTION = self.__attention(x, x, x)
+
+        # 2) Dropout
+        DROPPED_ATTENTION = self.__dropout(ATTENTION)
+        del ATTENTION
+
+        # 3) Residual Connection
+        x = x + DROPPED_ATTENTION
+
+        # 4) Layer Normalization
+        x = self.__layer_norm_1(x)
+
+        # 5) Feed Forward
+        FEED_FORWARD = self.__feed_forward(x)
+
+        # 6) Dropout
+        DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
+        del FEED_FORWARD
+
+        # 7) Residual Connection
+        x = x + DROPPED_FEED_FORWARD
+        del DROPPED_FEED_FORWARD
+
+        # 8) Layer Normalization
+        x = self.__layer_norm_2(x)
+
+        return x
+
+
+# use eval() to disable dropout etc.
diff --git a/Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py b/Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py
new file mode 100644
index 0000000..4cfc7e6
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py
@@ -0,0 +1,43 @@
+# it is position-wise!
+# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
+
+# Why do we need a fixed size?
+# https://ai.stackexchange.com/questions/37624/why-do-transformers-have-a-fixed-input-length
+
+import torch.nn as nn
+
+
+class FeedForwardNetwork(nn.Module):
+
+    def __init__(self, embedding_size: int, feed_forward_hidden_layer_dimension: int):
+
+        super().__init__()
+        self.__fully_connected_1 = nn.Linear(
+            embedding_size, feed_forward_hidden_layer_dimension
+        )  # expand into a higher dimension
+
+        self.__relu = nn.ReLU()
+        self.__dropout = nn.Dropout(
+            0.1
+        )  # during training we drop some activations; eval() deactivates this
+
+        self.__fully_connected_2 = nn.Linear(
+            feed_forward_hidden_layer_dimension, embedding_size
+        )  # project back to the model dimension
+
+    def forward(self, x):
+        # -> NN1 -> ReLU -> (Dropout during training) -> NN2 ->
+
+        # 1) Linear Layer
+        x = self.__fully_connected_1(x)
+
+        # 2) ReLU
+        x = self.__relu(x)
+
+        # 3) Dropout
+        x = self.__dropout(x)
+
+        # 4) Linear Layer
+        x = self.__fully_connected_2(x)
+
+        return x
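# Usage sketch (not part of this diff): illustrates the "position-wise" comment in
# FeedForwardNetwork above. The same two Linear layers act on every position
# independently, so only the last dimension is transformed. Shapes are assumptions.
import torch

from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork

feed_forward = FeedForwardNetwork(
    embedding_size=256, feed_forward_hidden_layer_dimension=1024
)
feed_forward.eval()  # disable dropout so the check below is deterministic

tokens = torch.rand(2, 10, 256)    # (batch, sequence_length, embedding_size)
print(feed_forward(tokens).shape)  # torch.Size([2, 10, 256]) -- shape preserved

# Running a single position through the block matches slicing the full output,
# because no information flows across positions.
single_position = feed_forward(tokens[:, 3, :])
print(torch.allclose(single_position, feed_forward(tokens)[:, 3, :]))  # True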
diff --git a/Project_Model/Libs/Transformer/multi_head_attention.py b/Project_Model/Libs/Transformer/Classes/MultiHeadAttention.py
similarity index 80%
rename from Project_Model/Libs/Transformer/multi_head_attention.py
rename to Project_Model/Libs/Transformer/Classes/MultiHeadAttention.py
index cd48b66..63c9a6f 100644
--- a/Project_Model/Libs/Transformer/multi_head_attention.py
+++ b/Project_Model/Libs/Transformer/Classes/MultiHeadAttention.py
@@ -7,14 +7,18 @@
 ########################
 # WIP
 ########################
-import torch
+
 import torch.nn as nn
 
 embed_dim = 256
 num_heads = 8
 multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
 
 
-class MultiheadAttention():
-    def __init__(self, num_heads = 8, ) -> None:
-        pass
\ No newline at end of file
+class MultiheadAttention:
+
+    def __init__(
+        self,
+        num_heads=8,
+    ) -> None:
+        pass
diff --git a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
new file mode 100644
index 0000000..6081f75
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
@@ -0,0 +1,46 @@
+import torch
+import torch.nn as nn
+
+
+class TorchMultiHeadAttention(nn.Module):
+
+    def __init__(
+        self,
+        embedding_dimension: int,
+        number_of_attention_heads: int,
+        dropout: float = 0.0,
+    ):
+        super().__init__()
+        self.attention = nn.MultiheadAttention(
+            embedding_dimension,
+            number_of_attention_heads,
+            dropout=dropout,
+            batch_first=True,
+        )
+
+    def forward(
+        self,
+        x_q: torch.Tensor,
+        x_k: torch.Tensor,
+        x_v: torch.Tensor,
+        attention_mask=None,
+        key_padding_mask=None,
+    ) -> torch.Tensor:
+
+        # x * Wq -> Q
+        # x * Wk -> K
+        # x * Wv -> V
+
+        y, _ = self.attention.forward(
+            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask
+        )
+        return y
+
+
+# batch_first=False (historical default)
+# Format: (L, N, E)
+# L = sequence length (time/positions)
+# N = batch size
+# E = d_model dimension (embedding)
+# batch_first=True
+# Format: (N, L, E) (more natural for many models)
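# Usage sketch (not part of this diff): the shape and mask conventions described in
# the comments above, assuming batch_first=True inputs of shape (N, L, E).
# key_padding_mask is (N, S) with True marking padded key positions to ignore;
# attention_mask (attn_mask) is (L, S) with True marking disallowed pairs.
import torch

from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
    TorchMultiHeadAttention,
)

attention = TorchMultiHeadAttention(embedding_dimension=256, number_of_attention_heads=8)

queries = torch.rand(2, 10, 256)  # (N, L, E)
keys = torch.rand(2, 12, 256)     # (N, S, E)

# Mark the last two key positions of every batch element as padding.
key_padding_mask = torch.zeros(2, 12, dtype=torch.bool)
key_padding_mask[:, -2:] = True

output = attention(queries, keys, keys, key_padding_mask=key_padding_mask)
print(output.shape)  # torch.Size([2, 10, 256])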
diff --git a/Project_Model/Libs/Transformer/Classes/__init__.py b/Project_Model/Libs/Transformer/Classes/__init__.py
new file mode 100644
index 0000000..e6c69ae
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/__init__.py
@@ -0,0 +1,13 @@
+from .Decoder import Decoder
+from .Encoder import Encoder
+from .FeedForwardNetwork import FeedForwardNetwork
+from .MultiHeadAttention import MultiheadAttention
+from .TorchMultiHeadAttention import TorchMultiHeadAttention
+
+__all__ = [
+    "Decoder",
+    "Encoder",
+    "FeedForwardNetwork",
+    "MultiheadAttention",
+    "TorchMultiHeadAttention"
+]
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/__init__.py b/Project_Model/Libs/Transformer/__init__.py
index e69de29..e384727 100644
--- a/Project_Model/Libs/Transformer/__init__.py
+++ b/Project_Model/Libs/Transformer/__init__.py
@@ -0,0 +1,3 @@
+from .Classes import *
+
+from . import Classes
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/decoder.py b/Project_Model/Libs/Transformer/decoder.py
deleted file mode 100644
index 4612ea1..0000000
--- a/Project_Model/Libs/Transformer/decoder.py
+++ /dev/null
@@ -1,44 +0,0 @@
-
-import torch
-import torch.nn as nn
-from Transformer.feed_forward_nn import FeedForwardNetwork
-from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention
-
-
-class Decoder(nn.Module):
-
-    def __init__(self, d_model:int, d_ff: int, attention_heads:int) -> None:
-        super().__init__()
-        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
-        self.norm1 = nn.LayerNorm(d_model)
-
-        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
-        self.norm2 = nn.LayerNorm(d_model)
-
-        self.dropout = nn.Dropout(0.1)
-
-        self.ffn = FeedForwardNetwork(d_model, d_ff)
-        self.norm3 = nn.LayerNorm(d_model)
-        pass
-
-    def forward(self, x, k_x,v_x, attention_mask): # k_x = v_x . While x_q = x
-
-        # 1) Masked self-attention
-        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask= attention_mask))
-        x = self.norm1(x)
-
-        # 2) Encoder–decoder (cross) attention
-        x = x + self.dropout(self.attention(x, k_x, v_x))
-        x = self.norm2(x)
-
-        # 3) Position-wise feed-forward
-        x = x + self.dropout(self.ffn(x))
-        x = self.norm3(x)
-
-        return x
-
-
-
-
-
-# use eval to disable dropout ecc
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/encoder.py b/Project_Model/Libs/Transformer/encoder.py
deleted file mode 100644
index 81d9cb7..0000000
--- a/Project_Model/Libs/Transformer/encoder.py
+++ /dev/null
@@ -1,32 +0,0 @@
-
-import torch
-import torch.nn as nn
-from Transformer.feed_forward_nn import FeedForwardNetwork
-from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention
-
-
-class Encoder(nn.Module): # in this way we expose the primitive of nn.Module for training purpose
-
-    def __init__(self, d_model:int, d_ff: int, attention_heads:int) -> None:
-        super().__init__()
-        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
-        self.norm1 = nn.LayerNorm(d_model) # norm of first "Add and Normalize"
-        self.ffn = FeedForwardNetwork(d_model, d_ff)
-        self.norm2 = nn.LayerNorm(d_model) # norm of second "Add and Normalize"
-        self.dropout = nn.Dropout(0.1) # ...
-        pass
-
-    def forward(self, x):
-        # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
-        # Attention with Residual Connection [ input + self-attention]
-        x = x + self.dropout(self.attention(x, x, x))
-        x = self.norm1(x)
-
-        # Feedforward with Residual Connection [ normed self-attention + ff]
-        x = x + self.dropout(self.ffn(x))
-        x = self.norm2(x)
-        return x
-
-
-
-# use eval to disable dropout ecc
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/feed_forward_nn.py b/Project_Model/Libs/Transformer/feed_forward_nn.py
deleted file mode 100644
index 70d3d8e..0000000
--- a/Project_Model/Libs/Transformer/feed_forward_nn.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# it is position wise!
-# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
-
-import torch
-import torch.nn as nn
-
-class FeedForwardNetwork(nn.Module):
-    def __init__(self, d_model, d_ff):
-        super(FeedForwardNetwork, self).__init__()
-        self.fc1 = nn.Linear(d_model, d_ff) # expand in higher dimension
-        self.activation = nn.ReLU()
-        self.dropout = nn.Dropout(0.1) # during training we drop something, with eval it got deactivated
-        self.fc2 = nn.Linear(d_ff, d_model) # return into the model dimension
-
-
-    def forward(self, x):
-        # -> NN1 -> RELU -> (Droput during training) -> NN2 ->
-        return self.fc2(self.dropout(self.activation(self.fc1(x))))
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/pytorch_multi_head_attention.py b/Project_Model/Libs/Transformer/pytorch_multi_head_attention.py
deleted file mode 100644
index 0bb6fc4..0000000
--- a/Project_Model/Libs/Transformer/pytorch_multi_head_attention.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import torch
-import torch.nn as nn
-
-class TorchMultiHeadAttention(nn.Module):
-
-    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
-        super().__init__()
-        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
-
-    def forward(
-        self,
-        x_q: torch.Tensor,
-        x_k: torch.Tensor,
-        x_v: torch.Tensor,
-        attention_mask = None,
-        key_padding_mask = None
-    ) -> torch.Tensor:
-
-        # x * Wq -> Q
-        # x * Wk -> K
-        # x * Wv -> V
-
-        y, _ = self.attention.forward(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
-        return y
-
-
-# batch_first=False (default storico)
-# Formato: (L, N, E)
-# L = lunghezza della sequenza (time/posizioni)
-# N = batch size
-# E = dimensione d_model (embed)
-# batch_first=True
-# Formato: (N, L, E) (più naturale per molti modelli)
\ No newline at end of file
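# End-to-end sketch (not part of this diff): one way the new blocks might be chained
# into an encoder-decoder stack. The layer count, dimensions and the final vocabulary
# projection are illustrative assumptions; the PR itself does not add such a wrapper.
import torch
import torch.nn as nn

from Project_Model.Libs.Transformer import Decoder, Encoder

EMBEDDING_DIMENSION = 256
VOCABULARY_SIZE = 1000  # hypothetical, only for the projection below

encoders = nn.ModuleList([Encoder(EMBEDDING_DIMENSION, 1024, 8) for _ in range(2)])
decoders = nn.ModuleList([Decoder(EMBEDDING_DIMENSION, 1024, 8) for _ in range(2)])
to_vocabulary = nn.Linear(EMBEDDING_DIMENSION, VOCABULARY_SIZE)

source = torch.rand(2, 12, EMBEDDING_DIMENSION)  # already-embedded source sequence
target = torch.rand(2, 10, EMBEDDING_DIMENSION)  # already-embedded target sequence
causal_mask = torch.triu(torch.ones(10, 10, dtype=torch.bool), diagonal=1)

memory = source
for encoder in encoders:
    memory = encoder(memory)

x = target
for decoder in decoders:
    x = decoder(x, memory, memory, causal_mask)

logits = to_vocabulary(x)
print(logits.shape)  # torch.Size([2, 10, 1000])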