import torch
import torch.nn as nn

from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
    TorchMultiHeadAttention as MultiHeadAttention,
)


class Encoder(nn.Module):  # subclassing nn.Module exposes the training primitives (parameters, train/eval, ...)
    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        super().__init__()
        self.__attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)  # norm of the first "Add and Normalize"
        self.__feed_forward = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)  # norm of the second "Add and Normalize"
        self.__dropout = nn.Dropout(0.1)

    def forward(self, args: tuple[torch.Tensor, torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
        # WARNING: the single tuple argument is required so the block can be chained with nn.Sequential
        x, padding_mask = args

        # -> attention -> dropout -> add & normalize -> feed-forward -> dropout -> add & normalize ->

        # 1) Multi-head self-attention (the input attends to itself; padded positions are masked out)
        attention = self.__attention(x, x, x, key_padding_mask=padding_mask)

        # 2) Dropout (currently disabled)
        # attention = self.__dropout(attention)

        # 3) Residual connection: input + self-attention
        x = x + attention
        del attention

        # 4) Layer normalization
        x = self.__layer_norm_1(x)

        # 5) Feed-forward network
        feed_forward = self.__feed_forward(x)

        # 6) Dropout (currently disabled)
        # feed_forward = self.__dropout(feed_forward)

        # 7) Residual connection: input + feed-forward
        x = x + feed_forward
        del feed_forward

        # 8) Layer normalization
        x = self.__layer_norm_2(x)

        return (x, padding_mask)  # call model.eval() to disable dropout etc.
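

# Usage sketch (an addition, not part of the original module): shows how the
# tuple-in/tuple-out forward() lets several Encoder blocks be chained with
# nn.Sequential. Shapes and hyper-parameters are illustrative assumptions:
# batch-first (batch, sequence, embedding) tensors and a boolean key_padding_mask
# where True marks padded positions, following the nn.MultiheadAttention convention.
if __name__ == "__main__":
    batch_size, sequence_length, embedding_dimension = 2, 16, 512

    # Stack two encoder blocks; 2048 hidden units and 8 heads are assumed values.
    encoder_stack = nn.Sequential(
        Encoder(embedding_dimension, 2048, 8),
        Encoder(embedding_dimension, 2048, 8),
    )

    tokens = torch.rand(batch_size, sequence_length, embedding_dimension)
    padding_mask = torch.zeros(batch_size, sequence_length, dtype=torch.bool)  # no padded positions

    encoder_stack.eval()  # disable dropout for a deterministic sanity check
    output, _ = encoder_stack((tokens, padding_mask))
    print(output.shape)  # expected: torch.Size([2, 16, 512])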