import torch.nn as nn
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
TorchMultiHeadAttention as MultiHeadAttention,
)


class Encoder(nn.Module):  # subclassing nn.Module exposes the training primitives (parameters, train()/eval(), state_dict, ...)
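    """A single Transformer encoder block (post-layer-norm variant).

    Flow, as implemented in forward(): multi-head self-attention -> dropout ->
    residual add -> LayerNorm -> feed-forward network -> dropout ->
    residual add -> LayerNorm.
    """
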
def __init__(
self,
embedding_dimension: int,
feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int,
) -> None:
super().__init__()
self.__attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_1 = nn.LayerNorm(
embedding_dimension
) # norm of first "Add and Normalize"
self.__feed_forward = FeedForwardNetwork(
embedding_dimension, feed_forward_hidden_layer_dimension
)
self.__layer_norm_2 = nn.LayerNorm(
embedding_dimension
) # norm of second "Add and Normalize"
        self.__dropout = nn.Dropout(0.1)  # shared dropout, applied after the attention and feed-forward sub-layers

    def forward(self, x, padding_mask=None):
        # padding_mask is forwarded to the multi-head attention as key_padding_mask.
        # -> attention -> dropout -> add & normalize -> feed-forward -> dropout -> add & normalize ->
        # 1) Multi-head self-attention (residual connection: input + self-attention)
        attention_output = self.__attention(x, x, x, key_padding_mask=padding_mask)
        # 2) Dropout
        dropped_attention = self.__dropout(attention_output)
        # 3) Residual connection
        x = x + dropped_attention
        # 4) Layer normalization
        x = self.__layer_norm_1(x)
        # 5) Feed-forward network
        feed_forward_output = self.__feed_forward(x)
        # 6) Dropout
        dropped_feed_forward = self.__dropout(feed_forward_output)
        # 7) Residual connection
        x = x + dropped_feed_forward
        # 8) Layer normalization
        x = self.__layer_norm_2(x)
        return x

# Note: call .eval() on the module to disable dropout (and similar training-only behaviour) at inference time.
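
# --- Minimal usage sketch (illustrative only, not part of the module) ---
# The shapes below assume the attention wrapper is configured with batch_first=True,
# i.e. inputs of shape (batch, sequence_length, embedding_dimension) and a boolean
# padding mask of shape (batch, sequence_length); adjust if the wrapper uses the
# (sequence_length, batch, embedding) convention instead.
if __name__ == "__main__":
    import torch

    encoder = Encoder(
        embedding_dimension=512,
        feed_forward_hidden_layer_dimension=2048,
        number_of_attention_heads=8,
    )
    encoder.eval()  # disable dropout for a deterministic sanity check

    batch_size, sequence_length = 2, 10
    x = torch.randn(batch_size, sequence_length, 512)
    padding_mask = torch.zeros(batch_size, sequence_length, dtype=torch.bool)
    padding_mask[:, 7:] = True  # mark the last three positions as padding

    with torch.no_grad():
        output = encoder(x, padding_mask=padding_mask)
    print(output.shape)  # expected: torch.Size([2, 10, 512]) under the batch_first assumption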