# NanoSocrates/Project_Model/Libs/Transformer/Classes/Decoder.py

from typing import Optional
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
from ..Utils.attention_mask import get_causal_attention_mask, get_prefix_causal_mask_from_padding_mask

# B, L(T), E_D


class Decoder(nn.Module):
    """One Transformer decoder layer with post-sublayer layer normalization.

    The layer applies, in order:

    1. masked self-attention over ``x``,
    2. (skipped in decoder-only mode) cross-attention of ``x`` against
       ``k_x`` / ``v_x``,
    3. a position-wise feed-forward network.

    Every sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.

    ``forward`` takes and returns a single tuple so that several ``Decoder``
    layers can be chained (e.g. inside ``nn.Sequential``).

    Dropout is only active in training mode; call ``.eval()`` on the module
    to disable it.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
        dropout_probability: float = 0.1,
    ) -> None:
        """Build the sub-layers of the decoder block.

        Args:
            embedding_dimension: Size of the embedding dimension; used for the
                attention modules, the feed-forward network and the layer
                norms.
            feed_forward_hidden_layer_dimension: Hidden size passed to
                ``FeedForwardNetwork``.
            number_of_attention_heads: Number of heads for both attention
                modules; also forwarded to the prefix-causal mask builder.
            dropout_probability: Dropout probability used inside both
                attention modules and on every sub-layer output. Defaults to
                ``0.1``, the value that used to be hard-coded.
        """
        # Initialise nn.Module first so attribute registration is always safe.
        super().__init__()

        self.__attention_heads = number_of_attention_heads

        self.__masked_attention = MultiHeadAttention(
            embedding_dimension,
            number_of_attention_heads,
            dropout=dropout_probability,
        )
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        self.__cross_attention = MultiHeadAttention(
            embedding_dimension,
            number_of_attention_heads,
            dropout=dropout_probability,
        )
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        # A single dropout module is shared by the three residual branches.
        self.__dropout = nn.Dropout(dropout_probability)

        self.__feed_forward_network = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)

    def forward(
        self,
        args: tuple[
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            Optional[bool]
        ]
    ) -> tuple[
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        torch.Tensor,
        Optional[bool],
    ]:
        """Run the decoder layer on a packed argument tuple.

        Args:
            args: ``(x, k_x, v_x, src_padding_mask, tgt_padding_mask,
                decoder_only)``. The last element may be omitted, in which
                case it defaults to ``False``.

                * ``x``: decoder input; query, key and value of the masked
                  self-attention. Its dimension 1 is used as the sequence
                  length (presumably shaped ``(B, L, E_D)`` -- TODO confirm).
                * ``k_x`` / ``v_x``: key and value for the cross-attention
                  (the original note says ``k_x = v_x``).
                * ``src_padding_mask``: key padding mask for the
                  cross-attention; also used to build the prefix-causal mask
                  in decoder-only mode.
                * ``tgt_padding_mask``: key padding mask for the masked
                  self-attention.
                * ``decoder_only``: when truthy, a prefix-causal mask is used
                  and the cross-attention sub-layer is skipped.

        Returns:
            A tuple with the same layout as ``args`` where ``x`` has been
            replaced by the layer output; all other elements are passed
            through unchanged so the result can feed the next layer.
        """
        # WARNING: a single packed argument is required so layers can be
        # chained sequentially.
        if len(args) < 6:
            # `(False)` is just the bool False; a real 1-tuple is needed.
            # Unpacking also accepts a list, not only a tuple.
            args = (*args, False)
        x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only = args

        # Build the attention mask for the self-attention.
        sequence_length = x.size(1)
        if decoder_only:
            # NOTE(review): the original author remarked that
            # tgt_padding_mask would be the "correct" argument here;
            # behaviour intentionally left unchanged -- confirm with callers.
            attention_mask = get_prefix_causal_mask_from_padding_mask(
                sequence_length, src_padding_mask, self.__attention_heads
            )
        else:
            attention_mask = get_causal_attention_mask(sequence_length)

        # 1-4) Masked self-attention -> dropout -> residual -> layer norm
        self_attention = self.__masked_attention(
            x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=attention_mask
        )
        x = self.__layer_norm_1(x + self.__dropout(self_attention))
        del self_attention  # drop the local reference as early as possible

        if not decoder_only:
            # 5-8) Cross-attention -> dropout -> residual -> layer norm
            cross_attention = self.__cross_attention(
                x, k_x, v_x, key_padding_mask=src_padding_mask
            )
            x = self.__layer_norm_2(x + self.__dropout(cross_attention))
            del cross_attention

        # 9-12) Feed-forward -> dropout -> residual -> layer norm
        feed_forward = self.__feed_forward_network(x)
        x = self.__layer_norm_3(x + self.__dropout(feed_forward))
        del feed_forward

        return (x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only)


# Call .eval() on the model to disable dropout (and other training-only behaviour).
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								from typing import Optional
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
+								import torch
 								import torch.nn as nn
 								from .FeedForwardNetwork import FeedForwardNetwork
 								from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
-												WIP decoder with prefix mask

											
										
										
											2025-10-11 15:31:43 +02:00
+								from ..Utils.attention_mask import get_causal_attention_mask, get_prefix_causal_mask_from_padding_mask
-												Added attention_mask

											
										
										
											2025-10-05 17:49:01 +02:00
-												update to batch attention mask

											
										
										
											2025-10-06 13:03:03 +02:00
+								# B, L(T), E_D
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								class Decoder(nn.Module):
 								    def __init__(
 								        self,
 								        embedding_dimension: int,
 								        feed_forward_hidden_layer_dimension: int,
 								        number_of_attention_heads: int,
 								    ) -> None:
-												WIP decoder with prefix mask

											
										
										
											2025-10-11 15:31:43 +02:00
+								        self.__attention_heads = number_of_attention_heads
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
+								        super().__init__()
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
+								        self.__masked_attention = MultiHeadAttention(
-												update to batch attention mask

											
										
										
											2025-10-06 13:03:03 +02:00
+								            embedding_dimension, number_of_attention_heads, dropout=0.1
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
+								        )
 								        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
 								        self.__cross_attention = MultiHeadAttention(
 								            embedding_dimension, number_of_attention_heads, dropout=0.1
 								        )
 								        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)
 								        self.__dropout = nn.Dropout(0.1)
 								        self.__feed_forward_network = FeedForwardNetwork(
 								            embedding_dimension, feed_forward_hidden_layer_dimension
 								        )
 								        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)
-												Made model Batch ready

											
										
										
											2025-10-07 16:37:20 +02:00
+								    def forward(
 								        self,
 								        args: tuple[
 								            torch.Tensor,
 								            torch.Tensor,
 								            torch.Tensor,
-												Pipeline fix and added a util to decode

											
										
										
											2025-10-09 13:24:48 +02:00
+								            torch.Tensor,
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								            torch.Tensor,
 								            Optional[bool]
-												Made model Batch ready

											
										
										
											2025-10-07 16:37:20 +02:00
+								        ]
 								    ):  # -> list[torch.Tensor]:  # k_x = v_x . While x_q = x
 								        # WARNING: args is needed to have sequential
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								        if len(args) < 6:
 								            args = args + (False)
 								        x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only = args
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
-												update to batch attention mask

											
										
										
											2025-10-06 13:03:03 +02:00
+								        # build of attention mask
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								        # TODO: create a prefix causal mask if needed
-												WIP decoder with prefix mask

											
										
										
											2025-10-11 15:31:43 +02:00
+								        if decoder_only:
-												Update of the batcher to resolve a bug in the 4th construction

											
										
										
											2025-10-12 16:35:42 +02:00
+								            attention_mask = get_prefix_causal_mask_from_padding_mask(x.size(1),src_padding_mask,self.__attention_heads) # the correct is tgt however ...
-												WIP decoder with prefix mask

											
										
										
											2025-10-11 15:31:43 +02:00
+								        else:
 								            attention_mask = get_causal_attention_mask(x.size(1))
-												update to batch attention mask

											
										
										
											2025-10-06 13:03:03 +02:00
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
+								        # 1) Masked Attention
 								        MASKED_ATTENTION = self.__masked_attention(
-												Pipeline fix and added a util to decode

											
										
										
											2025-10-09 13:24:48 +02:00
+								            x, x, x, key_padding_mask=tgt_padding_mask, attention_mask=attention_mask
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
+								        )
 								        # 2) Dropout
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
+								        DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
 								        del MASKED_ATTENTION
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								        # 3) Residual Connection
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
+								        x = x + DROPPED_MASKED_ATTENTION
 								        del DROPPED_MASKED_ATTENTION
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								        # 4) Layer Normalization
 								        x = self.__layer_norm_1(x)
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								        if not decoder_only:
 								            # 5) Encoder–decoder (cross) attention
 								            CROSS_ATTENTION = self.__cross_attention(
 								                x, k_x, v_x, key_padding_mask=src_padding_mask
 								            )
 								            # 6) Dropout
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
+								            DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
 								            del CROSS_ATTENTION
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								            # 7) Residual Connection
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
+								            x = x + DROPPED_CROSS_ATTENTION
 								            del DROPPED_CROSS_ATTENTION
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								            # 8) Layer Normalization
 								            x = self.__layer_norm_2(x)
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								        # 9) Position-wise feed-forward
 								        FEED_FORWARD = self.__feed_forward_network(x)
 								        # 10) Dropout
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
+								        DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
 								        del FEED_FORWARD
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								        # 11) Residual Connection
-												Activated Dropout to avoid overfitting

											
										
										
											2025-10-12 12:28:06 +02:00
+								        x = x + DROPPED_FEED_FORWARD
 								        del DROPPED_FEED_FORWARD
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								        # 12) Layer Normalization
 								        x = self.__layer_norm_3(x)
-												Added a way to detach models and create them standalone

											
										
										
											2025-10-10 18:43:20 +02:00
+								        return (x, k_x, v_x, src_padding_mask, tgt_padding_mask, decoder_only)
-												Refactoring

											
										
										
											2025-10-05 15:40:29 +02:00
 								# use eval to disable dropout ecc