From 6f219f634f9268cab9a2a41598eb7ee0a00d4636 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Sun, 5 Oct 2025 17:49:01 +0200
Subject: [PATCH 1/7] Added attention_mask

---
 Project_Model/Libs/Transformer/Classes/Decoder.py        | 6 +++++-
 .../Libs/Transformer/Classes/TorchMultiHeadAttention.py  | 8 +++++---
 Project_Model/Libs/Transformer/Utils/__init__.py         | 3 +++
 Project_Model/Libs/Transformer/Utils/attention_mask.py   | 4 ++++
 Project_Model/Libs/Transformer/__init__.py               | 4 +++-
 5 files changed, 20 insertions(+), 5 deletions(-)
 create mode 100644 Project_Model/Libs/Transformer/Utils/__init__.py
 create mode 100644 Project_Model/Libs/Transformer/Utils/attention_mask.py

diff --git a/Project_Model/Libs/Transformer/Classes/Decoder.py b/Project_Model/Libs/Transformer/Classes/Decoder.py
index d21a9ea..73fe5a0 100644
--- a/Project_Model/Libs/Transformer/Classes/Decoder.py
+++ b/Project_Model/Libs/Transformer/Classes/Decoder.py
@@ -2,6 +2,8 @@ import torch
 import torch.nn as nn
 from .FeedForwardNetwork import FeedForwardNetwork
 from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
+from ..Utils.attention_mask import get_attention_mask
+
 
 
 class Decoder(nn.Module):
@@ -15,7 +17,7 @@ class Decoder(nn.Module):
         super().__init__()
 
         self.__masked_attention = MultiHeadAttention(
-            embedding_dimension, number_of_attention_heads, dropout=0.1
+            embedding_dimension, number_of_attention_heads, dropout=0.1, attention_mask=get_attention_mask(embedding_dimension)
         )
 
         self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
@@ -32,6 +34,8 @@ class Decoder(nn.Module):
         )
         self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)
+
+
 
     def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor:  # k_x = v_x . While x_q = x
 
         # 1) Masked Attention
diff --git a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
index 6081f75..d310ac8 100644
--- a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
+++ b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-
+from typing import Optional
 
 
 class TorchMultiHeadAttention(nn.Module):
@@ -9,6 +9,7 @@ class TorchMultiHeadAttention(nn.Module):
         embedding_dimension: int,
         number_of_attention_heads: int,
         dropout: float = 0.0,
+        attention_mask: Optional[torch.Tensor] = None
     ):
         super().__init__()
         self.attention = nn.MultiheadAttention(
@@ -18,12 +19,13 @@ class TorchMultiHeadAttention(nn.Module):
             batch_first=True,
         )
 
+        self.__attention_mask = attention_mask
+
     def forward(
         self,
         x_q: torch.Tensor,
         x_k: torch.Tensor,
         x_v: torch.Tensor,
-        attention_mask=None,
         key_padding_mask=None,
     ) -> torch.Tensor:
@@ -32,7 +34,7 @@ class TorchMultiHeadAttention(nn.Module):
         # x * Wv -> V
 
         y, _ = self.attention.forward(
-            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask
+            x_q, x_k, x_v, attn_mask=self.__attention_mask, key_padding_mask=key_padding_mask
         )
 
         return y
diff --git a/Project_Model/Libs/Transformer/Utils/__init__.py b/Project_Model/Libs/Transformer/Utils/__init__.py
new file mode 100644
index 0000000..856b51f
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Utils/__init__.py
@@ -0,0 +1,3 @@
+from .attention_mask import get_attention_mask
+
+__all__ = ["get_attention_mask"]
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/Utils/attention_mask.py b/Project_Model/Libs/Transformer/Utils/attention_mask.py
new file mode 100644
index 0000000..a6c595e
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Utils/attention_mask.py
@@ -0,0 +1,4 @@
+import torch
+
+def get_attention_mask(embedding_dimension: int) -> torch.Tensor:
+    return torch.triu(torch.ones(embedding_dimension, embedding_dimension, dtype=torch.bool), diagonal=1)
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/__init__.py b/Project_Model/Libs/Transformer/__init__.py
index e384727..d906699 100644
--- a/Project_Model/Libs/Transformer/__init__.py
+++ b/Project_Model/Libs/Transformer/__init__.py
@@ -1,3 +1,5 @@
 from .Classes import *
+from .Utils import *
 
-from . import Classes
\ No newline at end of file
+from . import Classes
+from . import Utils
\ No newline at end of file

From 0f243eaac24fa54d6f2881ad608d07910d355afb Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Sun, 5 Oct 2025 18:46:06 +0200
Subject: [PATCH 2/7] added padding_mask entry to decoder and encoder

---
 Project_Model/Libs/Transformer/Classes/Decoder.py | 6 +++---
 Project_Model/Libs/Transformer/Classes/Encoder.py | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Project_Model/Libs/Transformer/Classes/Decoder.py b/Project_Model/Libs/Transformer/Classes/Decoder.py
index 73fe5a0..a1f5074 100644
--- a/Project_Model/Libs/Transformer/Classes/Decoder.py
+++ b/Project_Model/Libs/Transformer/Classes/Decoder.py
@@ -36,11 +36,11 @@ class Decoder(nn.Module):
 
 
 
-    def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor:  # k_x = v_x . While x_q = x
+    def forward(self, x, k_x, v_x, padding_mask = None) -> torch.Tensor:  # k_x = v_x . While x_q = x
 
         # 1) Masked Attention
         MASKED_ATTENTION = self.__masked_attention(
-            x, x, x, attention_mask=attention_mask
+            x, x, x, key_padding_mask=padding_mask
         )
 
         # 2) Dropout
@@ -57,7 +57,7 @@ class Decoder(nn.Module):
         x = self.__layer_norm_1(x)
 
         # 5) Encoder–decoder (cross) attention
-        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x)
+        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x key_padding_mask=padding_mask)
 
         # 6) Dropout
         DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
diff --git a/Project_Model/Libs/Transformer/Classes/Encoder.py b/Project_Model/Libs/Transformer/Classes/Encoder.py
index 8adfc76..cdec92a 100644
--- a/Project_Model/Libs/Transformer/Classes/Encoder.py
+++ b/Project_Model/Libs/Transformer/Classes/Encoder.py
@@ -31,12 +31,12 @@ class Encoder(
         self.__dropout = nn.Dropout(0.1)
         # ...
         pass
 
-    def forward(self, x):
+    def forward(self, x, padding_mask = None):
         # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
 
         # Attention with Residual Connection [ input + self-attention]
         # 1) Multi Head Attention
-        ATTENTION = self.__attention(x, x, x)
+        ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)
 
         # 2) Dropout
         DROPPED_ATTENTION = self.__dropout(ATTENTION)

From d48815cca29951705cbd43cc7599b14c6b3c103d Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Sun, 5 Oct 2025 18:58:42 +0200
Subject: [PATCH 3/7] added task_type and updated init

---
 Project_Model/Libs/Transformer/Utils/__init__.py  | 3 ++-
 Project_Model/Libs/Transformer/Utils/task_type.py | 6 ++++++
 2 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 Project_Model/Libs/Transformer/Utils/task_type.py

diff --git a/Project_Model/Libs/Transformer/Utils/__init__.py b/Project_Model/Libs/Transformer/Utils/__init__.py
index 856b51f..2831ec4 100644
--- a/Project_Model/Libs/Transformer/Utils/__init__.py
+++ b/Project_Model/Libs/Transformer/Utils/__init__.py
@@ -1,3 +1,4 @@
 from .attention_mask import get_attention_mask
+from .task_type import TaskType
 
-__all__ = ["get_attention_mask"]
\ No newline at end of file
+__all__ = ["get_attention_mask", "TaskType"]
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/Utils/task_type.py b/Project_Model/Libs/Transformer/Utils/task_type.py
new file mode 100644
index 0000000..46a42eb
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Utils/task_type.py
@@ -0,0 +1,6 @@
+from enum import Enum, auto
+
+class TaskType(Enum):
+    RDF2TEXT = auto()
+    MASK = auto()
+    COMPLETATION = auto()
\ No newline at end of file

From 7e40a367017ed8dc34e0a5215f4e489ec608f8c9 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Sun, 5 Oct 2025 22:58:06 +0200
Subject: [PATCH 4/7] wip: NanoSocratesCore

---
 Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py

diff --git a/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py b/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
new file mode 100644
index 0000000..7b2a9b0
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
@@ -0,0 +1,6 @@
+from ..Utils.task_type import TaskType
+
+class NanoSocratesCore():
+
+    def __init__(self) -> None:
+        pass
\ No newline at end of file

From 87409fecd5213eca5f96f7655dec89b4e63f1bc3 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Mon, 6 Oct 2025 12:00:11 +0200
Subject: [PATCH 5/7] added method for batched attention_mask

---
 Project_Model/Libs/Transformer/Utils/__init__.py       | 4 ++--
 Project_Model/Libs/Transformer/Utils/attention_mask.py | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/Project_Model/Libs/Transformer/Utils/__init__.py b/Project_Model/Libs/Transformer/Utils/__init__.py
index 2831ec4..d4dfba3 100644
--- a/Project_Model/Libs/Transformer/Utils/__init__.py
+++ b/Project_Model/Libs/Transformer/Utils/__init__.py
@@ -1,4 +1,4 @@
-from .attention_mask import get_attention_mask
+from .attention_mask import get_causal_attention_mask, get_causal_attention_mask_batched
 from .task_type import TaskType
 
-__all__ = ["get_attention_mask", "TaskType"]
\ No newline at end of file
+__all__ = ["get_causal_attention_mask", "TaskType", "get_causal_attention_mask_batched"]
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/Utils/attention_mask.py b/Project_Model/Libs/Transformer/Utils/attention_mask.py
index a6c595e..cb0ddcf 100644
--- a/Project_Model/Libs/Transformer/Utils/attention_mask.py
+++ b/Project_Model/Libs/Transformer/Utils/attention_mask.py
@@ -1,4 +1,9 @@
 import torch
 
-def get_attention_mask(embedding_dimension: int) -> torch.Tensor:
-    return torch.triu(torch.ones(embedding_dimension, embedding_dimension, dtype=torch.bool), diagonal=1)
\ No newline at end of file
+def get_causal_attention_mask(embedding_dimension: int) -> torch.Tensor:
+    return torch.triu(torch.ones(embedding_dimension, embedding_dimension, dtype=torch.bool), diagonal=1)
+
+def get_causal_attention_mask_batched(embedding_dimension: int, batch_size: int ) -> torch.Tensor:
+    base_mask = get_causal_attention_mask(embedding_dimension)
+    return base_mask.unsqueeze(0).expand(batch_size, -1, -1) # add another dimension at the beginning, big as batch_size
+    # the result is that z,x,y where x,y are repeated along z
\ No newline at end of file

From 948c3fd7ac6361d586ce44fb9ec920ba1c1c6741 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Mon, 6 Oct 2025 13:03:03 +0200
Subject: [PATCH 6/7] update to batch attention mask

---
 Project_Model/Libs/Transformer/Classes/Decoder.py       | 12 ++++++++----
 .../Transformer/Classes/TorchMultiHeadAttention.py      | 10 +++++-----
 .../Libs/Transformer/Utils/attention_mask.py            | 10 ++++++----
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/Project_Model/Libs/Transformer/Classes/Decoder.py b/Project_Model/Libs/Transformer/Classes/Decoder.py
index a1f5074..11e9aa7 100644
--- a/Project_Model/Libs/Transformer/Classes/Decoder.py
+++ b/Project_Model/Libs/Transformer/Classes/Decoder.py
@@ -2,8 +2,9 @@ import torch
 import torch.nn as nn
 from .FeedForwardNetwork import FeedForwardNetwork
 from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
-from ..Utils.attention_mask import get_attention_mask
+from ..Utils.attention_mask import get_causal_attention_mask
+# B, L(T), E_D
 
 
 
 class Decoder(nn.Module):
@@ -17,7 +18,7 @@ class Decoder(nn.Module):
         super().__init__()
 
         self.__masked_attention = MultiHeadAttention(
-            embedding_dimension, number_of_attention_heads, dropout=0.1, attention_mask=get_attention_mask(embedding_dimension)
+            embedding_dimension, number_of_attention_heads, dropout=0.1
         )
 
         self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
@@ -38,9 +39,12 @@ class Decoder(nn.Module):
 
     def forward(self, x, k_x, v_x, padding_mask = None) -> torch.Tensor:  # k_x = v_x . While x_q = x
 
+        # build the causal attention mask
+        attention_mask = get_causal_attention_mask(x.size(1))
+
         # 1) Masked Attention
         MASKED_ATTENTION = self.__masked_attention(
-            x, x, x, key_padding_mask=padding_mask
+            x, x, x, key_padding_mask=padding_mask, attn_mask=attention_mask
         )
 
         # 2) Dropout
@@ -57,7 +61,7 @@ class Decoder(nn.Module):
         x = self.__layer_norm_1(x)
 
         # 5) Encoder–decoder (cross) attention
-        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x key_padding_mask=padding_mask)
+        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x, key_padding_mask=padding_mask)
 
         # 6) Dropout
         DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
diff --git a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
index d310ac8..52c0cc5 100644
--- a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
+++ b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
@@ -9,7 +9,6 @@ class TorchMultiHeadAttention(nn.Module):
         embedding_dimension: int,
         number_of_attention_heads: int,
         dropout: float = 0.0,
-        attention_mask: Optional[torch.Tensor] = None
     ):
         super().__init__()
         self.attention = nn.MultiheadAttention(
@@ -19,7 +18,6 @@ class TorchMultiHeadAttention(nn.Module):
             batch_first=True,
         )
 
-        self.__attention_mask = attention_mask
 
     def forward(
         self,
@@ -27,14 +25,16 @@ class TorchMultiHeadAttention(nn.Module):
         x_k: torch.Tensor,
         x_v: torch.Tensor,
         key_padding_mask=None,
+        attention_mask: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
 
         # x * Wq -> Q
         # x * Wk -> K
         # x * Wv -> V
-
-        y, _ = self.attention.forward(
-            x_q, x_k, x_v, attn_mask=self.__attention_mask, key_padding_mask=key_padding_mask
+        # REMEMBER: torch's MultiheadAttention broadcasts a 2-D attn_mask over the batch internally to build the 3-D mask
+        y, _ = self.attention(
+            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask,
+            need_weights=False
         )
 
         return y
diff --git a/Project_Model/Libs/Transformer/Utils/attention_mask.py b/Project_Model/Libs/Transformer/Utils/attention_mask.py
index cb0ddcf..b1e97f3 100644
--- a/Project_Model/Libs/Transformer/Utils/attention_mask.py
+++ b/Project_Model/Libs/Transformer/Utils/attention_mask.py
@@ -1,9 +1,11 @@
 import torch
 
-def get_causal_attention_mask(embedding_dimension: int) -> torch.Tensor:
-    return torch.triu(torch.ones(embedding_dimension, embedding_dimension, dtype=torch.bool), diagonal=1)
+def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
+    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
 
-def get_causal_attention_mask_batched(embedding_dimension: int, batch_size: int ) -> torch.Tensor:
-    base_mask = get_causal_attention_mask(embedding_dimension)
+
+# there is no need for this since MultiHeadAttention of torch does this internally
+def get_causal_attention_mask_batched(seq_len: int, batch_size: int ) -> torch.Tensor:
+    base_mask = get_causal_attention_mask(seq_len)
     return base_mask.unsqueeze(0).expand(batch_size, -1, -1) # add another dimension at the beginning, big as batch_size
     # the result is that z,x,y where x,y are repeated along z
\ No newline at end of file

From 05bb4609999d0f0a1b0b88d3c8f463876cfdd00d Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Mon, 6 Oct 2025 13:03:20 +0200
Subject: [PATCH 7/7] file to test batch attention mask

---
 Playgrounds/prova.ipynb | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 Playgrounds/prova.ipynb

diff --git a/Playgrounds/prova.ipynb b/Playgrounds/prova.ipynb
new file mode 100644
index 0000000..a4996bb
--- /dev/null
+++ b/Playgrounds/prova.ipynb
@@ -0,0 +1,41 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "4ae47336",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "B, T, D = 4, 7, 32\n",
+    "x = torch.randn(B, T, D)\n",
+    "attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
+    "pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
+    "mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
+    "y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}