diff --git a/Playgrounds/prova.ipynb b/Playgrounds/prova.ipynb
new file mode 100644
index 0000000..a4996bb
--- /dev/null
+++ b/Playgrounds/prova.ipynb
@@ -0,0 +1,41 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "4ae47336",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "B, T, D = 4, 7, 32\n",
+    "x = torch.randn(B, T, D)\n",
+    "attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
+    "pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
+    "mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
+    "y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/Project_Model/Libs/Transformer/Classes/Decoder.py b/Project_Model/Libs/Transformer/Classes/Decoder.py
index d21a9ea..11e9aa7 100644
--- a/Project_Model/Libs/Transformer/Classes/Decoder.py
+++ b/Project_Model/Libs/Transformer/Classes/Decoder.py
@@ -2,6 +2,9 @@ import torch
 import torch.nn as nn
 from .FeedForwardNetwork import FeedForwardNetwork
 from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
+from ..Utils.attention_mask import get_causal_attention_mask
+
+# shape convention: B (batch), L/T (sequence length), E_D (embedding dimension)
 
 
 class Decoder(nn.Module):
@@ -32,11 +35,16 @@ class Decoder(nn.Module):
         )
         self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)
 
-    def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor: # k_x = v_x . While x_q = x
+
+
+    def forward(self, x, k_x, v_x, padding_mask=None) -> torch.Tensor:  # k_x = v_x, while x_q = x
+
+        # build the causal attention mask on the same device as the input
+        attention_mask = get_causal_attention_mask(x.size(1)).to(x.device)
 
         # 1) Masked Attention
         MASKED_ATTENTION = self.__masked_attention(
-            x, x, x, attention_mask=attention_mask
+            x, x, x, key_padding_mask=padding_mask, attention_mask=attention_mask
         )
 
         # 2) Dropout
@@ -53,7 +61,7 @@
         x = self.__layer_norm_1(x)
 
         # 5) Encoder–decoder (cross) attention
-        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x)
+        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x, key_padding_mask=padding_mask)
 
         # 6) Dropout
         DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
diff --git a/Project_Model/Libs/Transformer/Classes/Encoder.py b/Project_Model/Libs/Transformer/Classes/Encoder.py
index 8adfc76..cdec92a 100644
--- a/Project_Model/Libs/Transformer/Classes/Encoder.py
+++ b/Project_Model/Libs/Transformer/Classes/Encoder.py
@@ -31,12 +31,12 @@ class Encoder(
         self.__dropout = nn.Dropout(0.1)
         # ...
         pass
-    def forward(self, x):
+    def forward(self, x, padding_mask=None):
         # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
 
         # Attention with Residual Connection [ input + self-attention]
         # 1) Multi Head Attention
-        ATTENTION = self.__attention(x, x, x)
+        ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)
 
         # 2) Dropout
         DROPPED_ATTENTION = self.__dropout(ATTENTION)
diff --git a/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py b/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
new file mode 100644
index 0000000..7b2a9b0
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
@@ -0,0 +1,6 @@
+from ..Utils.task_type import TaskType
+
+class NanoSocratesCore():
+
+    def __init__(self) -> None:
+        pass
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
index 6081f75..52c0cc5 100644
--- a/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
+++ b/Project_Model/Libs/Transformer/Classes/TorchMultiHeadAttention.py
@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
-
+from typing import Optional
 
 class TorchMultiHeadAttention(nn.Module):
 
@@ -18,21 +18,23 @@
             batch_first=True,
         )
 
+
     def forward(
         self,
         x_q: torch.Tensor,
         x_k: torch.Tensor,
         x_v: torch.Tensor,
-        attention_mask=None,
         key_padding_mask=None,
+        attention_mask: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
         # x * Wq -> Q
         # x * Wk -> K
         # x * Wv -> V
-
-        y, _ = self.attention.forward(
-            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask
+        # REMEMBER: torch's MultiheadAttention broadcasts a 2-D attn_mask over batch and heads internally
+        y, _ = self.attention(
+            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask,
+            need_weights=False
         )
         return y
diff --git a/Project_Model/Libs/Transformer/Utils/__init__.py b/Project_Model/Libs/Transformer/Utils/__init__.py
index e69de29..d4dfba3 100644
--- a/Project_Model/Libs/Transformer/Utils/__init__.py
+++ b/Project_Model/Libs/Transformer/Utils/__init__.py
@@ -0,0 +1,4 @@
+from .attention_mask import get_causal_attention_mask, get_causal_attention_mask_batched
+from .task_type import TaskType
+
+__all__ = ["get_causal_attention_mask", "TaskType", "get_causal_attention_mask_batched"]
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/Utils/attention_mask.py b/Project_Model/Libs/Transformer/Utils/attention_mask.py
new file mode 100644
index 0000000..b1e97f3
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Utils/attention_mask.py
@@ -0,0 +1,11 @@
+import torch
+
+def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
+    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
+
+
+# not strictly needed: torch's MultiheadAttention broadcasts a 2-D mask over the batch internally
+def get_causal_attention_mask_batched(seq_len: int, batch_size: int) -> torch.Tensor:
+    base_mask = get_causal_attention_mask(seq_len)
+    return base_mask.unsqueeze(0).expand(batch_size, -1, -1)  # add a leading dimension of size batch_size
+    # result shape: [batch_size, seq_len, seq_len], the same 2-D mask repeated along the batch axis
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/Utils/task_type.py b/Project_Model/Libs/Transformer/Utils/task_type.py
new file mode 100644
index 0000000..46a42eb
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Utils/task_type.py
@@ -0,0 +1,6 @@
+from enum import Enum, auto
+
+class TaskType(Enum):
+    RDF2TEXT = auto()
+    MASK = auto()
+    COMPLETION = auto()
\ No newline at end of file
diff --git a/Project_Model/Libs/Transformer/__init__.py b/Project_Model/Libs/Transformer/__init__.py
index e384727..d906699 100644
--- a/Project_Model/Libs/Transformer/__init__.py
+++ b/Project_Model/Libs/Transformer/__init__.py
@@ -1,3 +1,5 @@
 from .Classes import *
+from .Utils import *
 
-from . import Classes
\ No newline at end of file
+from . import Classes
+from . import Utils
\ No newline at end of file
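
Note (not part of the patch): the comment in TorchMultiHeadAttention and the remark above get_causal_attention_mask_batched both rely on nn.MultiheadAttention broadcasting a 2-D boolean attn_mask over batch and heads. Below is a minimal standalone sketch of that claim; it re-declares the mask helper locally and uses made-up toy sizes (B, T, D, H), comparing the 2-D mask against the explicitly expanded [B*H, T, T] mask that the layer accepts as a 3-D attn_mask.

import torch

def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
    # True above the diagonal = positions a query may NOT attend to
    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)

B, T, D, H = 2, 5, 16, 4                              # assumed toy sizes
x = torch.randn(B, T, D)
mha = torch.nn.MultiheadAttention(D, num_heads=H, batch_first=True)

mask_2d = get_causal_attention_mask(T)                # [T, T], broadcast internally
mask_3d = mask_2d.unsqueeze(0).expand(B * H, T, T)    # the 3-D shape torch accepts explicitly

with torch.no_grad():
    y_2d, _ = mha(x, x, x, attn_mask=mask_2d, need_weights=False)
    y_3d, _ = mha(x, x, x, attn_mask=mask_3d, need_weights=False)

print(torch.allclose(y_2d, y_3d))  # expected: True

This also shows why get_causal_attention_mask_batched is not consumed by the layer directly: a 3-D attn_mask must have shape [batch * num_heads, T, T], not [batch, T, T].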