Christian Risi 2025-10-06 15:55:44 +02:00
commit b1e7af0607
9 changed files with 91 additions and 11 deletions

Playgrounds/prova.ipynb (Normal file, 41 lines added)
View File

@@ -0,0 +1,41 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "4ae47336",
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "B, T, D = 4, 7, 32\n",
    "x = torch.randn(B, T, D)\n",
    "attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
    "pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
    "mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
    "y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "deep_learning",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
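As a quick sanity check on the notebook cell above (not part of the commit; it reuses the same shapes), perturbing only the last timestep should leave all earlier outputs unchanged when the causal attn_mask is applied:

import torch

torch.manual_seed(0)
B, T, D = 4, 7, 32
mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)
mha.eval()  # no dropout inside the attention module

attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
x = torch.randn(B, T, D)
x_future = x.clone()
x_future[:, -1, :] += 10.0  # change only the last timestep

with torch.no_grad():
    y1, _ = mha(x, x, x, attn_mask=attn_mask)
    y2, _ = mha(x_future, x_future, x_future, attn_mask=attn_mask)

# positions 0..T-2 never attend to position T-1, so their outputs match
assert torch.allclose(y1[:, :-1], y2[:, :-1], atol=1e-6)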

View File

@@ -2,6 +2,9 @@ import torch
 import torch.nn as nn
 from .FeedForwardNetwork import FeedForwardNetwork
 from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
+from ..Utils.attention_mask import get_causal_attention_mask

 # B, L(T), E_D
 class Decoder(nn.Module):
@@ -32,11 +35,16 @@ class Decoder(nn.Module):
         )
         self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)

-    def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor:  # k_x = v_x, while x_q = x
+    def forward(self, x, k_x, v_x, padding_mask=None) -> torch.Tensor:  # k_x = v_x, while x_q = x
+
+        # build the causal attention mask
+        attention_mask = get_causal_attention_mask(x.size(1))

         # 1) Masked Attention
         MASKED_ATTENTION = self.__masked_attention(
-            x, x, x, attention_mask=attention_mask
+            x, x, x, key_padding_mask=padding_mask, attn_mask=attention_mask
         )

         # 2) Dropout
@@ -53,7 +61,7 @@ class Decoder(nn.Module):
         x = self.__layer_norm_1(x)

         # 5) Encoder-decoder (cross) attention
-        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x)
+        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x, key_padding_mask=padding_mask)

         # 6) Dropout
         DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
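A hedged sketch of what the two changed attention calls amount to, written directly against nn.MultiheadAttention with made-up shapes (B, T_dec, T_enc, D and both masks are assumptions, not taken from the repository). The step numbers mirror the decoder's own comments; note that for cross-attention the key_padding_mask refers to the key (encoder) positions:

import torch
from torch import nn

B, T_dec, T_enc, D = 2, 5, 8, 16
x = torch.randn(B, T_dec, D)        # decoder input (queries)
enc = torch.randn(B, T_enc, D)      # encoder output, used as k_x / v_x
dec_pad = torch.zeros(B, T_dec, dtype=torch.bool)  # decoder-side padding mask
enc_pad = torch.zeros(B, T_enc, dtype=torch.bool)  # encoder-side padding mask

# same call as get_causal_attention_mask(x.size(1))
causal = torch.triu(torch.ones(T_dec, T_dec, dtype=torch.bool), diagonal=1)

attn = nn.MultiheadAttention(D, num_heads=4, batch_first=True)

# 1) masked self-attention: causal mask plus padding mask over the decoder keys
self_out, _ = attn(x, x, x, attn_mask=causal, key_padding_mask=dec_pad)

# 5) cross-attention: padding mask only, covering the encoder (key) positions
cross_out, _ = attn(self_out, enc, enc, key_padding_mask=enc_pad)
print(self_out.shape, cross_out.shape)  # torch.Size([2, 5, 16]) torch.Size([2, 5, 16])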

View File

@@ -31,12 +31,12 @@ class Encoder(
         self.__dropout = nn.Dropout(0.1)  # ...
         pass

-    def forward(self, x):
+    def forward(self, x, padding_mask=None):
         # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->

         # Attention with Residual Connection [input + self-attention]

         # 1) Multi Head Attention
-        ATTENTION = self.__attention(x, x, x)
+        ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)

         # 2) Dropout
         DROPPED_ATTENTION = self.__dropout(ATTENTION)
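The Encoder now accepts a padding_mask; a minimal sketch of how such a mask is typically built from padded token ids (PAD_ID and the example ids are assumptions, not values from the repository):

import torch

PAD_ID = 0  # hypothetical padding token id
token_ids = torch.tensor([
    [5, 7, 9, PAD_ID, PAD_ID],
    [3, 4, 6, 8, 2],
])
padding_mask = token_ids.eq(PAD_ID)  # True where the position is padding and must be ignored as a key
print(padding_mask)
# tensor([[False, False, False,  True,  True],
#         [False, False, False, False, False]])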

View File

@@ -0,0 +1,6 @@
from ..Utils.task_type import TaskType
class NanoSocratesCore():
def __init__(self) -> None:
pass

View File

@@ -1,6 +1,6 @@
 import torch
 import torch.nn as nn
+from typing import Optional

 class TorchMultiHeadAttention(nn.Module):
@@ -18,21 +18,23 @@ class TorchMultiHeadAttention(nn.Module):
             batch_first=True,
         )

     def forward(
         self,
         x_q: torch.Tensor,
         x_k: torch.Tensor,
         x_v: torch.Tensor,
-        attention_mask=None,
         key_padding_mask=None,
+        attention_mask: Optional[torch.Tensor] = None
     ) -> torch.Tensor:

         # x * Wq -> Q
         # x * Wk -> K
         # x * Wv -> V

-        y, _ = self.attention.forward(
-            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask
+        # REMEMBER: torch's MultiheadAttention internally expands the 2-D attention mask across the batch, so a [T, T] mask is enough
+        y, _ = self.attention(
+            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask,
+            need_weights=False
         )
         return y
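For reference, with need_weights=False nn.MultiheadAttention returns (output, None), so the y, _ unpacking above still works while the attention-weight averaging is skipped. A small standalone check (the shapes are made up):

import torch

mha = torch.nn.MultiheadAttention(16, num_heads=4, batch_first=True)
x = torch.randn(2, 5, 16)
y, weights = mha(x, x, x, need_weights=False)
print(y.shape, weights)  # torch.Size([2, 5, 16]) None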

View File

@@ -0,0 +1,4 @@
from .attention_mask import get_causal_attention_mask, get_causal_attention_mask_batched
from .task_type import TaskType
__all__ = ["get_causal_attention_mask", "TaskType", "get_causal_attention_mask_batched"]

View File

@@ -0,0 +1,11 @@
import torch


def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
    return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)


# Not strictly needed: torch's MultiheadAttention expands a 2-D mask across the batch internally.
def get_causal_attention_mask_batched(seq_len: int, batch_size: int) -> torch.Tensor:
    base_mask = get_causal_attention_mask(seq_len)
    # Add a leading batch dimension and expand it to batch_size; the result has shape
    # [batch_size, seq_len, seq_len], i.e. the same 2-D mask repeated for every batch element.
    return base_mask.unsqueeze(0).expand(batch_size, -1, -1)
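For seq_len=4 the helper returns the boolean mask below (True marks the future positions that get masked out); reproduced here with the same torch.triu call the helper uses rather than by importing the module:

import torch

mask = torch.triu(torch.ones(4, 4, dtype=torch.bool), diagonal=1)
print(mask)
# tensor([[False,  True,  True,  True],
#         [False, False,  True,  True],
#         [False, False, False,  True],
#         [False, False, False, False]])

batched = mask.unsqueeze(0).expand(3, -1, -1)  # what the batched variant returns for batch_size=3
print(batched.shape)  # torch.Size([3, 4, 4])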

View File

@@ -0,0 +1,6 @@
from enum import Enum, auto
class TaskType(Enum):
RDF2TEXT = auto()
MASK = auto()
COMPLETATION = auto()

View File

@@ -1,3 +1,5 @@
 from .Classes import *
+from .Utils import *
 from . import Classes
+from . import Utils