Refactoring

Christian Risi 2025-10-05 15:40:29 +02:00
parent 3b5e6c099c
commit c60da8ba82
12 changed files with 270 additions and 137 deletions

View File

@@ -1,10 +1,7 @@
from .Utils import *
from .Classes import *
from .Utils import fixed_positional_encoding
from .Classes import NanoSocratesEmbedder
from . import Utils
from . import Classes
__all__ = [
"fixed_positional_encoding",
"NanoSocratesEmbedder"
]

View File

@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
class Decoder(nn.Module):
def __init__(
self,
embedding_dimension: int,
feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int,
) -> None:
super().__init__()
self.__masked_attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
self.__cross_attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)
self.__dropout = nn.Dropout(0.1)
self.__feed_forward_network = FeedForwardNetwork(
embedding_dimension, feed_forward_hidden_layer_dimension
)
self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)
def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor: # k_x = v_x (both are the encoder output), while x_q = x (the decoder stream)
# 1) Masked Attention
MASKED_ATTENTION = self.__masked_attention(
x, x, x, attention_mask=attention_mask
)
# 2) Dropout
DROPPED_MASKED_ATTENTION = self.__dropout(
MASKED_ATTENTION
)
del MASKED_ATTENTION
# 3) Residual Connection
x = x + DROPPED_MASKED_ATTENTION
del DROPPED_MASKED_ATTENTION
# 4) Layer Normalization
x = self.__layer_norm_1(x)
# 5) Encoder-decoder (cross) attention
CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x)
# 6) Dropout
DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
del CROSS_ATTENTION
# 7) Residual Connection
x = x + DROPPED_CROSS_ATTENTION
del DROPPED_CROSS_ATTENTION
# 8) Layer Normalization
x = self.__layer_norm_2(x)
# 9) Position-wise feed-forward
FEED_FORWARD = self.__feed_forward_network(x)
# 10) Dropout
DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
del FEED_FORWARD
# 11) Residual Connection
x = x + DROPPED_FEED_FORWARD
del DROPPED_FEED_FORWARD
# 12) Layer Normalization
x = self.__layer_norm_3(x)
return x
# use eval() to disable dropout, etc.
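A quick sketch of how this Decoder block might be exercised; the hyperparameters, batch shapes, and the causal mask below are illustrative assumptions, not values taken from the repository:

import torch

# Illustrative sizes (assumptions, not repository defaults).
EMBEDDING_DIMENSION = 256
FEED_FORWARD_DIMENSION = 1024
ATTENTION_HEADS = 8
BATCH_SIZE, SEQUENCE_LENGTH = 2, 16

decoder = Decoder(EMBEDDING_DIMENSION, FEED_FORWARD_DIMENSION, ATTENTION_HEADS)

# Boolean causal mask: True above the diagonal blocks attention to future positions.
causal_mask = torch.triu(
    torch.ones(SEQUENCE_LENGTH, SEQUENCE_LENGTH, dtype=torch.bool), diagonal=1
)

decoder_input = torch.randn(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSION)
encoder_output = torch.randn(BATCH_SIZE, SEQUENCE_LENGTH, EMBEDDING_DIMENSION)

# k_x and v_x are both the encoder output; x is the decoder stream.
output = decoder(decoder_input, encoder_output, encoder_output, causal_mask)
print(output.shape)  # torch.Size([2, 16, 256])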

View File

@@ -0,0 +1,68 @@
import torch.nn as nn
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
TorchMultiHeadAttention as MultiHeadAttention,
)
class Encoder(
nn.Module
): # subclassing nn.Module exposes its training primitives (parameters, train()/eval(), ...)
def __init__(
self,
embedding_dimension: int,
feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int,
) -> None:
super().__init__()
self.__attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_1 = nn.LayerNorm(
embedding_dimension
) # norm of first "Add and Normalize"
self.__feed_forward = FeedForwardNetwork(
embedding_dimension, feed_forward_hidden_layer_dimension
)
self.__layer_norm_2 = nn.LayerNorm(
embedding_dimension
) # norm of second "Add and Normalize"
self.__dropout = nn.Dropout(0.1) # shared dropout, applied after attention and after the feed-forward block
def forward(self, x):
# -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
# Attention with Residual Connection [ input + self-attention]
# 1) Multi Head Attention
ATTENTION = self.__attention(x, x, x)
# 2) Dropout
DROPPED_ATTENTION = self.__dropout(ATTENTION)
del ATTENTION
# 3) Residual Connection
x = x + DROPPED_ATTENTION
# 4) Layer Normalization
x = self.__layer_norm_1(x)
# 5) Feed Forward
FEED_FORWARD = self.__feed_forward(x)
# 6) Dropout
DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
del FEED_FORWARD
# 7) Residual Connection
x = x + DROPPED_FEED_FORWARD
del DROPPED_FEED_FORWARD
# 8) Layer Normalization
x = self.__layer_norm_2(x)
return x
# use eval() to disable dropout, etc.
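A minimal sketch of running this Encoder on a batch; the sizes below are illustrative assumptions:

import torch

encoder = Encoder(
    embedding_dimension=256,
    feed_forward_hidden_layer_dimension=1024,
    number_of_attention_heads=8,
)

tokens = torch.randn(2, 16, 256)  # (batch, sequence, embedding) since batch_first=True
encoded = encoder(tokens)
print(encoded.shape)  # torch.Size([2, 16, 256])

encoder.eval()  # disables the dropout layers for inference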

View File

@@ -0,0 +1,43 @@
# it is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
# Why do we need a fixed input length?
# https://ai.stackexchange.com/questions/37624/why-do-transformers-have-a-fixed-input-length
import torch.nn as nn
class FeedForwardNetwork(nn.Module):
def __init__(self, embedding_size: int, feed_forward_hidden_layer_dimension: int):
super().__init__()
self.__fully_connected_1 = nn.Linear(
embedding_size, feed_forward_hidden_layer_dimension
) # expand in higher dimension
self.__relu = nn.ReLU()
self.__dropout = nn.Dropout(
0.1
) # active during training; disabled once eval() is called
self.__fully_connected_2 = nn.Linear(
feed_forward_hidden_layer_dimension, embedding_size
) # return into the model dimension
def forward(self, x):
# -> NN1 -> ReLU -> (Dropout during training) -> NN2 ->
# 1) Linear Layer
x = self.__fully_connected_1(x)
# 2) ReLU
x = self.__relu(x)
# 3) Dropout
x = self.__dropout(x)
# 4) Linear Layer
x = self.__fully_connected_2(x)
return x
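Because the network is position-wise, the two linear layers act on the last (embedding) dimension and are shared across all positions; a quick shape check with illustrative sizes:

import torch

ffn = FeedForwardNetwork(embedding_size=256, feed_forward_hidden_layer_dimension=1024)

x = torch.randn(2, 16, 256)  # (batch, sequence, embedding)
y = ffn(x)                   # nn.Linear broadcasts over the leading dimensions
print(y.shape)               # torch.Size([2, 16, 256])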

View File

@@ -7,14 +7,18 @@
########################
# WIP
########################
import torch
import torch.nn as nn
embed_dim = 256
num_heads = 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
class MultiheadAttention():
def __init__(self, num_heads = 8, ) -> None:
class MultiheadAttention:
def __init__(
self,
num_heads=8,
) -> None:
pass

View File

@@ -0,0 +1,46 @@
import torch
import torch.nn as nn
class TorchMultiHeadAttention(nn.Module):
def __init__(
self,
embedding_dimension: int,
number_of_attention_heads: int,
dropout: float = 0.0,
):
super().__init__()
self.attention = nn.MultiheadAttention(
embedding_dimension,
number_of_attention_heads,
dropout=dropout,
batch_first=True,
)
def forward(
self,
x_q: torch.Tensor,
x_k: torch.Tensor,
x_v: torch.Tensor,
attention_mask=None,
key_padding_mask=None,
) -> torch.Tensor:
# x * Wq -> Q
# x * Wk -> K
# x * Wv -> V
y, _ = self.attention(
x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask
)
return y
# batch_first=False (historical default)
# Format: (L, N, E)
# L = sequence length (time steps / positions)
# N = batch size
# E = d_model (embedding) dimension
# batch_first=True
# Format: (N, L, E) (more natural for many models)
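A short sketch of calling the wrapper directly with a key padding mask (True marks padded positions that should be ignored as keys); the sizes are illustrative assumptions:

import torch

attention = TorchMultiHeadAttention(embedding_dimension=256, number_of_attention_heads=8)

x = torch.randn(2, 16, 256)  # (N, L, E) because batch_first=True
padding_mask = torch.zeros(2, 16, dtype=torch.bool)
padding_mask[:, 12:] = True  # treat the last four positions as padding

y = attention(x, x, x, key_padding_mask=padding_mask)
print(y.shape)  # torch.Size([2, 16, 256])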

View File

@@ -0,0 +1,13 @@
from .Decoder import Decoder
from .Encoder import Encoder
from .FeedForwardNetwork import FeedForwardNetwork
from .MultiHeadAttention import MultiheadAttention
from .TorchMultiHeadAttention import TorchMultiHeadAttention
__all__ = [
"Decoder",
"Encoder",
"FeedForwardNetwork",
"MultiheadAttention",
"TorchMultiHeadAttention"
]

View File

@@ -0,0 +1,3 @@
from .Classes import *
from . import Classes
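Assuming these files are Classes/__init__.py and the Transformer package __init__.py under Project_Model/Libs (consistent with the absolute imports used in the Encoder module), the re-exports allow either import style:

from Project_Model.Libs.Transformer.Classes import Decoder, Encoder
from Project_Model.Libs.Transformer import FeedForwardNetwork, TorchMultiHeadAttention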

View File

@@ -1,44 +0,0 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention
class Decoder(nn.Module):
def __init__(self, d_model:int, d_ff: int, attention_heads:int) -> None:
super().__init__()
self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
self.norm1 = nn.LayerNorm(d_model)
self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
self.norm2 = nn.LayerNorm(d_model)
self.dropout = nn.Dropout(0.1)
self.ffn = FeedForwardNetwork(d_model, d_ff)
self.norm3 = nn.LayerNorm(d_model)
pass
def forward(self, x, k_x, v_x, attention_mask): # k_x = v_x (both are the encoder output), while x_q = x (the decoder stream)
# 1) Masked self-attention
x = x + self.dropout(self._masked_attention(x, x, x, attention_mask= attention_mask))
x = self.norm1(x)
# 2) Encoder-decoder (cross) attention
x = x + self.dropout(self.attention(x, k_x, v_x))
x = self.norm2(x)
# 3) Position-wise feed-forward
x = x + self.dropout(self.ffn(x))
x = self.norm3(x)
return x
# use eval() to disable dropout, etc.

View File

@@ -1,32 +0,0 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention
class Encoder(nn.Module): # in this way we expose the primitives of nn.Module for training purposes
def __init__(self, d_model:int, d_ff: int, attention_heads:int) -> None:
super().__init__()
self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
self.norm1 = nn.LayerNorm(d_model) # norm of first "Add and Normalize"
self.ffn = FeedForwardNetwork(d_model, d_ff)
self.norm2 = nn.LayerNorm(d_model) # norm of second "Add and Normalize"
self.dropout = nn.Dropout(0.1) # ...
pass
def forward(self, x):
# -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
# Attention with Residual Connection [ input + self-attention]
x = x + self.dropout(self.attention(x, x, x))
x = self.norm1(x)
# Feedforward with Residual Connection [ normed self-attention + ff]
x = x + self.dropout(self.ffn(x))
x = self.norm2(x)
return x
# use eval() to disable dropout, etc.

View File

@@ -1,18 +0,0 @@
# it is position wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
import torch
import torch.nn as nn
class FeedForwardNetwork(nn.Module):
def __init__(self, d_model, d_ff):
super(FeedForwardNetwork, self).__init__()
self.fc1 = nn.Linear(d_model, d_ff) # expand in higher dimension
self.activation = nn.ReLU()
self.dropout = nn.Dropout(0.1) # during training we drop something, with eval it got deactivated
self.fc2 = nn.Linear(d_ff, d_model) # return into the model dimension
def forward(self, x):
# -> NN1 -> ReLU -> (Dropout during training) -> NN2 ->
return self.fc2(self.dropout(self.activation(self.fc1(x))))

View File

@@ -1,33 +0,0 @@
import torch
import torch.nn as nn
class TorchMultiHeadAttention(nn.Module):
def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
super().__init__()
self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)
def forward(
self,
x_q: torch.Tensor,
x_k: torch.Tensor,
x_v: torch.Tensor,
attention_mask = None,
key_padding_mask = None
) -> torch.Tensor:
# x * Wq -> Q
# x * Wk -> K
# x * Wv -> V
y, _ = self.attention.forward(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
return y
# batch_first=False (historical default)
# Format: (L, N, E)
# L = sequence length (time steps / positions)
# N = batch size
# E = d_model (embedding) dimension
# batch_first=True
# Format: (N, L, E) (more natural for many models)