Refactoring
This commit is contained in:
parent 3b5e6c099c
commit c60da8ba82
@@ -1,10 +1,7 @@
 from .Utils import *
 from .Classes import *
 
-from .Utils import fixed_positional_encoding
-from .Classes import NanoSocratesEmbedder
+from . import Utils
+from . import Classes
 
-__all__ = [
-    "fixed_positional_encoding",
-    "NanoSocratesEmbedder"
-]
+
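A small sketch, not part of the commit, of how the re-exported names can be reached after this change. The owning package path is not shown in this hunk, so the_package below is a hypothetical placeholder:

# Hypothetical usage; "the_package" stands in for the package that owns this __init__.py.
from the_package import Utils, Classes  # exposed by the new "from . import ..." lines

encoding_fn = Utils.fixed_positional_encoding  # reached through the Utils submodule
embedder_cls = Classes.NanoSocratesEmbedder    # reached through the Classes submodule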
Project_Model/Libs/Transformer/Classes/Decoder.py (new file, 86 lines)
@@ -0,0 +1,86 @@
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention


class Decoder(nn.Module):

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        super().__init__()

        self.__masked_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )

        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        self.__cross_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        self.__dropout = nn.Dropout(0.1)

        self.__feed_forward_network = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)

    def forward(self, x, k_x, v_x, attention_mask) -> torch.Tensor:  # k_x = v_x, while x_q = x

        # 1) Masked attention
        MASKED_ATTENTION = self.__masked_attention(
            x, x, x, attention_mask=attention_mask
        )

        # 2) Dropout
        DROPPED_MASKED_ATTENTION = self.__dropout(
            MASKED_ATTENTION
        )
        del MASKED_ATTENTION

        # 3) Residual connection
        x = x + DROPPED_MASKED_ATTENTION
        del DROPPED_MASKED_ATTENTION

        # 4) Layer normalization
        x = self.__layer_norm_1(x)

        # 5) Encoder-decoder (cross) attention
        CROSS_ATTENTION = self.__cross_attention(x, k_x, v_x)

        # 6) Dropout
        DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
        del CROSS_ATTENTION

        # 7) Residual connection
        x = x + DROPPED_CROSS_ATTENTION
        del DROPPED_CROSS_ATTENTION

        # 8) Layer normalization
        x = self.__layer_norm_2(x)

        # 9) Position-wise feed-forward
        FEED_FORWARD = self.__feed_forward_network(x)

        # 10) Dropout
        DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
        del FEED_FORWARD

        # 11) Residual connection
        x = x + DROPPED_FEED_FORWARD
        del DROPPED_FEED_FORWARD

        # 12) Layer normalization
        x = self.__layer_norm_3(x)

        return x


# use eval() to disable dropout etc.
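A minimal usage sketch for the new Decoder block, not part of the commit. It assumes illustrative sizes (d_model 256, hidden 1024, 8 heads) and the (batch, length, embedding) layout that TorchMultiHeadAttention's batch_first=True implies:

import torch
from Project_Model.Libs.Transformer.Classes import Decoder

decoder = Decoder(
    embedding_dimension=256,                    # must be divisible by the number of heads
    feed_forward_hidden_layer_dimension=1024,
    number_of_attention_heads=8,
)
decoder.eval()  # disables the Dropout layers, as the trailing comment suggests

target = torch.randn(2, 10, 256)          # decoder input: (batch, length, embedding)
encoder_output = torch.randn(2, 12, 256)  # used as both k_x and v_x

# Boolean causal mask for the masked self-attention: True = may NOT be attended to.
causal_mask = torch.triu(torch.ones(10, 10, dtype=torch.bool), diagonal=1)

with torch.no_grad():
    out = decoder(target, encoder_output, encoder_output, causal_mask)
print(out.shape)  # torch.Size([2, 10, 256])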
Project_Model/Libs/Transformer/Classes/Encoder.py (new file, 68 lines)
@@ -0,0 +1,68 @@
import torch.nn as nn
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
    TorchMultiHeadAttention as MultiHeadAttention,
)


class Encoder(
    nn.Module
):  # in this way we expose the primitives of nn.Module for training purposes

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        super().__init__()
        self.__attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_1 = nn.LayerNorm(
            embedding_dimension
        )  # norm of first "Add and Normalize"
        self.__feed_forward = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_2 = nn.LayerNorm(
            embedding_dimension
        )  # norm of second "Add and Normalize"
        self.__dropout = nn.Dropout(0.1)

    def forward(self, x):
        # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
        # Attention with residual connection [input + self-attention]

        # 1) Multi-head attention
        ATTENTION = self.__attention(x, x, x)

        # 2) Dropout
        DROPPED_ATTENTION = self.__dropout(ATTENTION)
        del ATTENTION

        # 3) Residual connection
        x = x + DROPPED_ATTENTION

        # 4) Layer normalization
        x = self.__layer_norm_1(x)

        # 5) Feed-forward
        FEED_FORWARD = self.__feed_forward(x)

        # 6) Dropout
        DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
        del FEED_FORWARD

        # 7) Residual connection
        x = x + DROPPED_FEED_FORWARD
        del DROPPED_FEED_FORWARD

        # 8) Layer normalization
        x = self.__layer_norm_2(x)

        return x


# use eval() to disable dropout etc.
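A corresponding sketch for the new Encoder, again illustrative only; the block is shape-preserving, so its output can later serve as k_x and v_x for the decoder:

import torch
from Project_Model.Libs.Transformer.Classes import Encoder

encoder = Encoder(
    embedding_dimension=256,
    feed_forward_hidden_layer_dimension=1024,
    number_of_attention_heads=8,
)
encoder.train()  # keeps the Dropout(0.1) layers active; encoder.eval() turns them off

x = torch.randn(2, 12, 256)  # (batch, sequence length, embedding)
y = encoder(x)
assert y.shape == x.shape    # the block is shape-preserving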
Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py (new file, 43 lines)
@@ -0,0 +1,43 @@
# it is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers

# Why do we need a fixed size?
# https://ai.stackexchange.com/questions/37624/why-do-transformers-have-a-fixed-input-length

import torch.nn as nn


class FeedForwardNetwork(nn.Module):

    def __init__(self, embedding_size: int, feed_forward_hidden_layer_dimension: int):

        super().__init__()
        self.__fully_connected_1 = nn.Linear(
            embedding_size, feed_forward_hidden_layer_dimension
        )  # expand into a higher dimension

        self.__relu = nn.ReLU()
        self.__dropout = nn.Dropout(
            0.1
        )  # during training we drop something; with eval() it gets deactivated

        self.__fully_connected_2 = nn.Linear(
            feed_forward_hidden_layer_dimension, embedding_size
        )  # project back to the model dimension

    def forward(self, x):
        # -> NN1 -> ReLU -> (dropout during training) -> NN2 ->

        # 1) Linear layer
        x = self.__fully_connected_1(x)

        # 2) ReLU
        x = self.__relu(x)

        # 3) Dropout
        x = self.__dropout(x)

        # 4) Linear layer
        x = self.__fully_connected_2(x)

        return x
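A short illustrative sketch of the position-wise behaviour: nn.Linear acts on the last dimension, so every position is expanded to the hidden size and projected back independently of the others:

import torch
from Project_Model.Libs.Transformer.Classes import FeedForwardNetwork

ffn = FeedForwardNetwork(embedding_size=256, feed_forward_hidden_layer_dimension=1024)
ffn.eval()  # dropout becomes a no-op outside training

tokens = torch.randn(2, 12, 256)  # (batch, positions, embedding)
out = ffn(tokens)                 # each position is mapped 256 -> 1024 -> ReLU -> 256 on its own
assert out.shape == tokens.shape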
@@ -7,14 +7,18 @@
 ########################
 # WIP
 ########################
-import torch
 import torch.nn as nn
 
 embed_dim = 256
 num_heads = 8
 multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
 
 
-class MultiheadAttention():
+class MultiheadAttention:
 
-    def __init__(self, num_heads = 8, ) -> None:
-        pass
+    def __init__(
+        self,
+        num_heads=8,
+    ) -> None:
+        pass
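Since this file is still marked WIP, here is a minimal sketch, not from the commit, of what the module-level multihead_attn built above already provides; with the default batch_first=False the expected layout is (L, N, E), matching the notes at the bottom of the new TorchMultiHeadAttention wrapper:

import torch
import torch.nn as nn

embed_dim = 256
num_heads = 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)  # batch_first defaults to False

# Default layout is (L, N, E): sequence length, batch, embedding.
query = torch.randn(10, 2, embed_dim)
key = torch.randn(12, 2, embed_dim)
value = torch.randn(12, 2, embed_dim)

attn_output, attn_weights = multihead_attn(query, key, value)
print(attn_output.shape)   # torch.Size([10, 2, 256])
print(attn_weights.shape)  # torch.Size([2, 10, 12]), averaged over heads by default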
@@ -0,0 +1,46 @@
import torch
import torch.nn as nn


class TorchMultiHeadAttention(nn.Module):

    def __init__(
        self,
        embedding_dimension: int,
        number_of_attention_heads: int,
        dropout: float = 0.0,
    ):
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embedding_dimension,
            number_of_attention_heads,
            dropout=dropout,
            batch_first=True,
        )

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        attention_mask=None,
        key_padding_mask=None,
    ) -> torch.Tensor:

        # x * Wq -> Q
        # x * Wk -> K
        # x * Wv -> V

        y, _ = self.attention.forward(
            x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask
        )
        return y


# batch_first=False (the historical default)
# Format: (L, N, E)
# L = sequence length (time steps/positions)
# N = batch size
# E = d_model (embedding) dimension

# batch_first=True
# Format: (N, L, E) (more natural for many models)
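An illustrative usage sketch for the wrapper, showing the (N, L, E) layout and an optional key_padding_mask in which True marks padded key positions:

import torch
from Project_Model.Libs.Transformer.Classes import TorchMultiHeadAttention

attention = TorchMultiHeadAttention(embedding_dimension=256, number_of_attention_heads=8)

x = torch.randn(2, 12, 256)  # (N, L, E) because the wrapper sets batch_first=True

# key_padding_mask: (N, source length) bool, True marks padded key positions to ignore.
padding_mask = torch.zeros(2, 12, dtype=torch.bool)
padding_mask[:, 10:] = True  # pretend the last two tokens of each sequence are padding

y = attention(x, x, x, key_padding_mask=padding_mask)
print(y.shape)  # torch.Size([2, 12, 256])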
Project_Model/Libs/Transformer/Classes/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from .Decoder import Decoder
from .Encoder import Encoder
from .FeedForwardNetwork import FeedForwardNetwork
from .MultiHeadAttention import MultiheadAttention
from .TorchMultiHeadAttention import TorchMultiHeadAttention

__all__ = [
    "Decoder",
    "Encoder",
    "FeedForwardNetwork",
    "MultiheadAttention",
    "TorchMultiHeadAttention"
]
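With these re-exports in place, the classes can be imported from the package directly; a minimal end-to-end sketch, with illustrative sizes, wiring the Encoder output into the Decoder:

import torch
from Project_Model.Libs.Transformer.Classes import Encoder, Decoder

encoder = Encoder(256, 1024, 8)  # d_model=256, hidden=1024, 8 heads (illustrative)
decoder = Decoder(256, 1024, 8)
encoder.eval()
decoder.eval()

source = torch.randn(2, 12, 256)  # encoder input
target = torch.randn(2, 10, 256)  # decoder input
causal_mask = torch.triu(torch.ones(10, 10, dtype=torch.bool), diagonal=1)

with torch.no_grad():
    memory = encoder(source)                               # (2, 12, 256)
    output = decoder(target, memory, memory, causal_mask)  # (2, 10, 256)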
@@ -0,0 +1,3 @@
from .Classes import *

from . import Classes
@@ -1,44 +0,0 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Decoder(nn.Module):

    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)

        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(0.1)

        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, k_x, v_x, attention_mask):  # k_x = v_x, while x_q = x

        # 1) Masked self-attention
        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask=attention_mask))
        x = self.norm1(x)

        # 2) Encoder-decoder (cross) attention
        x = x + self.dropout(self.attention(x, k_x, v_x))
        x = self.norm2(x)

        # 3) Position-wise feed-forward
        x = x + self.dropout(self.ffn(x))
        x = self.norm3(x)

        return x


# use eval() to disable dropout etc.
@@ -1,32 +0,0 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Encoder(nn.Module):  # in this way we expose the primitives of nn.Module for training purposes

    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)  # norm of first "Add and Normalize"
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)  # norm of second "Add and Normalize"
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        # -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
        # Attention with residual connection [input + self-attention]
        x = x + self.dropout(self.attention(x, x, x))
        x = self.norm1(x)

        # Feed-forward with residual connection [normed self-attention + ff]
        x = x + self.dropout(self.ffn(x))
        x = self.norm2(x)
        return x


# use eval() to disable dropout etc.
@@ -1,18 +0,0 @@
# it is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers

import torch
import torch.nn as nn


class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForwardNetwork, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand into a higher dimension
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)  # during training we drop something; with eval() it gets deactivated
        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model dimension

    def forward(self, x):
        # -> NN1 -> ReLU -> (dropout during training) -> NN2 ->
        return self.fc2(self.dropout(self.activation(self.fc1(x))))
@@ -1,33 +0,0 @@
import torch
import torch.nn as nn


class TorchMultiHeadAttention(nn.Module):

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        attention_mask=None,
        key_padding_mask=None,
    ) -> torch.Tensor:

        # x * Wq -> Q
        # x * Wk -> K
        # x * Wv -> V

        y, _ = self.attention.forward(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
        return y


# batch_first=False (the historical default)
# Format: (L, N, E)
# L = sequence length (time steps/positions)
# N = batch size
# E = d_model (embedding) dimension

# batch_first=True
# Format: (N, L, E) (more natural for many models)