added first classes (Encoder, Decoder, Attention) for the model
This commit is contained in:
parent 9b656e7918
commit 76200d936d
44
Project_Model/Libs/Transformer/decoder.py
Normal file
@@ -0,0 +1,44 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Decoder(nn.Module):

    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self._masked_attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)

        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm2 = nn.LayerNorm(d_model)

        self.dropout = nn.Dropout(0.1)

        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, k_x, v_x, attention_mask):  # k_x = v_x (encoder output), while x_q = x

        # 1) Masked self-attention
        x = x + self.dropout(self._masked_attention(x, x, x, attention_mask=attention_mask))
        x = self.norm1(x)

        # 2) Encoder-decoder (cross) attention
        x = x + self.dropout(self.attention(x, k_x, v_x))
        x = self.norm2(x)

        # 3) Position-wise feed-forward
        x = x + self.dropout(self.ffn(x))
        x = self.norm3(x)

        return x


# use eval() to disable dropout etc. at inference time
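A minimal shape check for the Decoder block above (not part of the commit; the sizes, the causal mask, and the assumption that Project_Model/Libs is on the import path are illustrative):

import torch
from Transformer.decoder import Decoder

decoder = Decoder(d_model=512, d_ff=2048, attention_heads=8)
tgt = torch.randn(2, 10, 512)                                           # (batch, tgt_len, d_model), batch_first=True
memory = torch.randn(2, 12, 512)                                        # encoder output, (batch, src_len, d_model)
causal = torch.triu(torch.ones(10, 10, dtype=torch.bool), diagonal=1)   # True = do not attend (future positions)
out = decoder(tgt, memory, memory, attention_mask=causal)
print(out.shape)                                                        # torch.Size([2, 10, 512])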
32
Project_Model/Libs/Transformer/encoder.py
Normal file
@@ -0,0 +1,32 @@
import torch
import torch.nn as nn
from Transformer.feed_forward_nn import FeedForwardNetwork
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention as MultiHeadAttention


class Encoder(nn.Module):  # subclassing nn.Module exposes its primitives (parameters, train/eval, ...) for training

    def __init__(self, d_model: int, d_ff: int, attention_heads: int) -> None:
        super().__init__()
        self.attention = MultiHeadAttention(d_model, attention_heads, dropout=0.1)
        self.norm1 = nn.LayerNorm(d_model)  # norm of the first "Add & Normalize"
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)  # norm of the second "Add & Normalize"
        self.dropout = nn.Dropout(0.1)      # applied to each sublayer output

    def forward(self, x):
        # -> ATTENTION -> dropout -> add & normalize -> FF -> dropout -> add & normalize ->

        # attention with residual connection [input + self-attention]
        x = x + self.dropout(self.attention(x, x, x))
        x = self.norm1(x)

        # feed-forward with residual connection [normed self-attention + ff]
        x = x + self.dropout(self.ffn(x))
        x = self.norm2(x)
        return x


# use eval() to disable dropout etc. at inference time
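A quick smoke test for the Encoder block (not part of the commit; sizes are arbitrary):

import torch
from Transformer.encoder import Encoder

encoder = Encoder(d_model=512, d_ff=2048, attention_heads=8)
src = torch.randn(2, 12, 512)   # (batch, src_len, d_model), batch_first=True
out = encoder(src)
print(out.shape)                # torch.Size([2, 12, 512])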
18
Project_Model/Libs/Transformer/feed_forward_nn.py
Normal file
@@ -0,0 +1,18 @@
# it is position-wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers

import torch
import torch.nn as nn


class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)  # expand to a higher dimension
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(0.1)       # active during training; eval() deactivates it
        self.fc2 = nn.Linear(d_ff, d_model)  # project back to the model dimension

    def forward(self, x):
        # -> NN1 -> ReLU -> (dropout during training) -> NN2 ->
        return self.fc2(self.dropout(self.activation(self.fc1(x))))
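"Position-wise" means the same two linear layers are applied independently at every sequence position; a small check (not part of the commit):

import torch
from Transformer.feed_forward_nn import FeedForwardNetwork

ffn = FeedForwardNetwork(d_model=512, d_ff=2048)
ffn.eval()                                     # disable dropout so outputs are deterministic
x = torch.randn(2, 10, 512)
y = ffn(x)
print(y.shape)                                 # torch.Size([2, 10, 512])
print(torch.allclose(y[0, 3], ffn(x[0, 3])))   # True: each position is transformed independently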
20
Project_Model/Libs/Transformer/multi_head_attention.py
Normal file
@@ -0,0 +1,20 @@
# multi-head attention -> (then to) ff
# attention: q, k, v -> score = q·k -> scale -> softmax
# multi-head -> Q, K, V different in each head (built as X * [WQ / WK / WV])
# z = softmax(Q * K^T / sqrt(d)) * V
# recombine Z: 1) concatenate  2) [z0 z1 z2 z3 z4] * W = Z
# we expect to handle padding tokens later
########################
# WIP
########################
import torch
import torch.nn as nn

embed_dim = 256
num_heads = 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)


class MultiheadAttention():

    def __init__(self, num_heads=8) -> None:
        pass
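The notes above describe scaled dot-product attention, z = softmax(Q·K^T / sqrt(d_k))·V. A minimal sketch of that step (illustrative only; the working code in this commit delegates to nn.MultiheadAttention instead):

import math
import torch

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: (batch, heads, seq_len, d_k)
    scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))  # (batch, heads, L_q, L_k)
    if mask is not None:
        scores = scores.masked_fill(mask, float("-inf"))      # True = position not allowed
    weights = torch.softmax(scores, dim=-1)
    return weights @ v                                        # (batch, heads, L_q, d_k)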
33
Project_Model/Libs/Transformer/pytorch_multi_head_attention.py
Normal file
@@ -0,0 +1,33 @@
import torch
import torch.nn as nn


class TorchMultiHeadAttention(nn.Module):

    def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.0):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout, batch_first=True)

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        attention_mask=None,
        key_padding_mask=None
    ) -> torch.Tensor:

        # x * Wq -> Q
        # x * Wk -> K
        # x * Wv -> V

        y, _ = self.attention(x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask)
        return y


# batch_first=False (historical default)
#   shape: (L, N, E)
#   L = sequence length (time steps / positions)
#   N = batch size
#   E = d_model (embedding) dimension
# batch_first=True
#   shape: (N, L, E) (more natural for many models)
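Usage sketch for the wrapper (not part of the commit): with batch_first=True the inputs are (N, L, E); attn_mask is (L_q, L_k) with True meaning "do not attend", and key_padding_mask is (N, L_k) with True marking padded key positions.

import torch
from Transformer.pytorch_multi_head_attention import TorchMultiHeadAttention

mha = TorchMultiHeadAttention(embed_dim=256, num_heads=8, dropout=0.1)
q = torch.randn(2, 10, 256)                 # (N, L_q, E)
kv = torch.randn(2, 12, 256)                # (N, L_k, E)
pad = torch.zeros(2, 12, dtype=torch.bool)  # True would mark padded key positions
out = mha(q, kv, kv, key_padding_mask=pad)
print(out.shape)                            # torch.Size([2, 10, 256])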