Merge branch 'dev.train' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.train
This commit is contained in commit e2231eb3b9.
@@ -21,11 +21,18 @@ torch.set_default_device(DEVICE)
 # Get paths
+CHECKPOINT_DIR = "Assets/Dataset/Tmp"
 VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
 TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
 VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
 TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
-CHECKPOINT_PATH = Path("Assets/Dataset/Tmp/NanoSocrates.zip")
+CHECKPOINT_PATH = Path(f"{CHECKPOINT_DIR}/NanoSocrates.zip")
+
+NANO_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/nano_optim.zip")
+ENC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/enc_optim.zip")
+DEC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/dec_optim.zip")
+LAST_EPOCH_PATH = Path(f"{CHECKPOINT_DIR}/last_epoch.txt")
 
 
 
 
 # BPE Init
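Note: since every checkpoint artifact now hangs off CHECKPOINT_DIR, building the paths with pathlib's / operator instead of f-strings keeps them Path objects from the start; a minimal equivalent sketch (same names as the diff):

    from pathlib import Path

    CHECKPOINT_DIR = Path("Assets/Dataset/Tmp")

    CHECKPOINT_PATH = CHECKPOINT_DIR / "NanoSocrates.zip"
    NANO_OPTIM_PATH = CHECKPOINT_DIR / "nano_optim.zip"
    ENC_OPTIM_PATH = CHECKPOINT_DIR / "enc_optim.zip"
    DEC_OPTIM_PATH = CHECKPOINT_DIR / "dec_optim.zip"
    LAST_EPOCH_PATH = CHECKPOINT_DIR / "last_epoch.txt"

    # Ensure the directory exists before the first save.
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)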
@@ -50,7 +57,7 @@ MINI_BATCH_SIZE = 80
 VALIDATION_STEPS = 5
 CHECKPOINT_STEPS = VALIDATION_STEPS * 4
 PATIENCE = 4
-CURRENT_EPOCH = 0
+CURRENT_EPOCH = -1 if not LAST_EPOCH_PATH.is_file() else int(LAST_EPOCH_PATH.read_text())
 VERBOSE = True
 LEARNING_RATE = 1.5
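Note: the new CURRENT_EPOCH expression resumes from last_epoch.txt when it exists and starts at -1 otherwise. int() will raise on an empty or partially written file (for example, after an interrupted save); a slightly defensive variant, sketched with a hypothetical helper:

    from pathlib import Path

    def read_last_epoch(path: Path) -> int:
        """Return the last completed epoch, or -1 when no valid marker exists."""
        try:
            return int(path.read_text().strip())
        except (FileNotFoundError, ValueError):
            return -1

    CURRENT_EPOCH = read_last_epoch(LAST_EPOCH_PATH)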
@@ -78,7 +85,6 @@ TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKE
-
 
 # Model
 
 NANOSOCRATES = Transformer.TrainingModel(
     TOKEN_SPACE_SIZE,
     EMBEDDED_SIZE,
@@ -103,12 +109,25 @@ decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
 nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters(), LEARNING_RATE)
 encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters(), LEARNING_RATE)
 decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters(), LEARNING_RATE)
-nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)
+
+if NANO_OPTIM_PATH.is_file():
+    optim_dict = torch.load(NANO_OPTIM_PATH)
+    nano_optim.load_state_dict(optim_dict)
+
+if ENC_OPTIM_PATH.is_file():
+    optim_dict = torch.load(ENC_OPTIM_PATH)
+    encoder_only_optim.load_state_dict(optim_dict)
+
+if DEC_OPTIM_PATH.is_file():
+    optim_dict = torch.load(DEC_OPTIM_PATH)
+    decoder_only_optim.load_state_dict(optim_dict)
+
+nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH)
 encoder_only_scheduler = Transformer.WarmupLR(
-    encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
+    encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
 )
 decoder_only_scheduler = Transformer.WarmupLR(
-    decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
+    decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
 )
 
 current_epoch = CURRENT_EPOCH
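Note: the three restore blocks repeat one pattern and can be folded into a helper; a minimal sketch using the diff's own variable names (the helper itself is hypothetical). Passing last_epoch=CURRENT_EPOCH into WarmupLR assumes it follows the torch.optim.lr_scheduler convention where last_epoch=-1 means "start fresh", which lines up with CURRENT_EPOCH defaulting to -1 when last_epoch.txt is absent.

    from pathlib import Path

    import torch

    def maybe_restore(optim: torch.optim.Optimizer, path: Path) -> None:
        # Load saved optimizer state (moment estimates, step counts) if present.
        if path.is_file():
            optim.load_state_dict(torch.load(path))

    maybe_restore(nano_optim, NANO_OPTIM_PATH)
    maybe_restore(encoder_only_optim, ENC_OPTIM_PATH)
    maybe_restore(decoder_only_optim, DEC_OPTIM_PATH)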
@@ -209,7 +228,7 @@ while current_epoch < MAX_EPOCHS:
     decoder_only_optim.zero_grad()
 
-    pred_logits = DECODER_ONLY((dec_x, dec_x_pad))
+    pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
     pred_logits = pred_logits.permute(0, 2, 1)
 
     loss: torch.Tensor = decoder_ce(pred_logits, tgt)
@@ -297,7 +316,7 @@ while current_epoch < MAX_EPOCHS:
 
 
-    pred_logits = DECODER_ONLY((dec_x, dec_x_pad))
+    pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
 
     pred_logits = pred_logits.permute(0, 2, 1)
 
@@ -380,6 +399,13 @@ while current_epoch < MAX_EPOCHS:
     if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
         print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
         torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
+        torch.save(nano_optim.state_dict(), NANO_OPTIM_PATH)
+        torch.save(encoder_only_optim.state_dict(), ENC_OPTIM_PATH)
+        torch.save(decoder_only_optim.state_dict(), DEC_OPTIM_PATH)
+        FILE = open(LAST_EPOCH_PATH, "w", encoding="utf-8")
+        FILE.write(f"{current_epoch}")
+        FILE.close()
+
 
     if patience == PATIENCE:
         exit(0)
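Note: the open/write/close triple is a one-liner with pathlib, and the four torch.save calls could be bundled into a single dict so the files cannot drift out of sync if the process dies mid-save. A sketch of that alternative, using the same objects the diff saves (not what the commit does):

    # One-file alternative: keep model, optimizers, and epoch counter together.
    checkpoint = {
        "model": NANOSOCRATES.state_dict(),
        "nano_optim": nano_optim.state_dict(),
        "enc_optim": encoder_only_optim.state_dict(),
        "dec_optim": decoder_only_optim.state_dict(),
        "epoch": current_epoch,
    }
    torch.save(checkpoint, CHECKPOINT_PATH)

    # Equivalent one-liner for the epoch marker the commit writes by hand.
    LAST_EPOCH_PATH.write_text(str(current_epoch), encoding="utf-8")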
@@ -9,6 +9,7 @@ class NanoSocraDecoder(torch.nn.Module):
         decoder_embedder: Embedder.NanoSocratesEmbedder,
         decoder_layers: torch.nn.Sequential,
         detokener: DeToken
+
     ) -> None:
         super().__init__()
@@ -17,14 +18,14 @@ class NanoSocraDecoder(torch.nn.Module):
         self.__decoder = decoder_layers
         self.__detokener = detokener
 
-    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
+    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor]):
 
-        decoder_embedder_input, tgt_padding = args
+        decoder_embedder_input, prefix_mask, tgt_padding = args
 
         decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
 
         decoder_output, _, _, _, _, _ = self.__decoder(
-            (decoder_tensor, decoder_tensor, decoder_tensor, tgt_padding, tgt_padding, True)
+            (decoder_tensor, decoder_tensor, decoder_tensor, prefix_mask, tgt_padding, True)
         )
 
         logits: torch.Tensor = self.__detokener(decoder_output)
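Note: NanoSocraDecoder.forward now threads a separate prefix mask into the decoder stack instead of reusing tgt_padding in both mask slots, and the training script passes enc_x_pad there. This matches the prefix-LM pattern, where prompt positions are treated differently from the generated continuation. The diff does not show how callers build such a mask; one plausible construction, with every name here hypothetical (True means masked, matching the eq(pad) convention used elsewhere in the commit):

    import torch

    def make_prefix_mask(tokens: torch.Tensor, prefix_len: int, pad_id: int) -> torch.Tensor:
        """Hypothetical helper: True marks positions outside the visible prefix,
        i.e. everything past the first prefix_len tokens, plus padding."""
        batch, seq_len = tokens.shape
        positions = torch.arange(seq_len, device=tokens.device).expand(batch, seq_len)
        return (positions >= prefix_len) | tokens.eq(pad_id)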
Project_Model/Libs/Transformer/Models/NanoSocrates.py (new file, 176 lines)
@@ -0,0 +1,176 @@
+import torch
+import Project_Model.Libs.Embedder as Embedder
+from ..Classes import Encoder, Decoder, DeToken
+from ..Utils import get_decoder_input
+from Project_Model.Libs.Batch import TaskType
+
+
+class NanoSocratesCore(torch.nn.Module):
+
+    def __init__(
+        self,
+        vocabulary_size: int,
+        sentence_max_length: int,
+        sos: int,
+        pad: int,
+        eos: int,
+        latent_space: int = 256,
+        feed_forward_multiplier: int = 4,
+        attention_heads: int = 4,
+        layer_number: int = 2,
+    ) -> None:
+        super().__init__()
+
+        self.__sos = sos
+        self.__pad = pad
+        self.__eos = eos
+        self.__sentence_len = sentence_max_length
+
+        feed_forward_latent_space = latent_space * feed_forward_multiplier
+
+        self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
+            vocabulary_size, latent_space
+        )
+        self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
+            vocabulary_size, latent_space
+        )
+
+        TMP_ENCODERS = [
+            Encoder(latent_space, feed_forward_latent_space, attention_heads)
+        ] * layer_number
+
+        TMP_DECODERS = [
+            Decoder(latent_space, feed_forward_latent_space, attention_heads)
+        ] * layer_number
+
+        self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
+        self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
+
+        self.__detokener = DeToken(latent_space, vocabulary_size)
+        self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
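Note: in the constructor above, [Encoder(...)] * layer_number builds a list of layer_number references to one Encoder instance, so torch.nn.Sequential stacks the same parameters layer_number times (all "layers" share weights). If independent per-layer parameters are intended, the usual fix is a comprehension; a sketch:

    # Distinct layer instances (no parameter sharing), if that is the intent:
    TMP_ENCODERS = [
        Encoder(latent_space, feed_forward_latent_space, attention_heads)
        for _ in range(layer_number)
    ]
    TMP_DECODERS = [
        Decoder(latent_space, feed_forward_latent_space, attention_heads)
        for _ in range(layer_number)
    ]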
+
+    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
+
+        encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
+
+        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
+        decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
+
+        encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
+
+        decoder_output, _, _, _, _, _ = self.__decoder(
+            (decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
+        )
+
+        logits: torch.Tensor = self.__detokener(decoder_output)
+
+        return logits
+    def inference(self, input: tuple[torch.Tensor, torch.Tensor], task_type: TaskType) -> torch.Tensor:
+
+        if task_type == TaskType.MASKING:
+            return self.__masking(input)
+
+        if task_type == TaskType.COMPLETATION:
+            return self.__continue_rdf(input)
+
+        return self.__text_generation(input)
+
+    def __text_generation(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+
+        x, padding = args
+
+        encoder_tensor = self.__encoder_embedder(x)
+        BATCH, SEQ_LEN, _ = x.shape
+
+        encoder_output, _ = self.__encoder((encoder_tensor, padding))
+
+        decoder_in = get_decoder_input(BATCH, self.__sos, self.__pad, SEQ_LEN)
+        decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        continue_generating = True
+        token_idx = 0
+
+        while continue_generating:
+
+            decoder_in = self.__decoder_embedder(decoder_in)
+
+            decoder_output, _, _, _, _, _ = self.__decoder(
+                (decoder_in, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
+            )
+
+            logits: torch.Tensor = self.__detokener(decoder_output)
+
+            logits = torch.softmax(logits, 2)
+
+            tokens = torch.argmax(logits)
+
+            if tokens.shape[0] == 1 and tokens[0, token_idx] == self.__eos:
+                continue_generating = False
+                continue
+
+            if token_idx < self.__sentence_len - 1:
+                decoder_in[:, token_idx + 1] = tokens[:, token_idx]
+                decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        return decoder_in
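Note: a few things in the generation loop above deserve a second look. torch.argmax(logits) with no dim argument reduces the whole tensor to one scalar index, so the subsequent tokens.shape[0] and tokens[0, token_idx] would fail; per-position ids need argmax(logits, dim=-1), after which the softmax is redundant (argmax is unchanged by a monotonic map). decoder_in is also overwritten by its own embedding, so a second iteration would embed embeddings, and token_idx is never incremented, so the loop can only exit on an immediate EOS. The same argmax/softmax points apply to __masking and __continue_rdf below. A self-contained sketch of the intended greedy loop (the step callback and all names here are hypothetical, not this module's API):

    import torch

    def greedy_decode(step, sos: int, eos: int, pad: int, max_len: int, batch: int) -> torch.Tensor:
        """Greedy autoregressive decoding sketch.

        step(tokens) -> logits of shape (batch, seq_len, vocab), where tokens
        is a (batch, max_len) id tensor. Hypothetical interface.
        """
        tokens = torch.full((batch, max_len), pad, dtype=torch.long)
        tokens[:, 0] = sos
        finished = torch.zeros(batch, dtype=torch.bool)

        for idx in range(max_len - 1):
            logits = step(tokens)                      # (B, S, V)
            next_ids = logits[:, idx].argmax(dim=-1)   # greedy pick per sequence
            next_ids = next_ids.masked_fill(finished, pad)  # freeze finished rows
            tokens[:, idx + 1] = next_ids
            finished |= next_ids.eq(eos)
            if bool(finished.all()):                   # every sequence hit EOS
                break

        return tokens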
+    def __masking(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+
+        x, padding = args
+
+        encoder_tensor = self.__encoder_embedder(x)
+        x, _ = self.__encoder((encoder_tensor, padding))
+
+        logits: torch.Tensor = self.__encoder_detokener(x)
+        del x
+
+        logits = torch.softmax(logits, 2)
+
+        tokens = torch.argmax(logits)
+
+        return tokens
+    def __continue_rdf(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+
+        decoder_in, _ = args
+        decoder_in_prefix_mask = decoder_in.eq(self.__pad)
+        decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        continue_generating = True
+        token_idx = 0
+
+        while continue_generating:
+
+            decoder_in = self.__decoder_embedder(decoder_in)
+
+            decoder_output, _, _, _, _, _ = self.__decoder(
+                (decoder_in, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, False)
+            )
+
+            logits: torch.Tensor = self.__detokener(decoder_output)
+
+            logits = torch.softmax(logits, 2)
+
+            tokens = torch.argmax(logits)
+
+            if tokens.shape[0] == 1 and tokens[0, token_idx] == self.__eos:
+                continue_generating = False
+                continue
+
+            if token_idx < self.__sentence_len - 1:
+                decoder_in[:, token_idx + 1] = tokens[:, token_idx]
+                decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        return decoder_in
+    def take_pieces(self):
+
+        return (
+            (self.__encoder_embedder, self.__encoder),
+            (self.__decoder_embedder, self.__decoder, self.__detokener)
+        )
@@ -36,6 +36,7 @@ class TrainingModel(torch.nn.Module):
         self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
 
         self.__detokener = DeToken(latent_space, vocabulary_size)
+        self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
 
     def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
 
@@ -57,6 +58,6 @@ class TrainingModel(torch.nn.Module):
     def take_pieces(self):
 
         return (
-            (self.__encoder_embedder, self.__encoder),
+            (self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
             (self.__decoder_embedder, self.__decoder, self.__detokener)
         )
@@ -10,8 +10,7 @@ def decompose_nano_socrates(
 ) -> tuple[TrainingModel, NanoSocratEncoder, NanoSocraDecoder]:
 
     encoder_pieces, decoder_pieces = model.take_pieces()
-    encoder_embedder, encoder = encoder_pieces
-    encoder_detokener = DeToken(embedding_size, vocabulary_size)
+    encoder_embedder, encoder, encoder_detokener = encoder_pieces
     decoder_embedder, decoder, decoder_detokener = decoder_pieces
 
     return (
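Note: with take_pieces now returning the trained encoder DeToken head, decompose_nano_socrates no longer fabricates a fresh, randomly initialised one, so the standalone encoder's masking head keeps its trained weights. The embedding_size and vocabulary_size parameters that only fed the old DeToken call may now be dead and could be dropped from the signature. Also, NanoSocratesCore.take_pieces in the new file still returns a two-element encoder tuple, unlike TrainingModel's updated three-element one, so the two pieces APIs are no longer interchangeable. Hypothetical usage, assuming the call shape suggested by the return annotation:

    # Split the trained composite model into standalone encoder/decoder halves.
    full_model, encoder_only, decoder_only = decompose_nano_socrates(NANOSOCRATES)

    # encoder_only now reuses the trained encoder_detokener, so masked-token
    # predictions match what the TrainingModel learned end to end.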