diff --git a/Playgrounds/nanosocrates-train-experiment-2.py b/Playgrounds/nanosocrates-train-experiment-2.py
index d845fe0..8eeda63 100644
--- a/Playgrounds/nanosocrates-train-experiment-2.py
+++ b/Playgrounds/nanosocrates-train-experiment-2.py
@@ -21,11 +21,18 @@ torch.set_default_device(DEVICE)
 
 
 # Get paths
+CHECKPOINT_DIR = "Assets/Dataset/Tmp"
 VOCABULARY_PATH = Path("Assets/Model/small/bpe-small-16.json")
 TRAIN_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/train.csv")
 VALIDATION_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/evaluation.csv")
 TEST_DATASET_PATH = Path("Assets/Dataset/1-hop/small/holdout/test.csv")
-CHECKPOINT_PATH = Path("Assets/Dataset/Tmp/NanoSocrates.zip")
+CHECKPOINT_PATH = Path(f"{CHECKPOINT_DIR}/NanoSocrates.zip")
+
+NANO_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/nano_optim.zip")
+ENC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/enc_optim.zip")
+DEC_OPTIM_PATH = Path(f"{CHECKPOINT_DIR}/dec_optim.zip")
+LAST_EPOCH_PATH = Path(f"{CHECKPOINT_DIR}/last_epoch.txt")
+
 
 
 # BPE Init
@@ -50,7 +57,7 @@ MINI_BATCH_SIZE = 80
 VALIDATION_STEPS = 5
 CHECKPOINT_STEPS = VALIDATION_STEPS * 4
 PATIENCE = 4
-CURRENT_EPOCH = 0
+CURRENT_EPOCH = -1 if not LAST_EPOCH_PATH.is_file() else int(LAST_EPOCH_PATH.read_text())
 VERBOSE = True
 LEARNING_RATE = 1.5
 
@@ -78,7 +85,6 @@ TEST_BATCHER = Batch.Batcher(TEST_DATASET_PATH, SENTENCE_LENGTH, TOKENANO, MASKE
 
 
 # Model
-
 NANOSOCRATES = Transformer.TrainingModel(
     TOKEN_SPACE_SIZE,
     EMBEDDED_SIZE,
@@ -103,12 +109,25 @@ decoder_ce = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
 nano_optim = torch.optim.AdamW(NANOSOCRATES.parameters(), LEARNING_RATE)
 encoder_only_optim = torch.optim.AdamW(ENCODER_ONLY.parameters(), LEARNING_RATE)
 decoder_only_optim = torch.optim.AdamW(DECODER_ONLY.parameters(), LEARNING_RATE)
-nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE)
+
+if NANO_OPTIM_PATH.is_file():
+    optim_dict = torch.load(NANO_OPTIM_PATH)
+    nano_optim.load_state_dict(optim_dict)
+
+if ENC_OPTIM_PATH.is_file():
+    optim_dict = torch.load(ENC_OPTIM_PATH)
+    encoder_only_optim.load_state_dict(optim_dict)
+
+if DEC_OPTIM_PATH.is_file():
+    optim_dict = torch.load(DEC_OPTIM_PATH)
+    decoder_only_optim.load_state_dict(optim_dict)
+
+nano_scheduler = Transformer.WarmupLR(nano_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH)
 encoder_only_scheduler = Transformer.WarmupLR(
-    encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
+    encoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
 )
 decoder_only_scheduler = Transformer.WarmupLR(
-    decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE
+    decoder_only_optim, WARMUP_EPOCHS, EMBEDDED_SIZE, last_epoch=CURRENT_EPOCH
 )
 
 current_epoch = CURRENT_EPOCH
@@ -209,7 +228,7 @@ while current_epoch < MAX_EPOCHS:
 
             decoder_only_optim.zero_grad()
 
-            pred_logits = DECODER_ONLY((dec_x, dec_x_pad))
+            pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
             pred_logits = pred_logits.permute(0, 2, 1)
 
             loss: torch.Tensor = decoder_ce(pred_logits, tgt)
@@ -297,7 +316,7 @@ while current_epoch < MAX_EPOCHS:
 
 
 
-            pred_logits = DECODER_ONLY((dec_x, dec_x_pad))
+            pred_logits = DECODER_ONLY((dec_x, enc_x_pad, dec_x_pad))
 
             pred_logits = pred_logits.permute(0, 2, 1)
 
@@ -380,6 +399,13 @@ while current_epoch < MAX_EPOCHS:
     if current_epoch % CHECKPOINT_STEPS == 0 or patience == PATIENCE:
         print(f"Saving model at {CHECKPOINT_PATH.as_posix()}")
         torch.save(NANOSOCRATES.state_dict(), CHECKPOINT_PATH)
+        torch.save(nano_optim.state_dict(), NANO_OPTIM_PATH)
+        torch.save(encoder_only_optim.state_dict(), ENC_OPTIM_PATH)
+        torch.save(decoder_only_optim.state_dict(), DEC_OPTIM_PATH)
+        FILE = open(LAST_EPOCH_PATH, "w", encoding="utf-8")
+        FILE.write(f"{current_epoch}")
+        FILE.close()
+
     if patience == PATIENCE:
         exit(0)
 
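The training-script hunks above make runs resumable: each optimizer's state_dict is saved next to the model checkpoint, reloaded when the file exists, and the warmup schedulers are rebuilt with last_epoch so the learning-rate curve continues where the previous run stopped (PyTorch schedulers treat -1 as a fresh start, which is why CURRENT_EPOCH defaults to -1). A minimal, self-contained sketch of the same pattern in plain PyTorch; LambdaLR stands in for the project's WarmupLR and the file names are illustrative:

# Sketch of the checkpoint/resume pattern above. Assumptions: LambdaLR stands in
# for the project's WarmupLR and the file names are illustrative.
from pathlib import Path

import torch

model = torch.nn.Linear(8, 8)
optim = torch.optim.AdamW(model.parameters(), lr=1.0)

OPTIM_PATH = Path("optim.pt")
LAST_EPOCH_PATH = Path("last_epoch.txt")

# -1 means "fresh start" for PyTorch LR schedulers.
last_epoch = int(LAST_EPOCH_PATH.read_text()) if LAST_EPOCH_PATH.is_file() else -1
if OPTIM_PATH.is_file():
    optim.load_state_dict(torch.load(OPTIM_PATH))

# Rebuilding the scheduler with last_epoch keeps the warmup curve continuous.
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optim, lambda epoch: min(1.0, (epoch + 1) / 10), last_epoch=last_epoch
)

for epoch in range(last_epoch + 1, last_epoch + 3):
    # ... one training epoch would run here ...
    optim.step()
    scheduler.step()
    torch.save(optim.state_dict(), OPTIM_PATH)
    LAST_EPOCH_PATH.write_text(str(epoch))

With the stock torch.optim schedulers this works because the saved optimizer state_dict restores param_groups (including the initial_lr recorded by the scheduler), which is what allows reconstructing the scheduler at an arbitrary last_epoch; the project's WarmupLR may handle this differently.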
diff --git a/Project_Model/Libs/Transformer/Models/NanoSocraDecoder.py b/Project_Model/Libs/Transformer/Models/NanoSocraDecoder.py
index 3abceea..37a7264 100644
--- a/Project_Model/Libs/Transformer/Models/NanoSocraDecoder.py
+++ b/Project_Model/Libs/Transformer/Models/NanoSocraDecoder.py
@@ -9,6 +9,7 @@ class NanoSocraDecoder(torch.nn.Module):
         decoder_embedder: Embedder.NanoSocratesEmbedder,
         decoder_layers: torch.nn.Sequential,
         detokener: DeToken
+
     ) -> None:
 
         super().__init__()
@@ -17,14 +18,14 @@ class NanoSocraDecoder(torch.nn.Module):
         self.__decoder = decoder_layers
         self.__detokener = detokener
 
-    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
+    def forward(self, args: tuple[torch.Tensor,torch.Tensor, torch.Tensor]):
 
-        decoder_embedder_input, tgt_padding = args
+        decoder_embedder_input, prefix_mask, tgt_padding = args
 
         decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
 
         decoder_output, _, _, _, _, _ = self.__decoder(
-            (decoder_tensor, decoder_tensor, decoder_tensor, tgt_padding, tgt_padding, True)
+            (decoder_tensor, decoder_tensor, decoder_tensor, prefix_mask, tgt_padding, True)
         )
 
         logits: torch.Tensor = self.__detokener(decoder_output)
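NanoSocraDecoder.forward now accepts a dedicated prefix mask and passes it to the decoder stack in place of the second copy of the target padding mask. As a generic illustration of the idea (plain torch; not necessarily the project's mask convention), a prefix-LM style mask lets every position attend to the prompt while keeping the continuation causal:

# Generic illustration of padding vs. prefix masks. Assumption: True marks
# positions that attention must NOT look at, mirroring the eq(pad) masks above.
import torch

PAD = 0
tokens = torch.tensor([[5, 7, 9, PAD, PAD]])                    # (batch, seq)
pad_mask = tokens.eq(PAD)                                       # block attention to padding keys

seq_len = tokens.size(1)
prefix_len = 2                                                  # first two tokens form the prompt
causal = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()
prefix_cols = torch.arange(seq_len) < prefix_len
prefix_mask = causal & ~prefix_cols                             # future positions blocked only outside the prompt

# Per-batch combination: a key is blocked if it is padding or not yet visible.
full_mask = prefix_mask.unsqueeze(0) | pad_mask.unsqueeze(1)    # (batch, seq, seq)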
diff --git a/Project_Model/Libs/Transformer/Models/NanoSocrates.py b/Project_Model/Libs/Transformer/Models/NanoSocrates.py
new file mode 100644
index 0000000..39b8716
--- /dev/null
+++ b/Project_Model/Libs/Transformer/Models/NanoSocrates.py
@@ -0,0 +1,176 @@
+import torch
+import Project_Model.Libs.Embedder as Embedder
+from ..Classes import Encoder, Decoder, DeToken
+from ..Utils import get_decoder_input
+from Project_Model.Libs.Batch import TaskType
+
+
+class NanoSocratesCore(torch.nn.Module):
+
+    def __init__(
+        self,
+        vocabulary_size: int,
+        sentence_max_length: int,
+        sos: int,
+        pad: int,
+        eos: int,
+        latent_space: int = 256,
+        feed_forward_multiplier: int = 4,
+        attention_heads: int = 4,
+        layer_number: int = 2,
+    ) -> None:
+        super().__init__()
+
+        self.__sos = sos
+        self.__pad = pad
+        self.__eos = eos
+        self.__sentence_len = sentence_max_length
+
+        feed_forward_latent_space = latent_space * feed_forward_multiplier
+
+        self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
+            vocabulary_size, latent_space
+        )
+        self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
+            vocabulary_size, latent_space
+        )
+
+        TMP_ENCODERS = [
+            Encoder(latent_space, feed_forward_latent_space, attention_heads)
+        ] * layer_number
+
+        TMP_DECODERS = [
+            Decoder(latent_space, feed_forward_latent_space, attention_heads)
+        ] * layer_number
+
+        self.__encoder = torch.nn.Sequential(*TMP_ENCODERS)
+        self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
+
+        self.__detokener = DeToken(latent_space, vocabulary_size)
+        self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
+
+    def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
+
+        encoder_embedder_input, src_padding, decoder_embedder_input, tgt_padding = args
+
+        encoder_tensor = self.__encoder_embedder(encoder_embedder_input)
+        decoder_tensor = self.__decoder_embedder(decoder_embedder_input)
+
+        encoder_output, _ = self.__encoder((encoder_tensor, src_padding))
+
+        decoder_output, _, _, _, _, _ = self.__decoder(
+            (decoder_tensor, encoder_output, encoder_output, src_padding, tgt_padding, False)
+        )
+
+        logits: torch.Tensor = self.__detokener(decoder_output)
+
+        return logits
+
+    def inference(self, input: tuple[torch.Tensor, torch.Tensor], task_type: TaskType) -> torch.Tensor:
+
+        if task_type == TaskType.MASKING:
+            return self.__masking(input)
+
+        if task_type == TaskType.COMPLETATION:
+            return self.__continue_rdf(input)
+
+        return self.__text_generation(input)
+
+
+
+
+    def __text_generation(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+
+        x, padding = args
+
+        encoder_tensor = self.__encoder_embedder(x)
+        BATCH, SEQ_LEN, _ = x.shape
+
+        encoder_output, _ = self.__encoder((encoder_tensor, padding))
+
+        decoder_in = get_decoder_input(BATCH, self.__sos, self.__pad, SEQ_LEN)
+        decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        continue_generating = True
+        token_idx = 0
+
+        while continue_generating:
+
+            decoder_in = self.__decoder_embedder(decoder_in)
+
+            decoder_output, _, _, _, _, _ = self.__decoder(
+                (decoder_in, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
+            )
+
+            logits: torch.Tensor = self.__detokener(decoder_output)
+
+            logits = torch.softmax(logits, 2)
+
+            tokens = torch.argmax(logits)
+
+            if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
+                continue_generating = False
+                continue
+
+            if token_idx < self.__sentence_len - 1:
+                decoder_in[:,token_idx + 1] = tokens[:,token_idx]
+                decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        return decoder_in
+
+
+    def __masking(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+
+        x, padding = args
+
+        encoder_tensor = self.__encoder_embedder(x)
+        x, _ = self.__encoder((encoder_tensor, padding))
+
+        logits: torch.Tensor = self.__encoder_detokener(x)
+        del x
+
+        logits = torch.softmax(logits, 2)
+
+        tokens = torch.argmax(logits)
+
+        return tokens
+
+
+    def __continue_rdf(self, args: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
+
+        decoder_in, _ = args
+        decoder_in_prefix_mask = decoder_in.eq(self.__pad)
+        decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        continue_generating = True
+        token_idx = 0
+
+        while continue_generating:
+
+            decoder_in = self.__decoder_embedder(decoder_in)
+
+            decoder_output, _, _, _, _, _ = self.__decoder(
+                (decoder_in, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, False)
+            )
+
+            logits: torch.Tensor = self.__detokener(decoder_output)
+
+            logits = torch.softmax(logits, 2)
+
+            tokens = torch.argmax(logits)
+
+            if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
+                continue_generating = False
+                continue
+
+            if token_idx < self.__sentence_len - 1:
+                decoder_in[:,token_idx + 1] = tokens[:,token_idx]
+                decoder_in_pad_mask = decoder_in.eq(self.__pad)
+
+        return decoder_in
+
+    def take_pieces(self):
+
+        return (
+            (self.__encoder_embedder, self.__encoder),
+            (self.__decoder_embedder, self.__decoder, self.__detokener)
+        )
\ No newline at end of file
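The inference helpers in the new NanoSocratesCore implement greedy decoding, but as committed the loops re-embed decoder_in on every iteration, never advance token_idx, and call torch.argmax without a dim argument (which reduces over the whole tensor). Below is a corrected, self-contained sketch of the intended greedy loop; decode_step is a placeholder for the embedder, decoder stack and detokener, not a real project API:

# Corrected greedy-decoding sketch (illustrative; `decode_step` is a placeholder
# for the project's embedder + decoder stack + detokener, not a real API).
import torch


def greedy_decode(decode_step, sos: int, eos: int, pad: int, max_len: int,
                  batch: int = 1) -> torch.Tensor:
    # Start from <sos> followed by padding and fill one position per step.
    tokens = torch.full((batch, max_len), pad, dtype=torch.long)
    tokens[:, 0] = sos
    for idx in range(max_len - 1):
        logits = decode_step(tokens)                    # (batch, seq, vocab)
        next_token = logits[:, idx].argmax(dim=-1)      # argmax over the vocab dim
        tokens[:, idx + 1] = next_token
        if batch == 1 and next_token.item() == eos:
            break
    return tokens

Seeding tokens with a given RDF prefix instead of a lone <sos> would give the __continue_rdf behaviour; softmax is omitted because argmax over the raw logits selects the same token.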
diff --git a/Project_Model/Libs/Transformer/Models/TrainingModel.py b/Project_Model/Libs/Transformer/Models/TrainingModel.py
index 465f123..5f563a8 100644
--- a/Project_Model/Libs/Transformer/Models/TrainingModel.py
+++ b/Project_Model/Libs/Transformer/Models/TrainingModel.py
@@ -36,6 +36,7 @@ class TrainingModel(torch.nn.Module):
         self.__decoder = torch.nn.Sequential(*TMP_DECODERS)
 
         self.__detokener = DeToken(latent_space, vocabulary_size)
+        self.__encoder_detokener = DeToken(latent_space, vocabulary_size)
 
     def forward(self, args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]):
 
@@ -57,6 +58,6 @@ class TrainingModel(torch.nn.Module):
     def take_pieces(self):
 
         return (
-            (self.__encoder_embedder, self.__encoder),
+            (self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
             (self.__decoder_embedder, self.__decoder, self.__detokener)
         )
\ No newline at end of file
diff --git a/Project_Model/Libs/TransformerUtils/model_utils.py b/Project_Model/Libs/TransformerUtils/model_utils.py
index 7a9c841..098f9ff 100644
--- a/Project_Model/Libs/TransformerUtils/model_utils.py
+++ b/Project_Model/Libs/TransformerUtils/model_utils.py
@@ -10,8 +10,7 @@ def decompose_nano_socrates(
 ) -> tuple[TrainingModel, NanoSocratEncoder, NanoSocraDecoder]:
 
     encoder_pieces, decoder_pieces = model.take_pieces()
-    encoder_embedder, encoder = encoder_pieces
-    encoder_detokener = DeToken(embedding_size, vocabulary_size)
+    encoder_embedder, encoder, encoder_detokener = encoder_pieces
     decoder_embedder, decoder, decoder_detokener = decoder_pieces
 
     return (
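The TrainingModel and model_utils changes make take_pieces() hand back the encoder detokener that was actually trained, instead of decompose_nano_socrates constructing a fresh DeToken. A small generic illustration of why reusing the module matters (plain torch, not the project's classes): passing the existing module shares its trained parameters, while constructing a new one starts from random weights:

# Illustration of shared vs. freshly-constructed heads (generic torch).
import torch

trained_head = torch.nn.Linear(16, 100)       # stands in for the trained DeToken head
fresh_head = torch.nn.Linear(16, 100)         # what re-constructing the head gives you
shared_head = trained_head                    # what the updated take_pieces() returns

x = torch.randn(2, 16)
print(torch.allclose(trained_head(x), shared_head(x)))   # True: same parameters
print(torch.allclose(trained_head(x), fresh_head(x)))    # False: random, untrained weights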