doctor and model test
@@ -14,6 +14,6 @@ class DeToken(torch.nn.Module):
         x = self.__linear(x)

         # 2) Go to logits
-        x = torch.softmax(x, 2)
+        # x = torch.softmax(x, 2)

         return x
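
Note on the DeToken change: with the softmax commented out, the module now returns raw logits, which is what losses such as torch.nn.CrossEntropyLoss expect, since they apply log-softmax internally. A minimal sketch with made-up shapes of how those logits would be consumed for training:

import torch

B, S, V = 2, 4, 10                       # assumed batch, sequence, vocab sizes
logits = torch.randn(B, S, V)            # what DeToken now returns (no softmax)
targets = torch.randint(0, V, (B, S))    # gold token ids

# CrossEntropyLoss wants the class dimension second: [B, V, S] against targets [B, S]
loss = torch.nn.CrossEntropyLoss()(logits.transpose(1, 2), targets)
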
@@ -41,11 +41,12 @@ class Decoder(nn.Module):
             torch.Tensor,
             torch.Tensor,
             torch.Tensor,
+            torch.Tensor,
             torch.Tensor
         ]
     ): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
         # WARNING: args is needed to have sequential
-        x, k_x, v_x, padding_mask = args
+        x, k_x, v_x, padding_mask, encoder_padding_mask = args

         # build of attention mask
         attention_mask = get_causal_attention_mask(x.size(1))
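
The WARNING comment refers to torch.nn.Sequential forwarding only one positional argument between modules, so each decoder layer takes and returns a single tuple that now also carries encoder_padding_mask. A sketch of the general pattern (TupleLayer is a stand-in, not the project's Decoder):

import torch
from torch import nn

class TupleLayer(nn.Module):
    def __init__(self, dim: int) -> None:
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, args):
        # unpack, transform x, repack so the next layer in the Sequential gets the same tuple
        x, k_x, v_x, padding_mask, encoder_padding_mask = args
        return (self.proj(x), k_x, v_x, padding_mask, encoder_padding_mask)

stack = nn.Sequential(*[TupleLayer(8) for _ in range(3)])
x, mem = torch.randn(2, 5, 8), torch.randn(2, 7, 8)
pad = torch.zeros(2, 5, dtype=torch.bool)
enc_pad = torch.zeros(2, 7, dtype=torch.bool)
out, *_ = stack((x, mem, mem, pad, enc_pad))
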
@@ -68,7 +69,7 @@ class Decoder(nn.Module):

         # 5) Encoder–decoder (cross) attention
         CROSS_ATTENTION = self.__cross_attention(
-            x, k_x, v_x, key_padding_mask=padding_mask
+            x, k_x, v_x, key_padding_mask=encoder_padding_mask
         )

         # 6) Dropout
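
If __cross_attention wraps torch.nn.MultiheadAttention (an assumption, it is not shown in this diff), key_padding_mask must describe the key/value sequence, i.e. the encoder output, with True marking positions to ignore, which is why the encoder's mask is passed here instead of the decoder's own padding_mask. A hedged sketch, assuming batch_first=True and arbitrary sizes:

import torch
from torch import nn

B, T, S, E = 2, 3, 5, 8                              # assumed sizes
mha = nn.MultiheadAttention(E, num_heads=2, batch_first=True)
q = torch.randn(B, T, E)                             # decoder states (queries)
kv = torch.randn(B, S, E)                            # encoder output (keys/values)

encoder_padding_mask = torch.zeros(B, S, dtype=torch.bool)
encoder_padding_mask[:, -1] = True                   # pretend the last encoder slot is PAD

out, _ = mha(q, kv, kv, key_padding_mask=encoder_padding_mask)
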
@@ -96,7 +97,7 @@ class Decoder(nn.Module):
         # 12) Layer Normalization
         x = self.__layer_norm_3(x)

-        return (x, k_x, v_x, padding_mask)
+        return (x, k_x, v_x, padding_mask, encoder_padding_mask)


     # use eval to disable dropout etc.
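
On the "use eval to disable dropout etc." note: calling model.eval() switches dropout (and normalization statistics, if any) to inference behaviour, and wrapping the call in torch.no_grad() skips autograd bookkeeping. A minimal, generic sketch, not tied to this repository's classes:

import torch

model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Dropout(0.1))  # toy model
model.eval()                       # dropout becomes a no-op
with torch.no_grad():              # no gradient graph during inference
    y = model(torch.randn(2, 8))
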
Project_Model/Libs/Transformer/Classes/NanoSocrates.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+import torch
+from NanoSocratesCore import NanoSocratesCore
+
+
+class NanoSocrates(torch.nn.Module):
+
+    def __init__(self,
+                 embedded_size: int,
+                 feed_forward_dim: int,
+                 encoder_layers: int,
+                 decoder_layers: int,
+                 attention_heads: int,
+                 vocab_size: int) -> None:
+
+        super().__init__()
+
+        self._model = NanoSocratesCore(
+            embedded_size,
+            feed_forward_dim,
+            encoder_layers,
+            decoder_layers,
+            attention_heads,
+            vocab_size)
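
For context, the new wrapper would be constructed roughly like this; the hyperparameter values are placeholders and the import path is an assumption that depends on how Project_Model/Libs is exposed:

from NanoSocrates import NanoSocrates  # assumed import path

model = NanoSocrates(
    embedded_size=128,
    feed_forward_dim=512,
    encoder_layers=2,
    decoder_layers=2,
    attention_heads=4,
    vocab_size=1000,
)
model.eval()  # inference-only use; the core forward below is wrapped in @torch.no_grad()
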
@@ -16,8 +16,11 @@ class NanoSocratesCore(torch.nn.Module):
         num_encoder_layers: int = 2,
         num_decoder_layers: int = 2,
         num_attention_heads: int = 4,
+        pad_token: int = 0,
     ) -> None:

         super().__init__()
+        self.__pad_token = pad_token
+        feed_forward_dim = embedding_size * feed_forward_multiplier

         self.__sentence_length = sentence_length
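
The new pad_token default ties into the padding masks the forward pass expects, where True marks PAD positions. A small sketch of how an encoder_padding_mask in the expected list[list[bool]] form could be built from token ids (the ids themselves are illustrative):

PAD_TOKEN = 0  # matches the new default

encoder_input = [
    [5, 9, 2, PAD_TOKEN, PAD_TOKEN],
    [7, 3, PAD_TOKEN, PAD_TOKEN, PAD_TOKEN],
]
encoder_padding_mask = [[tok == PAD_TOKEN for tok in row] for row in encoder_input]
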
@@ -43,69 +46,64 @@ class NanoSocratesCore(torch.nn.Module):
         self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
         self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)

+    @torch.no_grad()  # inference only
     def forward(
         self,
         encoder_input: list[list[int]],
-        decoder_input: list[list[int]],
-        encoder_padding_mask: list[list[int]],
+        decoder_input: list[list[int]],  # must start with <SOS> and PAD elsewhere
+        encoder_padding_mask: list[list[bool]],  # True where encoder is PAD
     ):

         if len(encoder_padding_mask) != len(encoder_input):
             raise Exception("Mismatch in received_dimensions")

-        # TODO: check for tensor in input to embedder
         # 1) Embed User-Input for encoders
-        ENCODER_INPUT = self.__input_embeder(encoder_input)
+        ENCODER_INPUT = self.__input_embeder(encoder_input)  # [B,S,E]

         # 2) Encode User-Input
-        ENCODER_OUTPUT, _ = self.__encoder_sequence(ENCODER_INPUT, encoder_padding_mask)
+        ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence(
+            (ENCODER_INPUT, encoder_padding_mask)  # as tuple
+        )  # [B,S,E], [B,S]
         del ENCODER_INPUT

-        exit_loop = False
-        decoder_token_list = decoder_input[:]
+        # 3) Autoregressive Output (greedy)
+        LOGITS_HISTORY: list[torch.Tensor] = []  # keep per-step distributions
+        decoder_token_list = [row[:] for row in decoder_input]  # copy tokens
         decoder_phase = 0
+        exit_loop = False

-        LOGITS_HISTORY: list[torch.Tensor] = []
-
-        # 3) Autoregressive Output
         while not exit_loop:
+            decoder_phase += 1  # move to next position

-            # 3.0) Increment Counter
-            decoder_phase += 1
+            # 3.1) Build decoder key padding mask from current tokens (True where PAD)
+            DECODER_KEY_PADDING_MASK: list[list[bool]] = [
+                [tok == self.__pad_token for tok in row] for row in decoder_token_list
+            ]  # [B,T]

-            # 3.1) Embed Decoder Input
-            decoder_input = self.__output_embedder(decoder_token_list)
+            # 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside)
+            DECODER_INPUT = self.__output_embedder(decoder_token_list)  # [B,T,E]

-            # 3.2) Decode Decoder Input
+            # 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks)
             DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
-                decoder_input, ENCODER_OUTPUT, ENCODER_OUTPUT
-            )
+                (DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT,
+                 DECODER_KEY_PADDING_MASK, encoder_padding_mask)
+            )  # [B,T,E]
+            del DECODER_INPUT

-            # 3.3) Go back to Token space
-            # TODO: change name
-            LOGITS = self.__linear(DECODER_OUTPUT)
+            # 3.4) Project to token space
+            LOGITS = self.__linear(DECODER_OUTPUT)  # [B,T,V]
             del DECODER_OUTPUT

-            # 3.4) Transform in probabilities
-            # TODO: change name
-            TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)
-            del LOGITS
+            # 3.5) Probabilities and greedy pick at current step
+            TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1)  # [B,T,V]
+            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)  # store for this step

-            LOGITS_HISTORY.append(TOKEN_PROBABILITIES)
+            step_idx = decoder_phase - 1  # 0-based
+            TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist()  # [B] -> list[int]

-            # 3.5) Take most probable tokens
-            TOKEN_IDS = torch.argmax(TOKEN_PROBABILITIES, -1)
+            # 3.6) Write prediction into next slot (the slot is PAD)
+            if step_idx + 1 < self.__sentence_length:
+                for b, tok in enumerate(TOKEN_IDS):
+                    decoder_token_list[b][step_idx + 1] = tok  # feed next position

-            # TODO: check for dimensions and for efficiency
-            DECODER_TOKEN_TENSOR = torch.tensor(decoder_token_list)
-            DECODER_TOKEN_TENSOR[:, decoder_phase] = TOKEN_IDS
-            decoder_token_list = DECODER_TOKEN_TENSOR.tolist()
-
-            del TOKEN_IDS
-            del DECODER_TOKEN_TENSOR
-
-            # 3.6) Check if we generated all tokens
+            # 3.7) Stop when we filled the sequence
             if decoder_phase == self.__sentence_length - 1:
                 exit_loop = True

-        return LOGITS_HISTORY
+        return LOGITS_HISTORY  # list of [B,T,V] (per step)
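
To make the greedy loop above easier to follow, here is the per-step pick in isolation, with toy tensors; shapes follow the [B,T,V] comments in the diff and the token ids are illustrative:

import torch

B, T, V = 2, 6, 10
probs = torch.softmax(torch.randn(B, T, V), dim=-1)        # stand-in for TOKEN_PROBABILITIES

SOS, PAD = 1, 0                                            # illustrative token ids
decoder_token_list = [[SOS] + [PAD] * (T - 1) for _ in range(B)]

step_idx = 0                                               # first step reads the <SOS> position
token_ids = probs[:, step_idx, :].argmax(dim=-1).tolist()  # greedy pick, one id per batch row
if step_idx + 1 < T:
    for b, tok in enumerate(token_ids):
        decoder_token_list[b][step_idx + 1] = tok          # write into the next PAD slot
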