This commit is contained in:
GassiGiuseppe
2025-10-17 16:38:12 +02:00
19 changed files with 546 additions and 44 deletions

View File

@@ -28,6 +28,7 @@ class Batcher:
tokenizer: BPE.TokeNanoCore,
masker: SpannedMasker,
seed: int = 0,
debug = False
) -> None:
# ABSTRACT, TRIPLE
# tasks:
@@ -44,6 +45,7 @@ class Batcher:
self._seed = seed
# self._token_completation = TokenCompletationTransformer(sotl,eos)
self._completation_task_token_truncator = truncate_rdf_list
self.__debug = debug
def batch(self, batch_size) -> Generator[
tuple[
@@ -142,6 +144,7 @@ class Batcher:
return out_X, out_Y, padding_X, padding_Y
def __rdf2txt_transformation(self, batch: pd.DataFrame):
X: list[list[int]]
task_token = self._tokenizer.encode(SpecialToken.RDF_TO_TEXT.value)
out = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})[["X", "Y"]]
out["X"] = [task_token + x for x in out["X"]]
@@ -157,7 +160,7 @@ class Batcher:
X = []
Y = []
for rdf in batch["RDFs"]:
x, y = self._masker.mask_sequence(rdf)
x, y = self._masker.mask_sequence(rdf[:self.__max_length])
X.append(x)
Y.append(y)
return self.__normalization(X, Y)
@@ -181,7 +184,7 @@ class Batcher:
def __token_cmpletation_task_special_normalization(self, X: list[list[int]], Y: list[list[int]]
) -> tuple[list[list[int]], list[list[int]], list[list[int]], list[list[int]]]:
def continue_rdf_padding(sequence: list[int], pad_token: int):
for i, x in enumerate(sequence):
if x == pad_token:

View File

@@ -83,7 +83,14 @@ class NanoSocratesCore(torch.nn.Module):
x, padding = args
encoder_tensor = self.__encoder_embedder(x)
BATCH, SEQ_LEN, _ = x.shape
BATCH: int
if len(x.shape) > 2:
BATCH, SEQ_LEN, _ = x.shape
else:
_, SEQ_LEN = x.shape
BATCH = 1
encoder_output, _ = self.__encoder((encoder_tensor, padding))
@@ -95,25 +102,32 @@ class NanoSocratesCore(torch.nn.Module):
while continue_generating:
decoder_in = self.__decoder_embedder(decoder_in)
decoder_in_x = self.__decoder_embedder(decoder_in)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_in, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
(decoder_in_x, encoder_output, encoder_output, padding, decoder_in_pad_mask, False)
)
logits: torch.Tensor = self.__detokener(decoder_output)
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits)
tokens = torch.argmax(logits, 2)
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
if token_idx == self.__sentence_len - 1:
continue_generating = False
continue
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
continue_generating = False
continue
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
token_idx += 1
return decoder_in
@@ -130,7 +144,7 @@ class NanoSocratesCore(torch.nn.Module):
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits)
tokens = torch.argmax(logits, 2)
return tokens
@@ -146,31 +160,56 @@ class NanoSocratesCore(torch.nn.Module):
while continue_generating:
decoder_in = self.__decoder_embedder(decoder_in)
decoder_x = self.__decoder_embedder(decoder_in)
decoder_output, _, _, _, _, _ = self.__decoder(
(decoder_in, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, False)
(decoder_x, decoder_in, decoder_in, decoder_in_prefix_mask, decoder_in_pad_mask, True)
)
logits: torch.Tensor = self.__detokener(decoder_output)
logits = torch.softmax(logits, 2)
tokens = torch.argmax(logits)
tokens = torch.argmax(logits, 2)
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
if token_idx == self.__sentence_len - 1:
continue_generating = False
continue
if tokens.shape[0] == 1 and tokens[0,token_idx] == self.__eos:
continue_generating = False
continue
if token_idx < self.__sentence_len - 1:
decoder_in[:,token_idx + 1] = tokens[:,token_idx]
decoder_in_pad_mask = decoder_in.eq(self.__pad)
token_idx += 1
return decoder_in
def take_pieces(self):
return (
(self.__encoder_embedder, self.__encoder),
(self.__encoder_embedder, self.__encoder, self.__encoder_detokener),
(self.__decoder_embedder, self.__decoder, self.__detokener)
)
)
def load_pieces(
self,
encoder_embedder: Embedder.NanoSocratesEmbedder,
decoder_embedder: Embedder.NanoSocratesEmbedder,
encoder: torch.nn.Sequential,
decoder: torch.nn.Sequential,
encoder_detokener: DeToken,
decoder_detokener: DeToken
):
self.__encoder_embedder = encoder_embedder
self.__decoder_embedder = decoder_embedder
self.__encoder = encoder
self.__decoder = decoder
self.__encoder_detokener = encoder_detokener
self.__detokener = decoder_detokener

View File

@@ -1,9 +1,11 @@
from .TrainingModel import TrainingModel
from .NanoSocratEncoder import NanoSocratEncoder
from .NanoSocraDecoder import NanoSocraDecoder
from .NanoSocrates import NanoSocratesCore
__all__ = [
"TrainingModel",
"NanoSocratEncoder",
"NanoSocraDecoder"
"NanoSocraDecoder",
"NanoSocratesCore"
]

View File

@@ -1,6 +1,7 @@
from enum import Enum, auto
class TaskType(Enum):
TEXT2RDF = auto()
RDF2TEXT = auto()
MASK = auto()
COMPLETATION = auto()

View File

@@ -1,8 +1,14 @@
from .model_utils import decompose_nano_socrates, create_standalone_model
from .model_utils import decompose_nano_socrates, create_standalone_model, train2inference
from .ModelType import ModelType
from .decode_batch import decode_batch
from .metrics import precision, recall, accuracy, f1, meteor, bleu, rouge, average, rdf2txt, txt2rdf, rdf_completion_1, rdf_completion_2, remove_padding, balance_paddings
__all__ = [
"ModelType",
"decompose_nano_socrates",
"create_standalone_model"
"create_standalone_model",
"decode_batch",
"train2inference",
"precision", "recall", "accuracy", "f1", "meteor", "bleu", "rouge", "average",
"rdf2txt", "txt2rdf", "rdf_completion_1", "rdf_completion_2", "remove_padding", "balance_paddings"
]

View File

@@ -0,0 +1,16 @@
import torch
import Project_Model.Libs.BPE as BPE
def decode_batch(batch: torch.Tensor, tokenizer: BPE.TokeNanoCore ,uknonw_token: int) -> list[str]:
strings = []
BATCH, _ = batch.shape
for i in range(0, BATCH):
tokens: list[int] = batch.tolist()[i]
tokens = list(map(lambda x: uknonw_token if x > tokenizer.vocabulary_size else x, tokens))
strings.append(tokenizer.decode(tokens))
return strings

View File

@@ -0,0 +1,100 @@
import evaluate as eval
BLEU = eval.load("bleu")
ROUGE = eval.load("rouge")
METEOR = eval.load("meteor")
def precision(ref: list[int], pred: list[int]):
metric = eval.load("precision")
return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)
def recall(ref: list[int], pred: list[int]):
metric = eval.load("recall")
return metric.compute(predictions=pred, references=ref, average="weighted", zero_division=0)
def accuracy(ref: list[int], pred: list[int]):
metric = eval.load("accuracy")
return metric.compute(predictions=pred, references=ref)
def meteor(ref: list[str], pred: list[str]):
metric = METEOR
return metric.compute(predictions=pred, references=ref)
def bleu(ref: list[str], pred: list[str]):
metric = BLEU
return metric.compute(predictions=pred, references=ref)
def rouge(ref: list[str], pred: list[str]):
metric = ROUGE
return metric.compute(predictions=pred, references=ref)
def f1(precision: float, recall: float):
divisor = max((precision + recall), 1E-5)
return (2 * recall * precision) / divisor
def average(array: list[float]):
return sum(array) / len(array)
def rdf2txt(ref: list[str], pred: list[str]):
b_m = bleu(ref, pred)
r_m = rouge(ref, pred)
m_m = meteor(ref, pred)
return (b_m, r_m, m_m)
def txt2rdf(ref: list[int], pred: list[int]):
p_m = precision(ref, pred)
r_m = recall(ref, pred)
return (p_m, r_m)
def rdf_completion_1(ref: list[int], pred: list[int]):
a_m = accuracy(ref, pred)
return a_m
def rdf_completion_2(ref: list[int], pred: list[int]):
p_m = precision(ref, pred)
r_m = recall(ref, pred)
return (p_m, r_m)
def remove_padding(seq: list[int], pad_token: int, end_token: int):
clean_seq = list(filter(lambda x: x != pad_token, seq))
if clean_seq[-1] == end_token:
return clean_seq
clean_seq.append(
end_token
)
return clean_seq
def balance_paddings(seq_1: list[int], seq_2: list[int], pad_token: int):
SEQ_1_LEN = len(seq_1)
SEQ_2_LEN = len(seq_2)
if SEQ_1_LEN > SEQ_2_LEN:
PAD = [pad_token] * (SEQ_1_LEN - SEQ_2_LEN)
seq_2.extend(PAD)
if SEQ_2_LEN > SEQ_1_LEN:
seq_2 = seq_2[:SEQ_1_LEN]
return (seq_1, seq_2)

View File

@@ -1,13 +1,13 @@
import torch
from Project_Model.Libs.Embedder import NanoSocratesEmbedder
from Project_Model.Libs.Transformer import TrainingModel, NanoSocraDecoder, NanoSocratEncoder, DeToken, Encoder, Decoder
from Project_Model.Libs.Transformer import TrainingModel,NanoSocratesCore, NanoSocraDecoder, NanoSocratEncoder, DeToken, Encoder, Decoder
from .ModelType import ModelType
def decompose_nano_socrates(
model: TrainingModel, vocabulary_size: int, embedding_size: int
) -> tuple[TrainingModel, NanoSocratEncoder, NanoSocraDecoder]:
model: TrainingModel | NanoSocratesCore , vocabulary_size: int, embedding_size: int
) -> tuple[TrainingModel | NanoSocratesCore, NanoSocratEncoder, NanoSocraDecoder]:
encoder_pieces, decoder_pieces = model.take_pieces()
encoder_embedder, encoder, encoder_detokener = encoder_pieces
@@ -19,6 +19,26 @@ def decompose_nano_socrates(
NanoSocraDecoder(decoder_embedder, decoder, decoder_detokener),
)
def train2inference(
train_model: TrainingModel,
inference_model: NanoSocratesCore
) -> NanoSocratesCore:
encoder_pieces, decoder_pieces = train_model.take_pieces()
enc_emb, encoder, enc_det = encoder_pieces
dec_emb, decoder, dec_det = decoder_pieces
inference_model.load_pieces(
enc_emb,
dec_emb,
encoder,
decoder,
enc_det,
dec_det
)
return inference_model
def create_standalone_model(
model_type: ModelType,