This commit is contained in:
GassiGiuseppe
2025-10-07 23:16:20 +02:00
13 changed files with 348 additions and 6 deletions

View File

@@ -21,4 +21,7 @@ class SpecialToken(Enum):
# NanoSocrates
START = "<START>"
CORPUS_END = "<END>"
PAD = "<PAD>"
START_OF_SEQUENCE = "<SOS>"
END_OF_SEQUENCE = "<EOS>"
PAD = "<PAD>"

View File

@@ -45,9 +45,8 @@ def normalize_sequence(
pad_token: int,
end_token: int,
) -> tuple[list[int], list[bool]]:
new_sequence = pad_sequence(sequence, max_length, pad_token)
new_sequence = truncate_sequence(new_sequence, max_length, end_token)
new_sequence = truncate_sequence(sequence, max_length, end_token)
new_sequence = pad_sequence(new_sequence, max_length, pad_token)
PADDING_MASK = create_padding_mask(new_sequence, pad_token)
return (new_sequence, PADDING_MASK)