Compare commits
259 Commits
main
...
dev.modelt
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1de2cc59db | ||
|
|
b805dc538e | ||
|
|
c2e13bc9c6 | ||
|
|
14c3914571 | ||
|
|
b9273b95e2 | ||
|
|
c263e2cf13 | ||
|
|
c9a50d50b7 | ||
|
|
9b0c57c238 | ||
|
|
24ea4d3ba4 | ||
|
|
e353c200d7 | ||
|
|
159266a603 | ||
|
|
7027414342 | ||
|
|
fc44929a7b | ||
|
|
0560bc439a | ||
|
|
8adacdb08c | ||
|
|
533347ee22 | ||
|
|
d1ff88da82 | ||
|
|
3f465991f0 | ||
|
|
96cbf4eabb | ||
|
|
f801afe0e4 | ||
|
|
b4ee8362a2 | ||
|
|
3021a51961 | ||
|
|
99b5198c9a | ||
|
|
b97282179d | ||
|
|
fdece42462 | ||
|
|
109ad9f36b | ||
|
|
fef933da9d | ||
|
|
c65f5e66fe | ||
|
|
f9545aca1d | ||
|
|
490edcfd53 | ||
|
|
9b5bb6d5f8 | ||
|
|
14b810c451 | ||
|
|
56d438f01a | ||
|
|
745424a978 | ||
|
|
e1549d4458 | ||
|
|
456ce724fe | ||
|
|
44307cd917 | ||
|
|
ffdb312d58 | ||
|
|
0007c38212 | ||
|
|
9c1043e0ba | ||
|
|
ee8e56798c | ||
|
|
1797571bb2 | ||
|
|
e93710af08 | ||
|
|
d3bba9b944 | ||
|
|
b1e7af0607 | ||
|
|
d3b1f7da91 | ||
|
|
c217f5dec9 | ||
|
|
49f0beb6ea | ||
|
|
05bb460999 | ||
|
|
948c3fd7ac | ||
|
|
87409fecd5 | ||
|
|
7e40a36701 | ||
|
|
d48815cca2 | ||
|
|
0f243eaac2 | ||
|
|
9c83d9fa71 | ||
|
|
a693cbb77e | ||
|
|
6f219f634f | ||
|
|
b303affd18 | ||
|
|
53c4decac7 | ||
|
|
c60da8ba82 | ||
|
|
3b5e6c099c | ||
|
|
ba3a718480 | ||
|
|
69fba7c3e9 | ||
|
|
76200d936d | ||
|
|
9b656e7918 | ||
|
|
9a797a0485 | ||
|
|
3b274ad807 | ||
|
|
8f5e2f2f0d | ||
|
|
da0bdf703b | ||
|
|
03cdca1f00 | ||
|
|
7188c8678a | ||
|
|
1eef25a697 | ||
|
|
e9165fb146 | ||
|
|
bbadd4c521 | ||
|
|
c2f9344c82 | ||
|
|
25f3a5d221 | ||
|
|
e8ff82c40a | ||
|
|
23d1eaf99e | ||
|
|
25a6ad1254 | ||
|
|
460d4f5188 | ||
|
|
c6ac6df2c2 | ||
|
|
15baba54ab | ||
|
|
87f24878f4 | ||
|
|
999141f886 | ||
|
|
8e095ebb7a | ||
|
|
149deb407d | ||
|
|
8a21cb1b73 | ||
|
|
d2a3dfe90f | ||
|
|
0f95aeb122 | ||
|
|
0ee6e48004 | ||
|
|
55e0d2ac23 | ||
|
|
9c5f42153f | ||
|
|
c74689d01d | ||
|
|
51f491d033 | ||
|
|
c5c0c61f79 | ||
|
|
6b9cb7cd35 | ||
|
|
e8894504c6 | ||
|
|
845d645348 | ||
|
|
09f7b39512 | ||
|
|
070dc1b744 | ||
|
|
8121c75a09 | ||
|
|
a5b8692a77 | ||
|
|
7c935d2700 | ||
|
|
a1d143187d | ||
|
|
0eef2148a9 | ||
|
|
856bd8909c | ||
|
|
2e595a3a23 | ||
|
|
2194cc7b4f | ||
|
|
1eae8582b2 | ||
|
|
eadba1fb82 | ||
|
|
aa765b4555 | ||
|
|
17d82f0a4e | ||
|
|
0975c19e69 | ||
|
|
3fe4e45ceb | ||
|
|
d19426fa62 | ||
|
|
63baf29805 | ||
|
|
b80b4e4112 | ||
|
|
7cfaf601b4 | ||
|
|
fbbe6226bb | ||
|
|
b3d444979f | ||
|
|
66bcf6e55f | ||
|
|
dbf1d99408 | ||
|
|
97bac464f3 | ||
|
|
9a8e726d74 | ||
|
|
7ab9b0358e | ||
|
|
30c2938d29 | ||
|
|
76f24d4eb0 | ||
|
|
89a0a1f4bb | ||
|
|
64e355e80c | ||
|
|
397e29742a | ||
|
|
ccacea18d8 | ||
|
|
b09bd4acba | ||
|
|
c9032cab09 | ||
|
|
7020c9e683 | ||
|
|
2fe1ce9e9a | ||
|
|
18fc2ba9d8 | ||
|
|
5acee1d1a5 | ||
| 2e36753da4 | |||
|
|
007f1e9554 | ||
|
|
c319398ca0 | ||
|
|
255d8a072d | ||
|
|
8167c9d435 | ||
|
|
bd72ad3571 | ||
|
|
6ddb7de9da | ||
|
|
564b0d712e | ||
|
|
e433941405 | ||
|
|
b46df4f91a | ||
|
|
d179e01971 | ||
|
|
b071145f6e | ||
|
|
ed0255e99b | ||
|
|
3e8b5c5579 | ||
|
|
8db35732f9 | ||
|
|
9552d61f8d | ||
|
|
be8a87ce01 | ||
|
|
5801a819e9 | ||
|
|
3f48b5c428 | ||
|
|
9972ab8a51 | ||
|
|
650b37c586 | ||
|
|
90012285b5 | ||
|
|
1bbb4a0999 | ||
|
|
e521b0704e | ||
|
|
ee0aa583d5 | ||
|
|
0a698e9837 | ||
|
|
9440a562f2 | ||
|
|
5eda131aac | ||
|
|
57884eaf2e | ||
|
|
4548a683c2 | ||
|
|
3eec49ffa5 | ||
|
|
0bc7f4b227 | ||
|
|
f28952b0a2 | ||
|
|
0b626a8e09 | ||
|
|
b254098532 | ||
|
|
ee88ffe4cf | ||
|
|
70b4bd8645 | ||
|
|
6316d2bfc4 | ||
|
|
87ca748f45 | ||
|
|
4315d70109 | ||
|
|
9a5d633b5e | ||
|
|
a6760cd52d | ||
|
|
a7eb92227d | ||
|
|
9f221e31cd | ||
|
|
47197194d5 | ||
|
|
0cdbf6f624 | ||
|
|
3e30489f86 | ||
|
|
8a22e453e4 | ||
|
|
7feb4eb857 | ||
|
|
70af19d356 | ||
|
|
a4b44ab2ee | ||
|
|
74b6b609dd | ||
|
|
59796c37cb | ||
|
|
f696f5950b | ||
|
|
605b496da7 | ||
|
|
7d693964dd | ||
|
|
25f401b577 | ||
|
|
14c5ade230 | ||
| 4c9c51f902 | |||
|
|
63c1a4a160 | ||
|
|
51114af853 | ||
|
|
3a6dca0681 | ||
|
|
346098d2b7 | ||
|
|
64f9b41378 | ||
|
|
ac1ed42c49 | ||
|
|
edd01a2c83 | ||
|
|
5aa9e3fcf3 | ||
|
|
0970cabf92 | ||
|
|
a26d92750f | ||
|
|
34c4782232 | ||
|
|
c5439533e6 | ||
|
|
8819b8e87f | ||
|
|
1076dc8aa6 | ||
|
|
3d15e03b09 | ||
|
|
0ee2ec6fcd | ||
|
|
95cfa5486c | ||
|
|
0d30e90ee0 | ||
|
|
faaba17a98 | ||
|
|
854e5f1d98 | ||
|
|
242d7f674f | ||
|
|
de8c2afceb | ||
|
|
f89dffff75 | ||
|
|
e39bad8348 | ||
|
|
7a1a221017 | ||
|
|
fafe6ae0f9 | ||
|
|
e32444df75 | ||
|
|
b74b7ac4f0 | ||
|
|
22134391d9 | ||
|
|
82c9023849 | ||
|
|
00b87e01ea | ||
|
|
ce3d4bf6c5 | ||
|
|
c415b175a0 | ||
|
|
ec81ea7930 | ||
|
|
4bb03f86b3 | ||
|
|
e5f201f3db | ||
|
|
1c715dc569 | ||
|
|
6686b47328 | ||
|
|
9a5a7d84fd | ||
|
|
9678ece9c0 | ||
|
|
67bcd732b5 | ||
|
|
1a4f900500 | ||
|
|
ca8729b67c | ||
|
|
9dbffc52ed | ||
|
|
b7f504942a | ||
|
|
7f0c5ce8d3 | ||
|
|
9838e287a4 | ||
|
|
ca6143ea3c | ||
|
|
16e7ab4d9f | ||
|
|
28723ab662 | ||
|
|
3e59efcf33 | ||
|
|
7c04309cc1 | ||
|
|
db87295890 | ||
|
|
61568200a8 | ||
|
|
8df2736b97 | ||
|
|
eb5b7f629a | ||
|
|
79232b391e | ||
|
|
72eb937b47 | ||
|
|
cececa14ce | ||
|
|
2487d44abd | ||
|
|
553b86cac2 | ||
|
|
12bd781fd3 | ||
|
|
463f4907b8 |
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1,2 +1,3 @@
|
||||
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
|
||||
Assets/** filter=lfs diff=lfs merge=lfs -text
|
||||
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text
|
||||
|
||||
7
.gitignore
vendored
7
.gitignore
vendored
@ -189,7 +189,8 @@ ipython_config.py
|
||||
.LSOverride
|
||||
|
||||
# Icon must end with two \r
|
||||
Icon
|
||||
Icon
|
||||
|
||||
|
||||
# Thumbnails
|
||||
._*
|
||||
@ -251,3 +252,7 @@ $RECYCLE.BIN/
|
||||
# .nfs files are created when an open file is removed but is still being accessed
|
||||
.nfs*
|
||||
|
||||
# ---> Custom
|
||||
**/Tmp/**
|
||||
**/cache/**
|
||||
!**/.gitkeep
|
||||
|
||||
14
.vscode/extensions.json
vendored
Normal file
14
.vscode/extensions.json
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
{
|
||||
"recommendations": [
|
||||
"bierner.github-markdown-preview",
|
||||
"bierner.markdown-checkbox",
|
||||
"bierner.markdown-emoji",
|
||||
"bierner.markdown-footnotes",
|
||||
"bierner.markdown-mermaid",
|
||||
"bierner.markdown-preview-github-styles",
|
||||
"bierner.markdown-yaml-preamble",
|
||||
"davidanson.vscode-markdownlint",
|
||||
"kejun.markdown-alert",
|
||||
"yzhang.markdown-all-in-one"
|
||||
]
|
||||
}
|
||||
16
.vscode/launch.json
vendored
Normal file
16
.vscode/launch.json
vendored
Normal file
@ -0,0 +1,16 @@
|
||||
{
|
||||
// Use IntelliSense to learn about possible attributes.
|
||||
// Hover to view descriptions of existing attributes.
|
||||
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Python Debugger: Current File with Arguments",
|
||||
"type": "debugpy",
|
||||
"request": "launch",
|
||||
"program": "${file}",
|
||||
"console": "integratedTerminal",
|
||||
"args": "${command:pickArgs}"
|
||||
}
|
||||
]
|
||||
}
|
||||
55
.vscode/settings.json
vendored
Normal file
55
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,55 @@
|
||||
{
|
||||
// Always treat the project root as the working dir for Jupyter
|
||||
"jupyter.notebookFileRoot": "${workspaceFolder}",
|
||||
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
|
||||
"python.terminal.executeInFileDir": false,
|
||||
// Start new integrated terminals at the project root
|
||||
"terminal.integrated.cwd": "${workspaceFolder}",
|
||||
// Make pytest run from the root without needing a pytest.ini
|
||||
"python.testing.pytestEnabled": true,
|
||||
"python.testing.cwd": "${workspaceFolder}",
|
||||
"python.testing.pytestArgs": [
|
||||
"src/test"
|
||||
],
|
||||
// Help Pylance resolve imports like `from src...` without red squiggles
|
||||
"python.analysis.extraPaths": [
|
||||
"${workspaceFolder}"
|
||||
],
|
||||
// For linux
|
||||
"terminal.integrated.env.linux": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
},
|
||||
// For OSX
|
||||
"terminal.integrated.env.osx": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
},
|
||||
// For Windows
|
||||
"terminal.integrated.env.windows": {
|
||||
"PYTHONPATH": "${workspaceFolder}"
|
||||
},
|
||||
"python.analysis.typeCheckingMode": "standard"
|
||||
}
|
||||
// {
|
||||
// // Always treat the project root as the working dir for Jupyter
|
||||
// "jupyter.notebookFileRoot": "${workspaceFolder}",
|
||||
//
|
||||
// // When you click "Run Python File in Terminal", DON'T cd into the file's folder
|
||||
// "python.terminal.executeInFileDir": false,
|
||||
//
|
||||
// // Start new integrated terminals at the project root
|
||||
// "terminal.integrated.cwd": "${workspaceFolder}",
|
||||
//
|
||||
// // Ensure Python can import from the project root no matter which file you run
|
||||
// // (so `src/` is importable from the project root). Windows shown here; add linux/osx if needed.
|
||||
// "terminal.integrated.env.windows": {
|
||||
// "PYTHONPATH": "${workspaceFolder}"
|
||||
// },
|
||||
//
|
||||
// // Make pytest run from the root without needing a pytest.ini
|
||||
// "python.testing.pytestEnabled": true,
|
||||
// "python.testing.cwd": "${workspaceFolder}",
|
||||
// "python.testing.pytestArgs": ["src/test"],
|
||||
//
|
||||
// // Help Pylance resolve imports like `from src...` without red squiggles
|
||||
// "python.analysis.extraPaths": ["${workspaceFolder}"]
|
||||
// }
|
||||
BIN
Assets/Dataset/1-hop/curated/corpus.txt
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/curated/corpus.txt
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
Assets/Dataset/1-hop/dataset.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/dataset.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/movie-pageid.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/movie-pageid.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/movies.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/movies.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/reverse.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/reverse.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/small/corpus.txt
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/small/corpus.txt
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
Assets/Dataset/1-hop/small/rdf_completation.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/small/rdf_completation.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/small/rdf_text.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/small/rdf_text.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/toy/corpus.txt
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/toy/corpus.txt
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
Assets/Dataset/1-hop/toy/rdf_completation.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/toy/rdf_completation.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/toy/rdf_mask.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/toy/rdf_mask.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/toy/rdf_text.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/toy/rdf_text.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/wikipedia-movie.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/wikipedia-movie.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/wikipedia-summary.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/wikipedia-summary.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/DatawareHouse/dataset.db
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/DatawareHouse/dataset.db
(Stored with Git LFS)
Normal file
Binary file not shown.
0
Assets/Dataset/Tmp/.gitkeep
Normal file
0
Assets/Dataset/Tmp/.gitkeep
Normal file
BIN
Assets/Model/toy_10/README.md
(Stored with Git LFS)
Normal file
BIN
Assets/Model/toy_10/README.md
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
Assets/Model/toy_10/toy_dictionary.json
(Stored with Git LFS)
Normal file
BIN
Assets/Model/toy_10/toy_dictionary.json
(Stored with Git LFS)
Normal file
Binary file not shown.
196
Playgrounds/doctor.ipynb
Normal file
196
Playgrounds/doctor.ipynb
Normal file
File diff suppressed because one or more lines are too long
125
Playgrounds/doctor.py
Normal file
125
Playgrounds/doctor.py
Normal file
@ -0,0 +1,125 @@
|
||||
import random
|
||||
import torch
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
import Project_Model.Libs.Embedder as Embedder
|
||||
import Project_Model.Libs.BPE as BPE
|
||||
import Project_Model.Libs.Transformer as Transformer
|
||||
import Project_Model.Libs.TorchShims as torch_shims
|
||||
from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr
|
||||
from Project_Model.Libs.Training.logistic_collector import LogitsCollector # import the external collector
|
||||
|
||||
# Playground script: trains the toy encoder-decoder on a single fixed
# mini-batch, one decoder position at a time with teacher forcing, and prints
# the decoded predictions after every epoch.

# Seed the RNGs used below so repeated runs are comparable.
torch.manual_seed(0)
random.seed(0)
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)

# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()

VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)

# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = int(1e3)

PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]

# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)

TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []

for _, row in TOY_DATASET.iterrows():
    rdfs: str = row["RDFs"]
    abstract: str = row["Abstract"]

    input_tokens = TOKENANO.encode(rdfs)  # encoder input ids
    # Decoder target ids: the first encoded token is dropped (shifted left).
    # NOTE(review): presumably that first token is a start marker - confirm.
    output_tokens = TOKENANO.encode(abstract)[1:]
    decoder_default_tokens = TOKENANO.encode("<SOS>")  # decoder input starts with <SOS>

    input_tokens, padding = Transformer.normalize_sequence(
        input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    )  # pad/trim + end token
    output_tokens, _ = Transformer.normalize_sequence(
        output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
    )  # pad/trim + end token
    decoder_default_tokens = Transformer.pad_sequence(
        decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN
    )  # pad with PAD up to SENTENCE_LENGTH

    TOY_BATCH_INPUT_LIST.append(input_tokens)
    TOY_BATCH_PADDING_LIST.append(padding)
    TOY_BATCH_TARGET_LIST.append(output_tokens)
    TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)

# Fail early with a readable message instead of an IndexError on tgt.size(1).
if not TOY_BATCH_INPUT_LIST:
    raise ValueError(f"No training rows found in {TOY_DATASET_PATH}")

# Training loop
LOSS_HISTORY = []  # one mean loss value per epoch
NANOSOCRATES = Transformer.TrainingModel(
    TOKEN_SPACE_SIZE,
    EMBEDDED_SIZE,
    FEED_FORWARD_MULTIPLIER,
    ATTENTION_HEADS,
    NUMBER_OF_BLOCKS,
)

collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO)  # collects logits and decodes

NANOSOCRATES.train()
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
# NOTE(review): the scheduler is built without a reference to the optimizer;
# confirm that scheduler.step() really updates the optimizer's learning rate.
scheduler = Custom_lr(EMBEDDED_SIZE, 4000)  # step each optimizer step

current_epoch = 0
BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST))  # small batch to stabilize

# Simple fixed mini-batch from the top; later you can shuffle/slice.
# These tensors never change between epochs, so they are built only once.
enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long)  # [B,T] encoder token ids
pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool)  # [B,T] encoder padding mask
tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long)  # [B,T] decoder targets (ground-truth)
# Decoder prefix template: <SOS> at pos 0, PAD elsewhere (no shift here).
dec_template = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long)  # [B,T]

T = tgt.size(1)  # sequence length

# CrossEntropyLoss(ignore_index=PAD) averages over the non-ignored targets, so
# a position where every target is PAD yields NaN (0/0) and carries no
# learning signal. Pre-compute which positions are worth an optimizer step.
step_has_target: list[bool] = tgt.ne(PAD_TOKEN).any(dim=0).tolist()

while current_epoch < MAX_EPOCHS:
    # Fresh decoder buffer for this epoch; it is filled step by step below.
    dec = dec_template.clone()

    total_loss = 0.0  # sum of the per-step losses of this epoch
    trained_steps = 0  # number of steps that contributed to total_loss
    collector.reset()  # start fresh for this epoch

    for t in range(T):
        optimizer.zero_grad(set_to_none=True)  # clear grads for this token step

        prefix = dec[:, : t + 1]  # [B, t+1] current decoder prefix
        dec_pad_mask = prefix.eq(PAD_TOKEN)  # [B, t+1] True where PAD inside prefix
        target_t = tgt[:, t]  # [B] ground-truth token for this position

        # one-step logits given prefix (trainer model expects 4 args now)
        logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask))  # [B,V] logits for step t
        collector.add(logits_t)  # store logits for decoding later

        # Only optimize on positions with at least one real (non-PAD) target.
        if step_has_target[t]:
            loss_t = cross_entropy(logits_t, target_t)  # CE expects raw logits; PAD ignored
            loss_t.backward()  # backprop for this step
            optimizer.step()  # update params
            scheduler.step()  # Noam/warmup: step per optimizer step

            total_loss += float(loss_t.detach())
            trained_steps += 1

        # teacher forcing: reveal the correct token for next position
        if t < T - 1:
            dec[:, t + 1] = target_t  # write ground-truth into next slot

    current_epoch += 1
    # Report the mean loss over the trained positions rather than only the
    # last position, which is frequently padding.
    mean_loss = total_loss / trained_steps if trained_steps else float("nan")
    LOSS_HISTORY.append(mean_loss)
    print(f"EPOCH {current_epoch}\n\tLoss: {mean_loss:.6f}")  # simple log
    collector.print_decoded()  # print decoded predictions for the batch
|
||||
182
Playgrounds/embedder.ipynb
Normal file
182
Playgrounds/embedder.ipynb
Normal file
File diff suppressed because one or more lines are too long
308
Playgrounds/encoder-decoder.ipynb
Normal file
308
Playgrounds/encoder-decoder.ipynb
Normal file
@ -0,0 +1,308 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "7a311d4b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712]]\n",
|
||||
"3\n",
|
||||
"Embedder Tensor: torch.Size([3, 16, 256])\n",
|
||||
"Values:\n",
|
||||
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||
" ...,\n",
|
||||
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
|
||||
"\n",
|
||||
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||
" ...,\n",
|
||||
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
|
||||
"\n",
|
||||
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||
" ...,\n",
|
||||
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]]],\n",
|
||||
" grad_fn=<AddBackward0>)\n",
|
||||
"ENCODER Tensor: torch.Size([3, 1, 256])\n",
|
||||
"Values:\n",
|
||||
"tensor([[[ 8.0069e-01, 4.0532e-01, -1.8316e+00, -1.3902e+00, -1.1784e+00,\n",
|
||||
" 1.3667e+00, -9.7890e-01, 6.0696e-01, -1.4899e+00, 5.5765e-01,\n",
|
||||
" 4.5991e-02, 5.1214e-01, 3.1901e-01, 4.7577e-01, -2.9585e-01,\n",
|
||||
" -1.0811e+00, -1.5281e+00, -6.3773e-01, -9.5954e-01, 1.8497e+00,\n",
|
||||
" -1.1789e+00, -9.7387e-01, 1.1931e-01, -7.2703e-01, 5.3108e-01,\n",
|
||||
" -6.4877e-01, -4.5188e-01, 1.5185e+00, -8.3408e-01, 3.2824e-01,\n",
|
||||
" -1.8166e+00, 1.9548e+00, -5.2419e-01, -1.0693e+00, -1.8510e+00,\n",
|
||||
" 1.5440e+00, -3.2370e-01, -1.3990e+00, -4.6940e-01, 6.5840e-02,\n",
|
||||
" -9.2057e-01, 1.2513e+00, -5.9168e-01, 7.8198e-01, -1.3121e+00,\n",
|
||||
" 1.1492e+00, -2.3695e-01, -1.8935e+00, 1.1639e+00, -5.8169e-01,\n",
|
||||
" 2.5051e-01, -8.1654e-01, -1.0328e+00, 1.4285e+00, -8.1485e-01,\n",
|
||||
" 1.0614e+00, -3.3834e-01, -4.1667e-02, -1.1920e-01, 3.1383e-01,\n",
|
||||
" -5.9857e-01, 1.7327e-01, -1.6854e+00, -1.5174e+00, -2.6508e-01,\n",
|
||||
" -6.0082e-01, 5.1468e-01, 2.7909e-01, -2.5296e-01, -1.4670e+00,\n",
|
||||
" -1.3587e+00, -8.8864e-02, 3.2825e-01, 1.0950e+00, -1.0371e+00,\n",
|
||||
" 1.1744e+00, 5.2984e-01, 4.1751e-01, -9.8803e-01, 3.5631e-01,\n",
|
||||
" 4.7484e-01, 2.2435e-01, 1.4022e+00, 1.2242e+00, 1.1447e+00,\n",
|
||||
" -5.4052e-01, -9.1786e-01, -1.2299e+00, 1.1656e+00, 9.1570e-01,\n",
|
||||
" 1.8956e+00, 7.4344e-01, 4.2187e-01, -9.5426e-02, -3.2428e-01,\n",
|
||||
" 9.6364e-01, -2.3252e-01, 2.9036e-01, -2.4432e+00, 9.8019e-01,\n",
|
||||
" -4.6697e-02, 8.3910e-01, -4.3541e-01, -7.1915e-01, -7.5638e-01,\n",
|
||||
" 9.0217e-01, 2.0919e+00, -7.9533e-01, -1.5413e-01, -6.9260e-01,\n",
|
||||
" -1.3086e+00, 7.8925e-01, 1.8855e-01, 7.4043e-01, -3.8834e-01,\n",
|
||||
" 1.0272e-02, 1.0763e+00, 4.2142e-01, 6.6520e-01, 4.5996e-01,\n",
|
||||
" -8.5060e-01, -9.0101e-01, -4.2090e-01, 2.5596e-01, -1.4946e+00,\n",
|
||||
" 1.0925e-01, -7.5359e-01, -3.0447e-01, 1.0679e+00, 1.9398e+00,\n",
|
||||
" 8.1472e-01, 1.3498e+00, 1.1107e+00, 6.3288e-01, 3.1149e-01,\n",
|
||||
" -1.9333e+00, -1.5274e+00, 2.1794e-01, -3.1895e-02, 1.0756e+00,\n",
|
||||
" 1.0215e+00, 1.6938e+00, -1.0939e+00, 2.2690e+00, -7.0921e-01,\n",
|
||||
" 6.4212e-01, -6.5468e-01, 1.6839e+00, 5.7296e-01, -1.4031e+00,\n",
|
||||
" 3.9133e-01, -5.3541e-01, 4.3439e-01, -1.6785e+00, 5.2030e-03,\n",
|
||||
" 4.5155e-01, -7.0953e-01, -1.9656e-01, -3.8671e-02, -1.0927e+00,\n",
|
||||
" -3.0405e-01, -1.3818e-02, -3.7748e-01, 1.4412e+00, -1.4254e-01,\n",
|
||||
" 7.9939e-01, -8.5402e-01, -1.0330e+00, 1.7661e+00, -3.6084e-01,\n",
|
||||
" 1.5622e+00, 1.0240e+00, 1.9056e-01, -4.1480e-01, 6.9056e-01,\n",
|
||||
" 1.7204e+00, -9.9218e-01, -1.6504e-01, -1.1807e+00, 1.0827e+00,\n",
|
||||
" 1.5973e+00, 1.4849e-01, -2.2867e+00, 7.7322e-01, -6.8401e-01,\n",
|
||||
" -6.0493e-01, 1.0616e+00, -1.8034e-01, -1.8828e+00, 1.1031e-01,\n",
|
||||
" 2.5452e-01, -4.2489e-02, 8.1171e-01, 1.3429e+00, -6.5058e-01,\n",
|
||||
" -1.3531e+00, -1.2263e+00, 1.1226e+00, 1.2407e+00, -9.7453e-01,\n",
|
||||
" 9.4696e-01, 6.6186e-01, -5.0804e-01, 1.2647e-01, -1.1777e+00,\n",
|
||||
" 6.8443e-02, -1.3043e-01, 2.9595e-01, -1.5330e+00, -6.5733e-01,\n",
|
||||
" 1.1291e+00, 6.9629e-01, 4.4690e-01, 8.0151e-01, -1.2406e+00,\n",
|
||||
" 2.6085e+00, -2.0310e-01, -1.0226e+00, -6.9182e-02, 7.6600e-01,\n",
|
||||
" -9.9842e-01, 2.0896e+00, 2.6334e-01, -1.1559e-01, -6.6876e-01,\n",
|
||||
" -6.6295e-01, -1.6461e-01, 2.8270e+00, 3.2727e-01, 1.3724e+00,\n",
|
||||
" -1.0749e+00, 3.7782e-01, -1.5472e+00, 3.0822e-01, 5.7273e-02,\n",
|
||||
" 3.9136e-01, 8.2948e-01, 2.1438e-01, -9.8623e-01, 5.6053e-01,\n",
|
||||
" -1.5617e+00, -3.9595e-01, 1.0451e-02, -1.1860e+00, -1.4994e-01,\n",
|
||||
" 1.6566e+00, 2.0369e+00, -4.3995e-01, -4.4262e-01, -3.1014e-01,\n",
|
||||
" 5.9083e-01, -1.0765e+00, -5.2906e-01, 4.6039e-02, -1.0154e+00,\n",
|
||||
" 5.9942e-01]],\n",
|
||||
"\n",
|
||||
" [[ 1.2683e+00, -4.3200e-01, -1.3333e+00, -3.6705e-01, -5.8895e-01,\n",
|
||||
" 9.9266e-01, -4.2914e-01, 9.2765e-01, -1.0935e+00, 1.4975e+00,\n",
|
||||
" -5.3739e-01, -2.8332e-01, 9.1166e-01, 1.5010e+00, -2.1787e-01,\n",
|
||||
" -1.4258e+00, -2.7524e-01, -1.2602e+00, 2.0117e-01, 2.3906e+00,\n",
|
||||
" -9.6397e-01, -7.5872e-01, 3.3948e-01, -7.9353e-01, 9.1668e-01,\n",
|
||||
" 8.7734e-04, -3.0271e-01, 1.7087e+00, -1.0273e+00, 1.5174e+00,\n",
|
||||
" -2.6405e-02, 1.4236e+00, -9.9093e-01, 5.4787e-01, -1.0904e+00,\n",
|
||||
" 5.2156e-01, -6.3470e-01, -7.7688e-01, -1.2538e+00, -3.9307e-01,\n",
|
||||
" -7.6707e-01, 1.3733e+00, -7.2709e-01, 1.1185e+00, -1.5860e+00,\n",
|
||||
" -2.6148e-01, -3.7984e-01, -1.3604e+00, 9.2864e-02, -7.9642e-01,\n",
|
||||
" 1.0956e+00, 3.1202e-01, -4.1234e-01, 3.6488e-02, -1.4639e+00,\n",
|
||||
" 1.0947e+00, -7.9230e-01, 4.6913e-01, -2.3407e-01, 4.1768e-02,\n",
|
||||
" -1.5921e+00, 6.9743e-01, -7.0222e-01, -5.4705e-01, -6.5663e-01,\n",
|
||||
" -4.1810e-01, 2.7744e-01, 7.9178e-01, 7.5886e-01, -7.6302e-01,\n",
|
||||
" -1.2204e+00, -1.1103e+00, -1.3646e-01, 1.9589e+00, -1.3637e+00,\n",
|
||||
" 9.0804e-01, 2.3094e-01, -5.5953e-02, -6.7626e-01, 1.4242e+00,\n",
|
||||
" 1.0167e+00, 1.0705e+00, 2.2947e+00, 9.1274e-01, 1.2281e+00,\n",
|
||||
" -7.0638e-01, -1.2249e+00, -8.9208e-02, 1.1016e+00, 1.1940e+00,\n",
|
||||
" 3.5834e-01, 1.2961e+00, -4.6674e-01, 3.4572e-01, -4.3458e-01,\n",
|
||||
" 1.1008e+00, 3.7783e-01, -6.5841e-01, -2.3127e+00, 1.4617e+00,\n",
|
||||
" -1.2826e-01, 1.3463e-01, -8.5268e-01, -8.4144e-01, -1.8594e+00,\n",
|
||||
" 1.9260e-01, 1.6432e+00, -2.0640e-02, -5.0030e-01, -1.5334e-01,\n",
|
||||
" -6.1072e-01, -1.3694e-01, -3.7308e-01, 1.6603e+00, 1.1246e-01,\n",
|
||||
" 6.0823e-02, 7.8749e-01, -1.7002e-01, 1.2058e+00, 8.5615e-01,\n",
|
||||
" 1.2525e-01, -1.0584e+00, -4.7931e-01, 1.4088e-01, -1.8149e+00,\n",
|
||||
" 1.4654e+00, -1.0936e+00, 5.3182e-01, 9.5694e-01, 3.2472e+00,\n",
|
||||
" 3.4877e-01, 1.8491e+00, -1.5184e-01, 1.4711e+00, -7.6064e-01,\n",
|
||||
" -2.2144e+00, -1.8952e+00, -4.9502e-01, -6.6836e-01, 1.4946e+00,\n",
|
||||
" 6.7616e-01, 1.1501e+00, -9.4747e-01, 1.1009e+00, -1.4211e+00,\n",
|
||||
" 3.9528e-01, -9.5220e-01, 1.4886e+00, 7.1784e-01, -1.9941e+00,\n",
|
||||
" 6.7901e-02, -1.3109e-01, 1.1695e+00, 1.2861e-01, -2.8123e-01,\n",
|
||||
" -6.1611e-01, 1.5513e-01, -3.9289e-01, -4.5543e-02, -2.8628e-01,\n",
|
||||
" 2.6118e-01, 2.2623e-01, -6.3705e-01, 7.3591e-01, -7.8799e-01,\n",
|
||||
" 2.5053e-01, -1.5923e-01, -4.9584e-01, 1.9009e+00, -2.3263e-01,\n",
|
||||
" 1.2213e+00, 1.0313e+00, 2.0177e-02, -6.2209e-01, -3.5161e-01,\n",
|
||||
" 1.5143e+00, -7.2332e-02, 2.3909e-02, -2.1261e+00, 8.5199e-01,\n",
|
||||
" 1.9084e+00, 4.6845e-02, -2.3554e+00, 1.3735e+00, -7.3909e-01,\n",
|
||||
" -8.3949e-01, -3.9314e-01, -4.3324e-01, -9.6804e-01, -5.3124e-01,\n",
|
||||
" -6.5091e-01, -1.1738e+00, 1.3315e+00, 6.5606e-01, -1.4131e-01,\n",
|
||||
" -1.7712e+00, -1.1628e+00, 9.6813e-01, 8.7314e-01, -9.8027e-01,\n",
|
||||
" 6.9376e-01, 5.3878e-01, -1.6169e+00, 2.2860e-01, -6.2179e-01,\n",
|
||||
" -1.1043e-01, -3.9658e-01, 2.8712e-01, 8.2201e-02, 2.0888e-01,\n",
|
||||
" -5.9884e-01, 7.3092e-01, 6.9128e-01, 5.3977e-01, -1.5728e+00,\n",
|
||||
" 1.6878e+00, -8.2669e-01, -9.8076e-01, -3.4203e-01, 4.6939e-02,\n",
|
||||
" -1.3158e-01, 2.1923e+00, -6.6483e-02, -4.0687e-01, -1.2715e+00,\n",
|
||||
" -8.1549e-01, -1.2047e+00, 1.3547e+00, -4.2072e-01, 1.1674e+00,\n",
|
||||
" -5.1421e-01, 1.3055e+00, -1.1277e+00, 1.8372e+00, -1.1215e+00,\n",
|
||||
" 1.4797e+00, 2.8354e-01, -6.3974e-01, -1.2869e+00, -2.7897e-01,\n",
|
||||
" -1.0397e+00, 1.8622e-01, -5.0397e-02, -4.4865e-02, -7.6067e-01,\n",
|
||||
" 1.7715e+00, 1.5040e+00, -2.6854e-01, -5.2802e-01, -5.3407e-01,\n",
|
||||
" 2.0313e-02, -2.6276e-01, -7.0748e-01, -8.7328e-01, -3.4108e-01,\n",
|
||||
" 1.4313e+00]],\n",
|
||||
"\n",
|
||||
" [[ 7.7464e-01, -4.2187e-01, -2.0571e+00, -8.6709e-01, -1.5722e+00,\n",
|
||||
" 4.9540e-01, -1.5270e+00, 1.0499e+00, -1.9579e+00, -2.5298e-02,\n",
|
||||
" 4.3419e-01, 5.8822e-01, 1.3392e+00, 6.9604e-01, -9.7883e-01,\n",
|
||||
" -9.1354e-01, -9.1852e-01, -6.0951e-01, -6.6255e-02, 1.3907e+00,\n",
|
||||
" -6.2912e-01, -2.7524e-01, 1.9520e-02, -2.7154e-01, 1.5162e-01,\n",
|
||||
" 1.3318e-02, -8.9196e-01, 9.0976e-01, -1.3544e+00, 2.4276e-01,\n",
|
||||
" -7.4038e-01, 9.7062e-01, 3.2011e-01, 3.4486e-01, -2.3374e+00,\n",
|
||||
" 1.3311e+00, -3.1871e-02, -1.4468e+00, -1.5968e+00, 3.0418e-01,\n",
|
||||
" -7.7136e-01, 1.3427e+00, -1.2493e+00, 1.4114e+00, -1.2475e+00,\n",
|
||||
" 7.0239e-01, -9.6120e-02, -4.4365e-01, 5.3238e-01, -1.4933e+00,\n",
|
||||
" 5.4476e-01, -1.8490e-02, -5.9936e-01, 1.0878e+00, -1.8892e+00,\n",
|
||||
" 1.2810e+00, -1.0747e+00, 5.3514e-01, 1.7422e-01, 1.1354e+00,\n",
|
||||
" -7.4837e-01, 4.0327e-01, -1.8950e+00, -7.2336e-01, 2.4441e-01,\n",
|
||||
" -1.3650e-01, -4.8344e-01, 3.3921e-02, 5.0889e-01, -1.3769e+00,\n",
|
||||
" -2.5907e-01, -2.7549e-01, -1.9128e-01, 1.9751e+00, -7.1191e-01,\n",
|
||||
" 5.1910e-01, 1.0902e-01, 2.9995e-01, -3.5180e-01, -6.2139e-01,\n",
|
||||
" 7.2905e-01, -5.3177e-01, 4.3340e-01, 1.0071e+00, 1.7586e+00,\n",
|
||||
" -3.9963e-01, -2.5139e-01, -9.4213e-01, 9.2847e-01, 1.1298e+00,\n",
|
||||
" 7.8545e-01, 1.3188e+00, 3.7466e-01, 9.0773e-01, -4.0454e-02,\n",
|
||||
" 1.3444e+00, 6.0301e-01, 8.9929e-02, -2.0754e+00, 4.8614e-01,\n",
|
||||
" -9.7160e-01, 8.2446e-01, -1.1813e+00, -9.6185e-01, -9.2922e-02,\n",
|
||||
" 6.0154e-01, 1.6640e+00, -1.0461e+00, 1.5868e-01, -5.7239e-01,\n",
|
||||
" -6.2726e-01, 3.2848e-01, 5.9609e-01, 1.5563e+00, -4.0883e-01,\n",
|
||||
" 4.4902e-01, 1.4004e+00, 2.2426e-01, 3.8314e-01, -2.0641e-01,\n",
|
||||
" -1.6465e-01, -6.4645e-01, 1.5772e-01, 6.8907e-01, -1.2703e+00,\n",
|
||||
" 1.8914e-01, -6.2678e-01, 3.0179e-01, 1.2687e+00, 1.6849e+00,\n",
|
||||
" 1.5690e+00, 1.0999e+00, 1.5820e+00, -6.4808e-01, 5.1003e-01,\n",
|
||||
" -1.6674e+00, -1.2224e+00, 1.9769e-01, -1.3883e-01, 1.2179e+00,\n",
|
||||
" 1.2971e+00, 4.6259e-01, -5.8717e-01, 1.4532e+00, -1.0540e+00,\n",
|
||||
" 2.8689e-01, -1.3895e+00, 1.4014e+00, -4.0430e-01, -2.6099e+00,\n",
|
||||
" -1.0293e+00, -1.1097e+00, 8.6266e-01, -1.0535e+00, 7.1789e-01,\n",
|
||||
" 6.0642e-01, -1.2493e+00, -3.7762e-01, -4.1281e-02, -7.3049e-01,\n",
|
||||
" -7.2913e-04, -7.3122e-02, -2.3850e-01, 1.2546e+00, 1.8802e-01,\n",
|
||||
" 1.3135e+00, -5.0367e-01, 1.2456e-01, 2.7475e+00, -1.2486e+00,\n",
|
||||
" 1.4441e+00, 8.7469e-01, -5.6901e-01, -1.2145e-01, 3.1091e-01,\n",
|
||||
" 1.9406e+00, -8.1891e-01, 3.1316e-02, -1.2867e+00, 8.0780e-01,\n",
|
||||
" 7.0041e-01, 2.8903e-01, -1.6387e+00, 6.6553e-01, -1.3696e+00,\n",
|
||||
" -7.9454e-01, 3.3899e-01, -5.5822e-01, -8.1969e-01, -1.2410e-01,\n",
|
||||
" -3.7024e-01, -7.2536e-01, 7.5648e-01, 1.6899e+00, -1.7404e-01,\n",
|
||||
" -1.7191e+00, -7.2603e-01, 1.5046e+00, 8.3216e-01, -1.5304e+00,\n",
|
||||
" -1.8264e-01, 3.3451e-01, -5.6636e-02, 6.1099e-01, -9.8517e-01,\n",
|
||||
" 4.4856e-01, -8.6275e-01, 6.9264e-02, -1.1572e+00, 2.3373e-01,\n",
|
||||
" 5.9896e-01, 1.2384e-01, 1.0309e+00, 1.4273e+00, -8.4776e-01,\n",
|
||||
" 2.6236e+00, -9.0133e-01, -4.0009e-01, -4.9727e-01, 3.7945e-01,\n",
|
||||
" -9.0712e-01, 1.5725e+00, 1.6298e-01, 1.1544e-01, -4.3125e-01,\n",
|
||||
" -8.7131e-01, -2.5880e-01, 2.9032e+00, 2.7154e-01, 1.3677e+00,\n",
|
||||
" -8.8544e-01, 5.6083e-01, -1.8256e+00, 9.4832e-01, -1.0762e+00,\n",
|
||||
" 7.5421e-01, 6.5008e-01, -8.6361e-01, -1.4911e+00, -7.5930e-02,\n",
|
||||
" -1.6896e+00, 1.5223e-02, -1.5283e-01, -1.8741e+00, 1.1400e-01,\n",
|
||||
" 1.8822e+00, 2.6615e+00, 2.1607e-01, -5.6243e-01, 3.6730e-01,\n",
|
||||
" 4.0374e-01, -1.1973e+00, -5.3006e-01, -3.4750e-01, -4.4187e-01,\n",
|
||||
" 7.4358e-01]]], grad_fn=<NativeLayerNormBackward0>)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import torch\n",
|
||||
"from pathlib import Path\n",
|
||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||
"import Project_Model.Libs.BPE as BPE\n",
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"\n",
|
||||
"# set a fixed seed\n",
|
||||
"torch.manual_seed(0)\n",
|
||||
"random.seed(0)\n",
|
||||
"\n",
|
||||
"TEXT = (\n",
|
||||
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
|
||||
")\n",
|
||||
"OUT_TEXT = \"<START>\"\n",
|
||||
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||
"\n",
|
||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||
"\n",
|
||||
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||
"\n",
|
||||
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
|
||||
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
|
||||
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
|
||||
"\n",
|
||||
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||
"BATCH_LEN = 3\n",
|
||||
"\n",
|
||||
"INPUT_TOKENIZATION = [\n",
|
||||
" EN_IN\n",
|
||||
"] * BATCH_LEN\n",
|
||||
"OUTPUT_TOKENIZATION = [\n",
|
||||
" DEC_IN\n",
|
||||
"] * BATCH_LEN\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(INPUT_TOKENIZATION)\n",
|
||||
"\n",
|
||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
|
||||
"EMBEDDED_SIZE = 256\n",
|
||||
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
|
||||
"\n",
|
||||
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||
"encoder_tensor: torch.Tensor = EMBEDDER(INPUT_TOKENIZATION)\n",
|
||||
"ENCODER = torch.nn.Sequential(\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
")\n",
|
||||
"decoder_tensor: torch.Tensor = EMBEDDER(OUTPUT_TOKENIZATION)\n",
|
||||
"DECODER = torch.nn.Sequential(\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"print(len(INPUT_TOKENIZATION))\n",
|
||||
"print(f\"Embedder Tensor: {encoder_tensor.shape}\")\n",
|
||||
"print(f\"Values:\\n{encoder_tensor}\")\n",
|
||||
"\n",
|
||||
"BATCH_SIZE, TOKENS, DIMENSIONS = encoder_tensor.shape\n",
|
||||
"PAD_MASK = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
|
||||
"\n",
|
||||
"encoder_out, _ = ENCODER((encoder_tensor, PAD_MASK))\n",
|
||||
"tensor: torch.Tensor\n",
|
||||
"tensor, _, _, _ = DECODER((decoder_tensor, encoder_out, encoder_out, None))\n",
|
||||
"\n",
|
||||
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
|
||||
"print(f\"Values:\\n{tensor}\")"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
131
Playgrounds/encoder.ipynb
Normal file
131
Playgrounds/encoder.ipynb
Normal file
@ -0,0 +1,131 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "c64b0e24",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700]]\n",
|
||||
"2\n",
|
||||
"Embedder Tensor: torch.Size([2, 16, 256])\n",
|
||||
"Values:\n",
|
||||
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||
" ...,\n",
|
||||
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]],\n",
|
||||
"\n",
|
||||
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
|
||||
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
|
||||
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
|
||||
" ...,\n",
|
||||
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
|
||||
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
|
||||
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]]],\n",
|
||||
" grad_fn=<AddBackward0>)\n",
|
||||
"ENCODER Tensor: torch.Size([2, 16, 256])\n",
|
||||
"Values:\n",
|
||||
"tensor([[[-1.6325, 0.4094, -2.1403, ..., 0.4654, 0.5993, 0.9683],\n",
|
||||
" [ 1.8236, 0.4025, -0.6972, ..., 0.2430, 0.2536, -1.0889],\n",
|
||||
" [-0.0587, 0.1618, -0.2335, ..., 1.7609, 1.2664, -0.4452],\n",
|
||||
" ...,\n",
|
||||
" [ 2.0337, 1.3184, -1.3165, ..., -0.3303, 0.6572, 0.0884],\n",
|
||||
" [ 0.5752, -2.5594, -0.2393, ..., 1.3318, -1.4236, 0.4686],\n",
|
||||
" [ 1.0075, -2.4273, -0.4593, ..., 1.6660, 0.0359, 0.2927]],\n",
|
||||
"\n",
|
||||
" [[-1.8300, -0.3079, -1.6585, ..., 0.4859, 0.5652, 0.8072],\n",
|
||||
" [ 1.5461, -0.5666, -0.0330, ..., 0.5651, 0.2974, -1.0879],\n",
|
||||
" [-0.9060, 0.2700, -0.4585, ..., 2.0363, 1.2657, -0.7060],\n",
|
||||
" ...,\n",
|
||||
" [ 1.6688, 1.7038, -1.9549, ..., -0.2052, 0.6270, 0.4598],\n",
|
||||
" [ 0.0482, -2.3951, -0.4351, ..., 1.6230, -1.3662, -0.0390],\n",
|
||||
" [ 0.8146, -2.6169, -0.6188, ..., 1.4525, 0.0507, 0.5177]]],\n",
|
||||
" grad_fn=<NativeLayerNormBackward0>)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import torch\n",
|
||||
"from pathlib import Path\n",
|
||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||
"import Project_Model.Libs.BPE as BPE\n",
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"\n",
|
||||
"# set a fixed seed\n",
|
||||
"torch.manual_seed(0)\n",
|
||||
"random.seed(0)\n",
|
||||
"\n",
|
||||
"TEXT = \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
|
||||
"\n",
|
||||
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||
"\n",
|
||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||
"TOKENANO = BPE.TokeNanoCore(\n",
|
||||
" VOCABULARY,\n",
|
||||
" SPECIAL_VOC\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"TOKENIZATION = [TOKENANO.encode(TEXT), TOKENANO.encode(TEXT)]\n",
|
||||
"print(TOKENIZATION)\n",
|
||||
"\n",
|
||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
|
||||
"EMBEDDED_SIZE = 256\n",
|
||||
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
|
||||
"\n",
|
||||
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||
"tensor: torch.Tensor = EMBEDDER(TOKENIZATION)\n",
|
||||
"ENCODER = torch.nn.Sequential(\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
")\n",
|
||||
"print(len(TOKENIZATION))\n",
|
||||
"print(f\"Embedder Tensor: {tensor.shape}\")\n",
|
||||
"print(f\"Values:\\n{tensor}\")\n",
|
||||
"\n",
|
||||
"BATCH_SIZE, TOKENS, DIMENSIONS = tensor.shape\n",
|
||||
"PAD_MASK = torch.tensor([[True] * TOKENS] * BATCH_SIZE, dtype=torch.bool)\n",
|
||||
"tensor, _ = ENCODER((tensor, PAD_MASK))\n",
|
||||
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
|
||||
"print(f\"Values:\\n{tensor}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
221
Playgrounds/locistic_test.ipynb
Normal file
221
Playgrounds/locistic_test.ipynb
Normal file
@ -0,0 +1,221 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "c8741a8f",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"EPOCH 1\n",
|
||||
"\tLoss: 7.424792\n",
|
||||
"[0] \n",
|
||||
"[1] \n",
|
||||
"[2] \n",
|
||||
"[3] \n",
|
||||
"[4] \n",
|
||||
"[5] \n",
|
||||
"[6] \n",
|
||||
"[7] \n",
|
||||
"[8] \n",
|
||||
"[9] \n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"from pathlib import Path\n",
|
||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||
"import Project_Model.Libs.BPE as BPE\n",
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"import Project_Model.Libs.TorchShims as torch_shims\n",
|
||||
"from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"\n",
|
||||
"class LogitsCollector:\n",
|
||||
" def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:\n",
|
||||
" self.__pad_token = pad_token # used to skip PAD\n",
|
||||
" self.__end_token = end_token # used to stop at END\n",
|
||||
" self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str\n",
|
||||
" self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]\n",
|
||||
"\n",
|
||||
" def reset(self) -> None:\n",
|
||||
" self.__steps.clear() # clear history\n",
|
||||
"\n",
|
||||
" def add(self, logits_step: torch.Tensor) -> None:\n",
|
||||
" if logits_step.dim() == 3: # handle [B,T,V]: keep only the last position\n",
|
||||
" logits_step = logits_step[:, -1, :] # -> [B,V]\n",
|
||||
" self.__steps.append(logits_step.detach()) # store raw logits (detached)\n",
|
||||
"\n",
|
||||
" def tokens(self) -> list[list[int]]:\n",
|
||||
" if not self.__steps:\n",
|
||||
" return []\n",
|
||||
" stack = torch.stack(self.__steps, dim=0) # [T,B,V]\n",
|
||||
" probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]\n",
|
||||
" ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]\n",
|
||||
" out: list[list[int]] = []\n",
|
||||
" for row in ids.tolist():\n",
|
||||
" seq: list[int] = []\n",
|
||||
" for tok in row:\n",
|
||||
" if tok == self.__end_token: # stop on END\n",
|
||||
" break\n",
|
||||
" if tok == self.__pad_token: # skip PAD\n",
|
||||
" continue\n",
|
||||
" seq.append(tok)\n",
|
||||
" out.append(seq)\n",
|
||||
" return out\n",
|
||||
"\n",
|
||||
" def print_decoded(self) -> None:\n",
|
||||
" for i, seq in enumerate(self.tokens()):\n",
|
||||
" try:\n",
|
||||
" text = self.__tokenizer.decode(seq) # decode tokens to string\n",
|
||||
" except Exception:\n",
|
||||
" text = str(seq) # fallback to ids\n",
|
||||
" print(f\"[{i}] {text}\") # simple print\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# set a fixed seed\n",
|
||||
"torch.manual_seed(0)\n",
|
||||
"random.seed(0)\n",
|
||||
"DEVICE = torch_shims.get_default_device()\n",
|
||||
"torch.set_default_device(DEVICE)\n",
|
||||
"\n",
|
||||
"# BPE Init\n",
|
||||
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||
"\n",
|
||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||
"\n",
|
||||
"# Constants\n",
|
||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||
"EMBEDDED_SIZE = 256\n",
|
||||
"FEED_FORWARD_MULTIPLIER = 4\n",
|
||||
"ATTENTION_HEADS = 4\n",
|
||||
"SENTENCE_LENGTH = 256\n",
|
||||
"NUMBER_OF_BLOCKS = 2\n",
|
||||
"MAX_EPOCHS = int(1e3)\n",
|
||||
"\n",
|
||||
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||
"\n",
|
||||
"# Load CSV\n",
|
||||
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
||||
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
|
||||
"\n",
|
||||
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
|
||||
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
|
||||
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
|
||||
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
|
||||
"\n",
|
||||
"for index, row in TOY_DATASET.iterrows():\n",
|
||||
" RDFs: str = row[\"RDFs\"]\n",
|
||||
" Abstract: str = row[\"Abstract\"]\n",
|
||||
"\n",
|
||||
" input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
|
||||
" output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
|
||||
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
|
||||
"\n",
|
||||
" input_tokens, padding = Transformer.normalize_sequence(\n",
|
||||
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" ) # pad/trim + end token\n",
|
||||
" output_tokens, _ = Transformer.normalize_sequence(\n",
|
||||
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" ) # pad/trim + end token\n",
|
||||
" decoder_default_tokens = Transformer.pad_sequence(\n",
|
||||
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
|
||||
" ) # pad with PAD up to SENTENCE_LENGTH\n",
|
||||
"\n",
|
||||
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
||||
" TOY_BATCH_PADDING_LIST.append(padding)\n",
|
||||
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
|
||||
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
|
||||
"\n",
|
||||
"# Training loop\n",
|
||||
"LOSS_HISTORY = []\n",
|
||||
"NANOSOCRATES = Transformer.TrainingModel(\n",
|
||||
" TOKEN_SPACE_SIZE,\n",
|
||||
" EMBEDDED_SIZE,\n",
|
||||
" FEED_FORWARD_MULTIPLIER,\n",
|
||||
" ATTENTION_HEADS,\n",
|
||||
" NUMBER_OF_BLOCKS,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
|
||||
"\n",
|
||||
"NANOSOCRATES.train()\n",
|
||||
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
||||
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
|
||||
"scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # stepped once per optimizer step\n",
|
||||
"\n",
|
||||
"current_epoch = 0\n",
|
||||
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
|
||||
"\n",
|
||||
"while current_epoch < MAX_EPOCHS:\n",
|
||||
" # simple fixed mini-batch from the top; later you can shuffle/slice\n",
|
||||
" enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
|
||||
" pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
|
||||
" tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
|
||||
"\n",
|
||||
" # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here); it is filled in step by step below\n",
|
||||
" dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
|
||||
"\n",
|
||||
" total_loss = 0.0\n",
|
||||
" collector.reset() # start fresh for this epoch\n",
|
||||
"\n",
|
||||
" T = tgt.size(1) # sequence length\n",
|
||||
" for t in range(T):\n",
|
||||
" optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
|
||||
"\n",
|
||||
" prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
|
||||
" dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
|
||||
"\n",
|
||||
" # one-step logits given the prefix (the training model takes one 4-tuple: enc, pad, prefix, dec_pad_mask)\n",
|
||||
" logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t\n",
|
||||
" collector.add(logits_t) # store logits for decoding later\n",
|
||||
"\n",
|
||||
" loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored\n",
|
||||
" loss_t.backward() # backprop for this step\n",
|
||||
" optimizer.step() # update params\n",
|
||||
" scheduler.step() # Noam/warmup: step per optimizer step\n",
|
||||
"\n",
|
||||
" total_loss = float(loss_t.detach()) # NOTE: overwritten every step, so this holds only the last step's loss (not a sum)\n",
|
||||
"\n",
|
||||
" # teacher forcing: reveal the correct token for next position\n",
|
||||
" if t < T - 1:\n",
|
||||
" dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
|
||||
"\n",
|
||||
" current_epoch += 1\n",
|
||||
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
|
||||
" collector.print_decoded() # print decoded predictions for the batch\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
205
Playgrounds/model-teacher-forcing.ipynb
Normal file
205
Playgrounds/model-teacher-forcing.ipynb
Normal file
@ -0,0 +1,205 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "0afbf498",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"EPOCH 1\n",
|
||||
"\tLoss: 9.174470901489258\n",
|
||||
"EPOCH 2\n",
|
||||
"\tLoss: 9.20919132232666\n",
|
||||
"EPOCH 3\n",
|
||||
"\tLoss: 9.227106094360352\n",
|
||||
"EPOCH 4\n",
|
||||
"\tLoss: 9.172086715698242\n",
|
||||
"EPOCH 5\n",
|
||||
"\tLoss: 9.180150985717773\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "KeyboardInterrupt",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
|
||||
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
|
||||
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 116\u001b[39m\n\u001b[32m 113\u001b[39m step_target = target_logits[:, i] \u001b[38;5;66;03m# [B]\u001b[39;00m\n\u001b[32m 115\u001b[39m loss = cross_entropy(step_logits,step_target) \u001b[38;5;66;03m# now loss is without softmax\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m116\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\u001b[39;00m\n\u001b[32m 117\u001b[39m last_loss = loss\n\u001b[32m 118\u001b[39m optimizer.step()\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:638\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 595\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Computes the gradient of current tensor wrt graph leaves.\u001b[39;00m\n\u001b[32m 596\u001b[39m \n\u001b[32m 597\u001b[39m \u001b[33;03mThe graph is differentiated using the chain rule. If the tensor is\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 635\u001b[39m \u001b[33;03m used to compute the :attr:`tensors`. Defaults to ``None``.\u001b[39;00m\n\u001b[32m 636\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhandle_torch_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43mTensor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 641\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 642\u001b[39m \u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 643\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 644\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m 
\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 647\u001b[39m torch.autograd.backward(\n\u001b[32m 648\u001b[39m \u001b[38;5;28mself\u001b[39m, gradient, retain_graph, create_graph, inputs=inputs\n\u001b[32m 649\u001b[39m )\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/overrides.py:1725\u001b[39m, in \u001b[36mhandle_torch_function\u001b[39m\u001b[34m(public_api, relevant_args, *args, **kwargs)\u001b[39m\n\u001b[32m 1721\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _is_torch_function_mode_enabled():\n\u001b[32m 1722\u001b[39m \u001b[38;5;66;03m# if we're here, the mode must be set to a TorchFunctionStackMode\u001b[39;00m\n\u001b[32m 1723\u001b[39m \u001b[38;5;66;03m# this unsets it and calls directly into TorchFunctionStackMode's torch function\u001b[39;00m\n\u001b[32m 1724\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _pop_mode_temporarily() \u001b[38;5;28;01mas\u001b[39;00m mode:\n\u001b[32m-> \u001b[39m\u001b[32m1725\u001b[39m result = \u001b[43mmode\u001b[49m\u001b[43m.\u001b[49m\u001b[43m__torch_function__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpublic_api\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1726\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m:\n\u001b[32m 1727\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/utils/_device.py:103\u001b[39m, in \u001b[36mDeviceContext.__torch_function__\u001b[39m\u001b[34m(self, func, types, args, kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m _device_constructors() \u001b[38;5;129;01mand\u001b[39;00m kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 102\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mself\u001b[39m.device\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:647\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 639\u001b[39m Tensor.backward,\n\u001b[32m 640\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 645\u001b[39m inputs=inputs,\n\u001b[32m 646\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m647\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/__init__.py:354\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 349\u001b[39m retain_graph = create_graph\n\u001b[32m 351\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m354\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
|
||||
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/graph.py:829\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 827\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 828\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m829\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 830\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 831\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 832\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 833\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
|
||||
"\u001b[31mKeyboardInterrupt\u001b[39m: "
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"from pathlib import Path\n",
|
||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||
"import Project_Model.Libs.BPE as BPE\n",
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"import Project_Model.Libs.TorchShims as torch_shims\n",
|
||||
"\n",
|
||||
"# set a fixed seed\n",
|
||||
"torch.manual_seed(0)\n",
|
||||
"random.seed(0)\n",
|
||||
"DEVICE = torch_shims.get_default_device()\n",
|
||||
"torch.set_default_device(DEVICE)\n",
|
||||
"\n",
|
||||
"# set a default device\n",
|
||||
"\n",
|
||||
"# BPE Init\n",
|
||||
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||
"\n",
|
||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Constants\n",
|
||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||
"EMBEDDED_SIZE = 256\n",
|
||||
"FEED_FORWARD_MULTIPLIER = 4\n",
|
||||
"ATTENTION_HEADS = 4\n",
|
||||
"SENTENCE_LENGTH = 256\n",
|
||||
"NUMBER_OF_BLOCKS = 2\n",
|
||||
"MAX_EPOCHS = int(1e3)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load CSV\n",
|
||||
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
||||
"\n",
|
||||
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
|
||||
"\n",
|
||||
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
|
||||
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
|
||||
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
|
||||
"TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for index, row in TOY_DATASET.iterrows():\n",
|
||||
"\n",
|
||||
" RDFs: str = row[\"RDFs\"]\n",
|
||||
" Abstract: str = row[\"Abstract\"]\n",
|
||||
"\n",
|
||||
" input_tokens = TOKENANO.encode(RDFs)\n",
|
||||
" output_tokens = TOKENANO.encode(Abstract)[1:]\n",
|
||||
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
|
||||
"\n",
|
||||
" input_tokens, padding = Transformer.normalize_sequence(\n",
|
||||
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" )\n",
|
||||
" output_tokens, _ = Transformer.normalize_sequence(\n",
|
||||
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" )\n",
|
||||
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
|
||||
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
||||
" TOY_BATCH_PADDING_LIST.append(padding)\n",
|
||||
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
|
||||
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
|
||||
"\n",
|
||||
"# Training loop\n",
|
||||
"LOSS_HISTORY = []\n",
|
||||
"NANOSOCRATES = Transformer.TrainingModel(\n",
|
||||
" TOKEN_SPACE_SIZE,\n",
|
||||
" EMBEDDED_SIZE,\n",
|
||||
" FEED_FORWARD_MULTIPLIER,\n",
|
||||
" ATTENTION_HEADS,\n",
|
||||
" NUMBER_OF_BLOCKS\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"NANOSOCRATES.train() # nothing important, activates dropout etc \n",
|
||||
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
||||
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
|
||||
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
|
||||
"\n",
|
||||
"last_loss = 0\n",
|
||||
"\n",
|
||||
"current_epoch = 0\n",
|
||||
"while current_epoch < MAX_EPOCHS:\n",
|
||||
"\n",
|
||||
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
|
||||
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
|
||||
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
|
||||
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]]) # Transform target into logits\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad() # to clear gradient\n",
|
||||
"\n",
|
||||
" last_loss = 0.0\n",
|
||||
"\n",
|
||||
" for i in range(0, SENTENCE_LENGTH):\n",
|
||||
"\n",
|
||||
" # optimizer.zero_grad()\n",
|
||||
" # forward \n",
|
||||
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
|
||||
" # probabilities = torch.softmax(logits,2)\n",
|
||||
" \n",
|
||||
"\n",
|
||||
" step_logits = logits[:, i, :] # [B, V]\n",
|
||||
" step_target = target_logits[:, i] # [B]\n",
|
||||
"\n",
|
||||
" loss = cross_entropy(step_logits,step_target) # now loss is without softmax\n",
|
||||
" loss.backward() # DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\n",
|
||||
" last_loss = loss\n",
|
||||
" optimizer.step()\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
" scheduler.step()\n",
|
||||
" \n",
|
||||
" probabilities = torch.softmax(logits,2)\n",
|
||||
" most_probable_tokens = torch.argmax(probabilities, 2) \n",
|
||||
" if i < SENTENCE_LENGTH - 1:\n",
|
||||
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" current_epoch += 1\n",
|
||||
"\n",
|
||||
" if current_epoch % 1 == 0:\n",
|
||||
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
157
Playgrounds/nanosocrates-sanity-check.ipynb
Normal file
157
Playgrounds/nanosocrates-sanity-check.ipynb
Normal file
@ -0,0 +1,157 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "f5762da9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"torch.Size([3, 17, 7714])\n",
|
||||
"torch.Size([3, 17])\n",
|
||||
"tensor([[2034, 6523, 5406, 3985, 5406, 6523, 2034, 2034, 5745, 643, 5406, 7405,\n",
|
||||
" 6523, 6230, 6419, 5745, 657],\n",
|
||||
" [2458, 830, 5745, 5745, 5406, 3741, 2034, 5745, 6302, 6419, 5406, 2411,\n",
|
||||
" 719, 830, 5745, 3189, 2775],\n",
|
||||
" [2034, 5745, 5327, 4696, 6523, 643, 6419, 1671, 6302, 4406, 5745, 643,\n",
|
||||
" 643, 1901, 1914, 1914, 719]])\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import torch\n",
|
||||
"from pathlib import Path\n",
|
||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||
"import Project_Model.Libs.BPE as BPE\n",
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"\n",
|
||||
"# set a fixed seed\n",
|
||||
"torch.manual_seed(0)\n",
|
||||
"random.seed(0)\n",
|
||||
"\n",
|
||||
"# BPE Init\n",
|
||||
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||
"\n",
|
||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Constants\n",
|
||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||
"EMBEDDED_SIZE = 256\n",
|
||||
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Model Init\n",
|
||||
"ENCODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||
"DECODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
|
||||
"\n",
|
||||
"ENCODER = torch.nn.Sequential(\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"DECODER = torch.nn.Sequential(\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"DETOKENER = Transformer.DeToken(\n",
|
||||
" EMBEDDED_SIZE,\n",
|
||||
" TOKEN_SPACE_SIZE\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Data\n",
|
||||
"TEXT = (\n",
|
||||
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
|
||||
")\n",
|
||||
"OUT_TEXT = \"<START>\"\n",
|
||||
"\n",
|
||||
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||
"\n",
|
||||
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
|
||||
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
|
||||
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
|
||||
"\n",
|
||||
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
|
||||
"\n",
|
||||
"BATCH_LEN = 3\n",
|
||||
"\n",
|
||||
"INPUT_TOKENIZATION = [\n",
|
||||
" EN_IN\n",
|
||||
"] * BATCH_LEN\n",
|
||||
"OUTPUT_TOKENIZATION = [\n",
|
||||
" DEC_IN\n",
|
||||
"] * BATCH_LEN\n",
|
||||
"\n",
|
||||
"encoder_tensor_input = ENCODER_EMBEDDER(INPUT_TOKENIZATION)\n",
|
||||
"encoder_padding_mask = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
|
||||
"\n",
|
||||
"encoder_output, _ = ENCODER((encoder_tensor_input, encoder_padding_mask))\n",
|
||||
"\n",
|
||||
"decoder_tensor_input = DECODER_EMBEDDER(OUTPUT_TOKENIZATION)\n",
|
||||
"decoder_padding_mask = torch.tensor([[False] * MAX_LEN] * BATCH_LEN)\n",
|
||||
"\n",
|
||||
"decoder_output, _, _, _ = DECODER((decoder_tensor_input, encoder_output, encoder_output, None))\n",
|
||||
"\n",
|
||||
"logits: torch.Tensor = DETOKENER(decoder_output)\n",
|
||||
"\n",
|
||||
"print(logits.shape)\n",
|
||||
"\n",
|
||||
"# print(logits)\n",
|
||||
"\n",
|
||||
"most_probable_tokens = torch.argmax(logits, 2)\n",
|
||||
"\n",
|
||||
"print(most_probable_tokens.shape)\n",
|
||||
"print(most_probable_tokens)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
197
Playgrounds/nanosocrates-train-toy.ipynb
Normal file
197
Playgrounds/nanosocrates-train-toy.ipynb
Normal file
@ -0,0 +1,197 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "adbd9598",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
|
||||
" return func(*args, **kwargs)\n",
|
||||
"252.87s - name 'tensor' is not defined\n",
|
||||
"Traceback (most recent call last):\n",
|
||||
" File \"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\debugpy\\_vendored\\pydevd\\_pydevd_bundle\\pydevd_vars.py\", line 636, in change_attr_expression\n",
|
||||
" value = eval(expression, frame.f_globals, frame.f_locals)\n",
|
||||
" File \"<string>\", line 1, in <module>\n",
|
||||
"NameError: name 'tensor' is not defined\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel."
|
||||
]
|
||||
},
|
||||
{
|
||||
"ename": "",
|
||||
"evalue": "",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel. \n",
|
||||
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import random\n",
|
||||
"import torch\n",
|
||||
"import pandas as pd\n",
|
||||
"from pathlib import Path\n",
|
||||
"import Project_Model.Libs.Embedder as Embedder\n",
|
||||
"import Project_Model.Libs.BPE as BPE\n",
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"import Project_Model.Libs.TorchShims as torch_shims\n",
|
||||
"\n",
|
||||
"# set a fixed seed\n",
|
||||
"torch.manual_seed(0)\n",
|
||||
"random.seed(0)\n",
|
||||
"DEVICE = torch_shims.get_default_device()\n",
|
||||
"torch.set_default_device(DEVICE)\n",
|
||||
"\n",
|
||||
"# set a default device\n",
|
||||
"\n",
|
||||
"# BPE Init\n",
|
||||
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
|
||||
"SPECIAL_VOC = BPE.default_special_tokens()\n",
|
||||
"\n",
|
||||
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
|
||||
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Constants\n",
|
||||
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
|
||||
"EMBEDDED_SIZE = 256\n",
|
||||
"FEED_FORWARD_MULTIPLIER = 4\n",
|
||||
"ATTENTION_HEADS = 4\n",
|
||||
"SENTENCE_LENGTH = 256\n",
|
||||
"NUMBER_OF_BLOCKS = 2\n",
|
||||
"MAX_EPOCHS = int(1e3)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
|
||||
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"# Load CSV\n",
|
||||
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
|
||||
"\n",
|
||||
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
|
||||
"\n",
|
||||
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
|
||||
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
|
||||
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
|
||||
"TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"for index, row in TOY_DATASET.iterrows():\n",
|
||||
"\n",
|
||||
" RDFs: str = row[\"RDFs\"]\n",
|
||||
" Abstract: str = row[\"Abstract\"]\n",
|
||||
"\n",
|
||||
" input_tokens = TOKENANO.encode(RDFs)\n",
|
||||
" output_tokens = TOKENANO.encode(Abstract)[1:]\n",
|
||||
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
|
||||
"\n",
|
||||
" input_tokens, padding = Transformer.normalize_sequence(\n",
|
||||
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" )\n",
|
||||
" output_tokens, _ = Transformer.normalize_sequence(\n",
|
||||
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" )\n",
|
||||
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
|
||||
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
|
||||
" TOY_BATCH_PADDING_LIST.append(padding)\n",
|
||||
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
|
||||
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
|
||||
"\n",
|
||||
"# Training loop\n",
|
||||
"LOSS_HISTORY = []\n",
|
||||
"NANOSOCRATES = Transformer.TrainingModel(\n",
|
||||
" TOKEN_SPACE_SIZE,\n",
|
||||
" EMBEDDED_SIZE,\n",
|
||||
" FEED_FORWARD_MULTIPLIER,\n",
|
||||
" ATTENTION_HEADS,\n",
|
||||
" NUMBER_OF_BLOCKS\n",
|
||||
")\n",
|
||||
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
|
||||
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
|
||||
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
|
||||
"last_loss = 0\n",
|
||||
"current_epoch = 0\n",
|
||||
"\n",
|
||||
"while current_epoch < MAX_EPOCHS:\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
|
||||
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
|
||||
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
|
||||
"\n",
|
||||
" # Transform target into logits\n",
|
||||
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]])\n",
|
||||
"\n",
|
||||
" last_loss = 0\n",
|
||||
"\n",
|
||||
" for i in range(0, SENTENCE_LENGTH):\n",
|
||||
"\n",
|
||||
" optimizer.zero_grad()\n",
|
||||
"\n",
|
||||
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
|
||||
"\n",
|
||||
" most_probable_tokens = torch.argmax(logits, 2)\n",
|
||||
"\n",
|
||||
" logits = logits[:,i,:]\n",
|
||||
"\n",
|
||||
" loss = cross_entropy(logits, target_logits[:,i])\n",
|
||||
" last_loss = loss\n",
|
||||
" optimizer.step()\n",
|
||||
" scheduler.step()\n",
|
||||
"\n",
|
||||
" if i < SENTENCE_LENGTH - 1:\n",
|
||||
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" current_epoch += 1\n",
|
||||
"\n",
|
||||
" if current_epoch % 1 == 0:\n",
|
||||
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
112
Playgrounds/prova.ipynb
Normal file
112
Playgrounds/prova.ipynb
Normal file
@ -0,0 +1,112 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "4ae47336",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"B, T, D = 4, 7, 32\n",
|
||||
"x = torch.randn(B, T, D)\n",
|
||||
"attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
|
||||
"pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
|
||||
"mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
|
||||
"y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "e38e3fb5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"tensor([[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
|
||||
" [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],\n",
|
||||
"\n",
|
||||
" [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
|
||||
" [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]])"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"torch.nn.functional.one_hot(torch.tensor([\n",
|
||||
" [4, 1, 9],\n",
|
||||
" [2,4,5]\n",
|
||||
"]))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "7119ad53",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"device(type='cpu')"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"torch.get_default_device()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "8c95691a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"xpu\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from Project_Model.Libs.TorchShims import get_default_device\n",
|
||||
"\n",
|
||||
"print(get_default_device())"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
60
Playgrounds/sanity-check-pytorch.ipynb
Normal file
60
Playgrounds/sanity-check-pytorch.ipynb
Normal file
@ -0,0 +1,60 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "dd23cc94",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Current detected architecture is: xpu\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import torch\n",
|
||||
"from Project_Model.Libs.TorchShims import get_default_device\n",
|
||||
"\n",
|
||||
"DEVICE = get_default_device()\n",
|
||||
"\n",
|
||||
"print(f\"Current detected architecture is: {DEVICE.type}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6584882e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import Project_Model.Libs.Transformer as Transformer\n",
|
||||
"DECODER = Transformer.Decoder(256, 1024, 4)\n",
|
||||
"print()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
0
Playgrounds/trainer.ipynb
Normal file
0
Playgrounds/trainer.ipynb
Normal file
4
Project_Model/Libs/BPE/Classes/Encoder.py
Normal file
4
Project_Model/Libs/BPE/Classes/Encoder.py
Normal file
@ -0,0 +1,4 @@
|
||||
from abc import ABC
|
||||
|
||||
class Encoder(ABC):
    """Marker base class shared by the tokenizer encoders of this package.

    It declares no attributes and no abstract methods, so it only serves as
    a common ancestor; each concrete encoder provides its own API.
    """
|
||||
164
Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py
Normal file
164
Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py
Normal file
@ -0,0 +1,164 @@
|
||||
from collections import deque
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
import re
|
||||
from ..Classes import (
|
||||
NanoSocratesBPE,
|
||||
NanoSocratesChunker,
|
||||
NanoSocratesSplitter,
|
||||
NanoSocratesBatchMemoryBPE,
|
||||
)
|
||||
from ..Enums import TokenType
|
||||
from ..Utils import (
|
||||
special_regex_maker,
|
||||
iterator_with_checks,
|
||||
save_nanos_vocabulary,
|
||||
load_nanos_vocabulary,
|
||||
save_json,
|
||||
load_json,
|
||||
)
|
||||
|
||||
|
||||
class NanoSocraTraineRam:
    """Train a ``NanoSocratesBPE`` while keeping the whole corpus in memory.

    The corpus file is read once, split into BPE pieces and turned into
    lists of UTF-8 byte values. Every round feeds all the pieces to the BPE,
    which may learn one new token at the end of the round.
    """

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        """Configure the trainer.

        Args:
            max_vocabulary (int): overall vocabulary budget; the 256 byte
                tokens and the special tokens are subtracted from it.
            special_vocabulary (list[str]): special tokens, used to build the
                regex that separates them from BPE text.
            merge_treshold (int): minimum frequency a couple must reach to be
                merged into a new token.
            max_iterations (int): round at which training stops; ``0`` never
                matches the 1-based round counter, so it means "no limit".
            print_after_iterations (int): print a report every this many
                rounds.
        """
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        bpe: "NanoSocratesBPE | None" = None,
    ) -> "NanoSocratesBPE":
        """Run training rounds on the corpus stored at ``path``.

        Training stops when a round learns nothing new, when
        ``max_iterations`` is reached or when the vocabulary size equals the
        configured budget.

        Args:
            path (Path): corpus file.
            bpe (NanoSocratesBPE | None): tokenizer to keep training; a fresh
                one is created when omitted.

        Returns:
            NanoSocratesBPE: the trained tokenizer.

        Raises:
            FileNotFoundError: if ``path`` is not a file.
        """
        if not path.is_file():
            raise FileNotFoundError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        current_iteration = 0
        data = self.__gather_data_from_file(path)

        while True:
            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size
            _, data, last_memory = self.__round_train(BPE, data)
            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:
                DELIMITER = "==============="
                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            # Nothing learnt in this round: further rounds cannot help.
            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                break

            if current_iteration == self.__max_iterations:
                break

            if BPE.vocabulary_size == self.__max_vocabulary:
                break

        return BPE

    def __round_train(self, bpe: "NanoSocratesBPE", data: list[list[int]]):
        """Feed every piece of ``data`` to ``bpe`` for one round.

        ``data`` is consumed (emptied) from its end. Pieces whose encoding is
        shorter than two tokens cannot produce couples any more and are
        dropped from the returned data.

        Returns:
            tuple: ``(bpe, new_data, memory)`` with the encoded pieces and the
            frequencies collected during the round.
        """
        DATA_LEN = len(data)
        NEW_DATA = []

        counter = 0
        # Honour the configured threshold, as the sibling trainers do
        # (it used to be hard-coded to 0 here).
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        while len(data) > 0:
            counter += 1
            # The BPE learns its new token only on the last piece.
            last_batch = len(data) == 1

            piece = data.pop()
            bpe, memory, output = bpe.fit(piece, memory, last_batch)

            if counter % int(1E6) == 0:
                print(f"Fitted: {counter}/{DATA_LEN}")

            if len(output) < 2:
                continue

            NEW_DATA.append(output)

        return (bpe, NEW_DATA, memory)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:
        """Read the corpus and return its BPE pieces as lists of byte values."""
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
        DATA: list[list[int]] = []

        # ``with`` guarantees the handle is released even if reading fails.
        with open(path, "r", encoding="utf-8") as corpus_file:
            file_string = corpus_file.read()

        for piece, token_type in SPLITTER.split_text(file_string):
            # Special tokens are not part of the BPE training material.
            if token_type != TokenType.BPE:
                continue

            DATA.append(self.__make_list_ids(piece))

        return DATA

    def __increment_counter(self, counter: int):
        """Return ``counter + 1``.

        Python integers have arbitrary precision, so no overflow handling is
        needed.
        """
        return counter + 1

    def __make_list_ids(self, corpus: str):
        """Return the UTF-8 bytes of ``corpus`` as a list of integers."""
        return list(corpus.encode("utf-8"))
|
||||
248
Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py
Normal file
248
Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py
Normal file
@ -0,0 +1,248 @@
|
||||
from collections import deque
|
||||
import datetime
|
||||
from pathlib import Path
|
||||
import re
|
||||
from ..Classes import (
|
||||
NanoSocratesBPE,
|
||||
NanoSocratesChunker,
|
||||
NanoSocratesSplitter,
|
||||
NanoSocratesBatchMemoryBPE,
|
||||
)
|
||||
from ..Enums import TokenType
|
||||
from ..Utils import (
|
||||
special_regex_maker,
|
||||
iterator_with_checks,
|
||||
save_nanos_vocabulary,
|
||||
load_nanos_vocabulary,
|
||||
save_json,
|
||||
load_json,
|
||||
)
|
||||
|
||||
|
||||
class NanoSocraTrainer:
    """Train a ``NanoSocratesBPE`` streaming the corpus through cache files.

    Each round reads its input in chunks, writes the encoded result to one of
    two alternating cache files and uses that file as input of the next
    round. The vocabulary and a small status record are saved after every
    round so that training can be resumed.
    """

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        chunk_size: int,
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        """Configure the trainer.

        Args:
            max_vocabulary (int): overall vocabulary budget; the 256 byte
                tokens and the special tokens are subtracted from it.
            special_vocabulary (list[str]): special tokens, used to build the
                regex that separates them from BPE text.
            chunk_size (int): size handed to ``NanoSocratesChunker``.
            merge_treshold (int): minimum frequency a couple must reach to be
                merged into a new token.
            max_iterations (int): round at which training stops; ``0`` never
                matches the 1-based round counter, so it means "no limit".
            print_after_iterations (int): print a report every this many
                rounds.
        """
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__chunk_size = chunk_size
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        cache_dir: Path,
        bpe: "NanoSocratesBPE | None" = None,
        resume_from_iter: int = 0,
    ) -> "NanoSocratesBPE":
        """Run training rounds on the corpus stored at ``path``.

        Args:
            path (Path): corpus file.
            cache_dir (Path): existing directory for the intermediate files.
            bpe (NanoSocratesBPE | None): tokenizer to keep training; a fresh
                one is created when omitted.
            resume_from_iter (int): last completed round to resume from;
                ``0`` starts from scratch.

        Returns:
            NanoSocratesBPE: the trained tokenizer.

        Raises:
            FileNotFoundError: if ``path`` is not a file.
            NotADirectoryError: if ``cache_dir`` is not a directory.
        """
        if not path.is_file():
            raise FileNotFoundError()

        if not cache_dir.is_dir():
            raise NotADirectoryError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        cached = False
        current_iteration = 0
        input_path = path

        NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0

        PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
        MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)

        if resume_from_iter != 0:
            cached = True
            current_iteration = resume_from_iter
            input_path = next(PATH_GEN)
            # UGLY: fixes a bug immediately, unfortunately
            _, _ = next(MEMORY_PATH_GEN)
            _, voc_cache_path = next(MEMORY_PATH_GEN)
            vocabulary = load_nanos_vocabulary(voc_cache_path)
            BPE = NanoSocratesBPE(vocabulary)

        while True:
            out_path = next(PATH_GEN)
            internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)

            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            # The cache is read back as UTF-8 by the chunker, so it must be
            # written as UTF-8 too; ``with`` closes it even on failure.
            with open(out_path, "w", encoding="utf-8") as out_file:
                for _, memory, output in self.__round_train(input_path, BPE, cached):
                    last_memory = memory
                    out_file.write(output)

            internal_cache = {
                "finished_iter": current_iteration,
                "read_from": f"{input_path}",
                "wrote_to": f"{out_path}",
                "at": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S.%f"
                )[:-3],
            }

            VOCABULARY = BPE.vocabulary

            save_json(internal_cache, internal_cache_path)
            save_nanos_vocabulary(VOCABULARY, vocabulary_cache)

            # From now on the input is a cache file holding token-id lists.
            cached = True
            input_path = out_path

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:
                DELIMITER = "==============="
                # No piece at all means no memory was produced this round.
                FREQUENCIES = last_memory.frequencies if last_memory is not None else {}

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{FREQUENCIES}\n",
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            # Nothing learnt in this round: further rounds cannot help.
            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                break

            if current_iteration == self.__max_iterations:
                break

            if BPE.vocabulary_size == self.__max_vocabulary:
                break

        return BPE

    def __round_train(self, path: Path, bpe: "NanoSocratesBPE", cached: bool):
        """Yield ``(bpe, memory, text)`` for every piece found in ``path``.

        ``text`` is what has to be written to the next cache file: special
        tokens are passed through verbatim, BPE pieces are replaced by the
        string form of their encoded token-id list.
        """
        CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        BPE = bpe
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))

        for chunk, last_chunk in CHUNKER_GENERATOR:

            PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))

            for piece, last_piece in PIECE_GENERATOR:

                LAST_BATCH = last_chunk and last_piece
                PIECE, TOKEN_TYPE = piece

                if TOKEN_TYPE != TokenType.BPE:
                    # Still call fit so that the last batch can trigger the
                    # learning step even when it is a special token.
                    BPE.fit([], memory, LAST_BATCH)
                    yield (BPE, memory, PIECE)
                    continue

                PIECE_DATA = self.__make_list_ids(PIECE, cached)

                _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)

                OUT_STRING = f"{out}"
                yield (BPE, memory, OUT_STRING)

    def __increment_counter(self, counter: int):
        """Return ``counter + 1``.

        Python integers have arbitrary precision, so no overflow handling is
        needed.
        """
        return counter + 1

    def __make_list_ids(self, corpus: str, cached: bool):
        """Convert a piece of text into a list of integers.

        Args:
            corpus (str): raw text, or the string form of an integer list
                (``"[1, 2, 3]"``) when ``cached`` is true.
            cached (bool): whether ``corpus`` comes from a cache file.

        Returns:
            list[int]: UTF-8 byte values or the parsed token ids.
        """
        if not cached:
            return list(corpus.encode("utf-8"))

        # Skip these chars "[" "]"
        INTS = corpus[1 : len(corpus) - 1]

        # An empty list ("[]") has nothing to parse; int("") would raise.
        if not INTS.strip():
            return []

        return [int(item) for item in INTS.split(",")]

    def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):
        """Endlessly alternate between the two corpus cache files.

        The sequence starts from ``corpus-tmp1.txt`` unless
        ``initial_iteration`` is odd, in which case it starts from
        ``corpus-tmp2.txt``. ``path`` is accepted but not used.
        """
        CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
        CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"

        switch = initial_iteration % 2 != 1

        while True:
            yield CORPUS_TMP_1 if switch else CORPUS_TMP_2
            switch = not switch

    def __switch_memory(self, cache_path: Path, initial_iteration: int):
        """Endlessly alternate between the two (status, vocabulary) file pairs.

        The sequence starts from the ``tmp2`` pair unless
        ``initial_iteration`` is odd, in which case it starts from ``tmp1``.
        """
        INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
        INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"

        VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
        VOCAB_TMP_2 = cache_path / "voc-tmp2.json"

        switch = initial_iteration % 2 == 1

        while True:
            if switch:
                yield (INTERNAL_TMP_1, VOCAB_TMP_1)
            else:
                yield (INTERNAL_TMP_2, VOCAB_TMP_2)
            switch = not switch
|
||||
280
Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py
Normal file
280
Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py
Normal file
@ -0,0 +1,280 @@
|
||||
from collections import deque
|
||||
import datetime
|
||||
import itertools
|
||||
from multiprocessing import Pool
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
import time
|
||||
from ..Classes import (
|
||||
NanoSocratesBPE,
|
||||
NanoSocratesChunker,
|
||||
NanoSocratesSplitter,
|
||||
NanoSocratesBatchMemoryBPE,
|
||||
)
|
||||
from ..Enums import TokenType
|
||||
from ..Utils import (
|
||||
special_regex_maker,
|
||||
iterator_with_checks,
|
||||
save_nanos_vocabulary,
|
||||
load_nanos_vocabulary,
|
||||
save_json,
|
||||
load_json,
|
||||
)
|
||||
|
||||
|
||||
def split(a, n):
    """Lazily cut the sequence ``a`` into ``n`` contiguous slices.

    The slices differ in length by at most one element: the first
    ``len(a) % n`` slices receive the extra element.

    Args:
        a: a sliceable sequence with a length.
        n (int): number of slices to produce.

    Returns:
        A generator yielding the ``n`` slices in order.
    """
    base, extra = divmod(len(a), n)

    def _bounds(position):
        # Every slice before ``position`` that got an extra element shifts
        # the start by one.
        begin = position * base + min(position, extra)
        finish = begin + base + (1 if position < extra else 0)
        return begin, finish

    return (a[slice(*_bounds(position))] for position in range(n))
|
||||
|
||||
|
||||
def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):
    """Worker job: collect couple frequencies over one share of the data.

    Args:
        object: ``(bpe, data)``; ``data`` is consumed (emptied) from its end.

    Returns:
        tuple: ``(bpe, kept, memory)`` where ``kept`` holds the pieces whose
        encoding still has at least two tokens and ``memory`` holds the
        frequencies collected by this job.
    """
    tokenizer, pending = object

    kept: list[list[int]] = []
    job_memory = NanoSocratesBatchMemoryBPE({}, 0)

    while pending:
        raw_piece = pending.pop()

        # ``False``: never the last batch, so the worker only counts and
        # does not learn a new token.
        tokenizer, job_memory, encoded = tokenizer.fit(raw_piece, job_memory, False)

        if len(encoded) >= 2:
            # NOTE(review): the un-encoded piece is kept, not ``encoded`` --
            # confirm this is intended.
            # We are sure of its type
            kept.append(raw_piece)  # type: ignore

    return (tokenizer, kept, job_memory)
|
||||
|
||||
def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):
    """Worker job: keep only the pieces that still encode to 2+ tokens.

    Args:
        object: ``(bpe, data)`` with the tokenizer and the pieces to check.

    Returns:
        list[list[int]]: the original (not encoded) pieces whose encoding
        with ``bpe`` is at least two tokens long, in their original order.
    """
    tokenizer, pieces = object

    # We are sure of its type
    return [
        piece
        for piece in pieces
        if len(tokenizer.encode_intermediate(piece)) >= 2
    ]
|
||||
|
||||
class NanoSocraTrainerPool:
    """Train a ``NanoSocratesBPE`` spreading each round over a process pool.

    The corpus is kept in memory; every round splits it in as many shares as
    available CPUs, lets the workers count couple frequencies, merges the
    counts and finally lets the BPE learn one new token.
    """

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        """Configure the trainer.

        Args:
            max_vocabulary (int): overall vocabulary budget; the 256 byte
                tokens and the special tokens are subtracted from it.
            special_vocabulary (list[str]): special tokens, used to build the
                regex that separates them from BPE text.
            merge_treshold (int): minimum frequency a couple must reach to be
                merged into a new token.
            max_iterations (int): round at which training stops; ``0`` never
                matches the 1-based round counter, so it means "no limit".
            print_after_iterations (int): print a report every this many
                rounds.
        """
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    # TODO: add a resume function
    def trainBPE(
        self,
        path: Path,
        cache_file: Path,
        bpe: "NanoSocratesBPE | None" = None,
    ) -> "NanoSocratesBPE":
        """Run training rounds on the corpus stored at ``path``.

        The vocabulary is saved to ``cache_file`` after every round.

        Args:
            path (Path): corpus file.
            cache_file (Path): file receiving the vocabulary; it is created
                when missing.
            bpe (NanoSocratesBPE | None): tokenizer to keep training; a fresh
                one is created when omitted.

        Returns:
            NanoSocratesBPE: the trained tokenizer.

        Raises:
            FileNotFoundError: if ``path`` is not a file.
        """
        if not path.is_file():
            raise FileNotFoundError()

        if not cache_file.is_file():
            # Create the (empty) cache file up front.
            with cache_file.open("w"):
                pass

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size >= self.__max_vocabulary:
            return BPE

        current_iteration = 0
        data = self.__gather_data_from_file(path)
        data = self.__encode_from_cache(BPE, data)

        while True:
            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            start = time.time_ns()
            _, data, _ = self.__round_train(BPE, data)
            end = time.time_ns()
            NEW_VOC_SIZE = BPE.vocabulary_size

            VOCABULARY = BPE.vocabulary

            save_nanos_vocabulary(VOCABULARY, cache_file)

            if current_iteration % self.__print_after_iterations == 0:
                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
                        f"\tTime elapsed: {(end - start)/1E9}s",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            # Nothing learnt in this round: further rounds cannot help.
            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                break

            if current_iteration == self.__max_iterations:
                break

            if BPE.vocabulary_size == self.__max_vocabulary:
                break

        return BPE

    def __worker_count(self) -> int:
        """Return the number of CPUs usable by this process.

        Raises:
            RuntimeError: if the platform cannot report it.
        """
        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise RuntimeError("Unable to determine the number of usable CPUs")

        return CPU_COUNT

    def __round_train(self, bpe: "NanoSocratesBPE", data: list[list[int]]):
        """Run one parallel round and let ``bpe`` learn one new token.

        Returns:
            tuple: ``(bpe, new_data, memory)`` with the pieces kept by the
            workers and the merged couple frequencies.
        """
        NEW_DATA: list[list[int]] = []
        MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CPU_COUNT = self.__worker_count()
        VOCABULARY = bpe.vocabulary

        # Every job gets its own tokenizer built from the current vocabulary.
        JOBS = [
            (NanoSocratesBPE(VOCABULARY), chunk) for chunk in split(data, CPU_COUNT)
        ]

        JOB_RESULTS: list[
            tuple["NanoSocratesBPE", list[list[int]], "NanoSocratesBatchMemoryBPE"]
        ]

        # One worker per job, sized like the data split above.
        with Pool(processes=CPU_COUNT) as pool:
            JOB_RESULTS = pool.map(split_fit, JOBS)

        for i, (_, job_output, job_memory) in enumerate(JOB_RESULTS):
            NEW_DATA.extend(job_output)

            # Sum the per-job frequencies into the round memory.
            for key, value in job_memory.frequencies.items():
                MEMORY.frequencies[key] = MEMORY.frequencies.get(key, 0) + value

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        # Get new token
        bpe.fit([], MEMORY, True)

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return (bpe, NEW_DATA, MEMORY)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:
        """Read the corpus and return its BPE pieces as lists of byte values."""
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
        DATA: list[list[int]] = []

        # ``with`` guarantees the handle is released even if reading fails.
        with open(path, "r", encoding="utf-8") as corpus_file:
            file_string = corpus_file.read()

        for piece, token_type in SPLITTER.split_text(file_string):
            # Special tokens are not part of the BPE training material.
            if token_type != TokenType.BPE:
                continue

            DATA.append(self.__make_list_ids(piece))

        return DATA

    def __encode_from_cache(self, bpe: "NanoSocratesBPE", data: list[list[int]]):
        """Filter ``data`` in parallel through ``split_encode``.

        Returns:
            list[list[int]]: the concatenation of what every job returned.
        """
        NEW_DATA: list[list[int]] = []

        CPU_COUNT = self.__worker_count()
        VOCABULARY = bpe.vocabulary

        JOBS = [
            (NanoSocratesBPE(VOCABULARY), chunk) for chunk in split(data, CPU_COUNT)
        ]

        JOB_RESULTS: list[list[list[int]]]

        # One worker per job, sized like the data split above.
        with Pool(processes=CPU_COUNT) as pool:
            JOB_RESULTS = pool.map(split_encode, JOBS)

        for i, job_output in enumerate(JOB_RESULTS):
            NEW_DATA.extend(job_output)
            print(f"Joined {i + 1} out of {CPU_COUNT}")

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return NEW_DATA

    def __increment_counter(self, counter: int):
        """Return ``counter + 1``.

        Python integers have arbitrary precision, so no overflow handling is
        needed.
        """
        return counter + 1

    def __make_list_ids(self, corpus: str):
        """Return the UTF-8 bytes of ``corpus`` as a list of integers."""
        return list(corpus.encode("utf-8"))
|
||||
219
Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
Normal file
219
Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py
Normal file
@ -0,0 +1,219 @@
|
||||
from collections import deque
|
||||
from .Encoder import Encoder
|
||||
from ..Errors import OutOfDictionaryException, DuplicateWordException
|
||||
|
||||
|
||||
# ABOUT THE DICTIONARY:
# the string is converted into its UTF-8 bytes, that is: each char is represented by a sequence of 1 to 4 bytes.
# Each byte is cast to an integer, so that an integer whose value is lower than 256
# represents a UTF-8 byte, while any other integer is a token ID.
|
||||
class NanoSocratesBatchMemoryBPE:
    """State carried across the batches of one training round.

    It stores the frequency of every couple of tokens seen so far together
    with the merge threshold.
    """

    def __init__(
        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
    ) -> None:
        # The given dict is stored as is (not copied).
        self.frequencies, self.merge_treshold = frequencies, merge_treshold
|
||||
|
||||
|
||||
class NanoSocratesBPE(Encoder):
    """Byte-pair encoder working on UTF-8 bytes.

    Integers below 256 are raw bytes; every other integer is the ID of a
    learnt couple of tokens.
    """

    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
        """Create the encoder, optionally starting from a known vocabulary.

        Args:
            vocabulary: mapping from a couple of tokens to its token ID.

        Raises:
            OutOfDictionaryException: if a token ID is lower than 256.
        """
        super().__init__()

        self.__vocabulary: dict[tuple[int, int], int] = {}
        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}

        if vocabulary is None:
            return

        for couple, token_id in vocabulary.items():
            # values under 256 are used for unpaired char
            if token_id < 256:
                raise OutOfDictionaryException()
            # TODO: check if they are in order
            self.__vocabulary[couple] = token_id
            self.__reverse_vocabulary[token_id] = couple

    @property
    def vocabulary_size(self):
        """Number of known tokens, the 256 byte tokens included."""
        return 256 + len(self.__vocabulary)

    @property
    def vocabulary(self):
        """The couple -> token ID mapping (the internal dict, not a copy)."""
        return self.__vocabulary

    @property
    def __next_id(self) -> int:
        """Get the ID that the next learnt couple will receive.

        Returns:
            int: the current vocabulary size.
        """
        return self.vocabulary_size

    # TODO: implement fit
    def fit(
        self,
        chunk_data: list[int],
        memory: NanoSocratesBatchMemoryBPE,
        last_batch: bool,
    ):
        """Count the couples of one chunk and, on the last batch, learn one.

        Args:
            chunk_data (list[int]): piece to encode and count.
            memory (NanoSocratesBatchMemoryBPE): frequencies accumulated over
                the batches; updated in place.
            last_batch (bool): when true the most frequent couple is learnt,
                provided its frequency reaches the merge threshold.

        Returns:
            tuple: ``(self, memory, encoded_chunk)``.
        """
        encoded = self.encode_intermediate(chunk_data)
        outcome = (self, memory, encoded)

        # update frequency of each couple of element
        for couple in zip(encoded, encoded[1:]):
            memory.frequencies[couple] = memory.frequencies.get(couple, 0) + 1

        if not last_batch or not memory.frequencies:
            return outcome

        counts = memory.frequencies
        best_couple = max(counts, key=counts.get)

        if counts[best_couple] >= memory.merge_treshold:
            self.__learn_word(best_couple)

        return outcome

    def encode(self, piece: str) -> list[int]:
        """Encode a string into token IDs.

        The string is first converted to its UTF-8 bytes, then handed to
        ``encode_intermediate()``.

        Args:
            piece (str): text to encode.

        Returns:
            list[int]: the token IDs.
        """
        return self.encode_intermediate(list(piece.encode("utf-8")))

    def encode_intermediate(self, piece: list[int]) -> list[int]:
        """Encode a piece (as list of integers) as far as possible.

        Args:
            piece (list[int]): piece to encode.

        Returns:
            list[int]: the encoded piece.
        """
        previous = piece
        candidate = self.__round_encode(previous)

        # keep encoding as long as a round manages to shorten the piece
        while len(candidate) != len(previous):
            previous, candidate = candidate, self.__round_encode(candidate)

        return previous

    def __round_encode(self, piece: list[int]):
        """Perform a single left-to-right encoding pass over ``piece``.

        Several passes are needed for a full encode:

        1) "ABAB" -> "XX"
        2) "XX" -> "Y"

        Args:
            piece (list[int]): the object to encode as a list of integers.

        Returns:
            (list[int]): the once-encoded object.
        """
        if len(piece) == 1:
            return piece

        last_index = len(piece) - 1
        encoded: list[int] = []

        cursor = 0
        while cursor < last_index:
            # look up the couple made of two consecutive elements
            token = self.__vocabulary.get((piece[cursor], piece[cursor + 1]))

            if token is None:
                # no token for this couple: keep its first element
                encoded.append(piece[cursor])
                cursor += 1
            else:
                # the couple is replaced by its token
                encoded.append(token)
                cursor += 2

        # the last element was not consumed by a couple: keep it
        if cursor == last_index:
            encoded.append(piece[cursor])

        return encoded

    # TODO: Remake decode to take a list of token IDs
    def decode(self, token_ids: list[int]) -> str:
        """Expand token IDs back to bytes and decode them as UTF-8 text.

        Raises:
            OutOfDictionaryException: if a token ID is unknown.
        """
        # Stack with the next token to process on top.
        pending: list[int] = list(token_ids)[::-1]
        raw_bytes = bytearray()

        while pending:
            token_id = pending.pop()

            if token_id < 256:
                raw_bytes.append(token_id)
                continue

            left_token, right_token = self.__token_decode(token_id)

            # Push right first so that left is expanded first.
            pending.append(right_token)
            pending.append(left_token)

        return raw_bytes.decode("utf-8")

    def __token_decode(self, token_id: int) -> tuple[int, int]:
        """Return the couple of tokens that ``token_id`` stands for.

        Raises:
            OutOfDictionaryException: if ``token_id`` is unknown.
        """
        couple = self.__reverse_vocabulary.get(token_id)

        if couple is None:
            raise OutOfDictionaryException()

        return couple

    def __learn_word(self, words: tuple[int, int]):
        """Add a new couple to the vocabulary under the next free ID.

        Args:
            words (tuple[int, int]): the couple to replace with a new token ID.

        Raises:
            DuplicateWordException: if the couple is already in the vocabulary.
        """
        new_id = self.__next_id

        if self.__vocabulary.get(words) is not None:
            raise DuplicateWordException()

        self.__vocabulary[words] = new_id
        self.__reverse_vocabulary[new_id] = words
|
||||
70
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
Normal file
70
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py
Normal file
@ -0,0 +1,70 @@
|
||||
from pathlib import Path
|
||||
import re
|
||||
from ..Errors import DelimiterNotFoundException
|
||||
|
||||
|
||||
class NanoSocratesChunker:
|
||||
|
||||
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
|
||||
self.__max_size: int = max_size
|
||||
self.__special_token_regex: re.Pattern = special_token_regex
|
||||
self.__residual: str = ""
|
||||
|
||||
# max theorethical size of chars
|
||||
# between special tokens:
|
||||
# - min: size - len(longest_token)
|
||||
# - MAX: size - len(shortest_token)
|
||||
def chunk(self, file_path: Path):
|
||||
# read_file
|
||||
FILE = open(file_path, "r", encoding="utf-8")
|
||||
exit = False
|
||||
|
||||
while not exit:
|
||||
REMAINING_SIZE = self.__max_size - len(self.__residual)
|
||||
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
|
||||
FILE_CHUNK = FILE.read(READ_SIZE)
|
||||
|
||||
if len(FILE_CHUNK) == 0:
|
||||
exit = True
|
||||
continue
|
||||
|
||||
CHUNK = self.__append_residuals(FILE_CHUNK)
|
||||
|
||||
boundaries = self.__identify_boudaries(CHUNK)
|
||||
|
||||
if boundaries is None:
|
||||
|
||||
# boundaries not found in 2 chunks,
|
||||
if len(CHUNK) > self.__max_size - 1:
|
||||
raise DelimiterNotFoundException()
|
||||
|
||||
if exit:
|
||||
yield CHUNK
|
||||
|
||||
self.__set_residual(0, CHUNK)
|
||||
continue
|
||||
|
||||
start, end = boundaries
|
||||
self.__set_residual(end, CHUNK)
|
||||
yield CHUNK[start:end]
|
||||
|
||||
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
|
||||
|
||||
end = 0
|
||||
|
||||
for match in self.__special_token_regex.finditer(corpus):
|
||||
# print(match)
|
||||
end = match.end()
|
||||
|
||||
if end == 0:
|
||||
return None
|
||||
|
||||
return (0, end)
|
||||
|
||||
def __append_residuals(self, corpus: str) -> str:
|
||||
RESIDUAL = self.__residual
|
||||
self.__residual = ""
|
||||
return RESIDUAL + corpus
|
||||
|
||||
def __set_residual(self, index: int, corpus: str):
|
||||
self.__residual = corpus[index:]
|
||||
68
Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
Normal file
68
Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
Normal file
@ -0,0 +1,68 @@
|
||||
from .Encoder import Encoder
|
||||
from ..Errors import OutOfDictionaryException
|
||||
|
||||
|
||||
class NanoSocratesSpecial(Encoder):
    """Encoder mapping whole special tokens to IDs placed after the BPE ones."""

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: list[str] | None = None
    ) -> None:
        """Create the encoder and register the given special tokens.

        Args:
            bpe_vocabulary_size (int): offset of the special IDs; the first
                special token receives ``bpe_vocabulary_size + 1``.
                NOTE(review): ``bpe_vocabulary_size`` itself is left
                unassigned -- confirm this gap is intended.
            special_tokens (list[str] | None): tokens to register, in order;
                repeated tokens are registered once.
        """
        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        # ``None`` instead of a shared mutable ``[]`` default.
        for TOKEN in special_tokens or []:
            self.add_special_word_to_vocabulary(TOKEN)

    @property
    def __next_id(self):
        """ID that the next new special token will receive."""
        BPE_OFFSET = self.__bpe_offset
        VOC_LENGTH = len(self.__vocabulary)
        return BPE_OFFSET + VOC_LENGTH + 1

    @property
    def vocabulary_size(self) -> int:
        """Number of registered special tokens."""
        return len(self.vocabulary)

    @property
    def vocabulary(self) -> dict[str, int]:
        """The token -> ID mapping (the internal dict, not a copy)."""
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        """The ID -> token mapping (the internal dict, not a copy)."""
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str):
        """Register ``word`` under the next free ID.

        Registering a known word again is a no-op: the next ID is derived
        from the vocabulary length, so re-assigning an existing word would
        make a later word share its ID.
        """
        if word in self.__vocabulary:
            return

        CANDIDATE_INDEX = self.__next_id
        self.__vocabulary[word] = CANDIDATE_INDEX
        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

    def encode(self, word: str) -> list[int]:
        """Return the ID of ``word`` wrapped in a one-element list.

        Raises:
            OutOfDictionaryException: if ``word`` is not a registered token.
        """
        ID = self.__vocabulary.get(word)

        if ID is None:
            raise OutOfDictionaryException()

        return [ID]

    def decode(self, token_id: list[int]) -> str:
        """Return the special token whose ID is the only element of ``token_id``.

        Raises:
            OutOfDictionaryException: if ``token_id`` does not hold exactly
                one element or the ID is unknown.
        """
        if len(token_id) != 1:
            raise OutOfDictionaryException()

        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD
|
||||
98
Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
Normal file
98
Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py
Normal file
@ -0,0 +1,98 @@
|
||||
import re
|
||||
from collections import deque
|
||||
from typing import Generator
|
||||
from ..Enums import TokenType
|
||||
|
||||
|
||||
class NanoSocratesSplitter:
    """Split text (or token ids) into BPE pieces and special-token pieces."""

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        """
        Args:
            special_token_regex (re.Pattern): already compiled regex matching
                any special token
            max_bpe_token_id (int): ids strictly greater than this value are
                treated as special tokens while decoding
        """
        # attention: the regex is expected to be compiled already
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text around the special tokens found by the regex.

        Text located after the last special token is NOT yielded: only
        pieces closed by a special token are produced.

        Args:
            corpus (str): all the corpus string to split

        Yields:
            Generator[tuple[str, TokenType]]: a piece of the split text and
                its TokenType, which tells whether the piece is meant for the
                BPE encoder or is a special token [BPE, SPECIAL]
        """
        bpe_start = 0

        for special_token_start, special_token_end in self.__find_boundaries(corpus):

            # BPE text sits between the previous special token and this one
            BPE_TOKEN_TEXT = corpus[bpe_start:special_token_start]
            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # the special token itself
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # the next BPE piece starts right after this special token
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """Find the start and end (end not included) of each special token.

        Args:
            corpus (str): the string where the special tokens are searched

        Yields:
            Generator[tuple[int, int]]: (start, end) of a match, end excluded
        """
        for match in self.__special_token_regex.finditer(corpus):
            yield (match.start(), match.end())

    def __split_words(self, bpe_piece: str) -> Generator[str]:
        """Cut ``bpe_piece`` before every space.

        Each space stays attached to the word that follows it, e.g.
        ``"a b"`` -> ``"a"``, ``" b"``. A leading space therefore produces an
        empty first piece.
        """
        bound_start = 0
        space_index = bpe_piece.find(" ")

        while space_index != -1:
            yield bpe_piece[bound_start:space_index]
            bound_start = space_index
            space_index = bpe_piece.find(" ", space_index + 1)

        yield bpe_piece[bound_start:]

    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
        """Group token ids into runs of BPE ids and single special ids.

        Args:
            corpus (list[int]): token ids to split

        Yields:
            Generator[tuple[list[int], TokenType]]: either a run of
                consecutive BPE ids, or a one-element list holding a special
                id (any id greater than ``max_bpe_token_id``)
        """
        not_special_token_list: list[int] = []

        for token in corpus:

            if token > self.__max_bpe_token_id:

                # flush the BPE run collected so far
                if not_special_token_list:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)

        # A sequence may end with BPE ids (e.g. a generation cut before its
        # end token): flush them too, otherwise the tail is silently lost.
        if not_special_token_list:
            yield (not_special_token_list, TokenType.BPE)
|
||||
8
Project_Model/Libs/BPE/Classes/TokeNano.py
Normal file
8
Project_Model/Libs/BPE/Classes/TokeNano.py
Normal file
@ -0,0 +1,8 @@
|
||||
|
||||
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
|
||||
|
||||
class TokeNano:
    """Placeholder tokenizer facade; it currently holds no state or behaviour."""

    def __init__(self):
        """Create an empty instance (nothing is initialised yet)."""
        return None
|
||||
84
Project_Model/Libs/BPE/Classes/TokeNanoCore.py
Normal file
84
Project_Model/Libs/BPE/Classes/TokeNanoCore.py
Normal file
@ -0,0 +1,84 @@
|
||||
from pathlib import Path
|
||||
|
||||
from ..Classes import NanoSocratesSplitter
|
||||
from ..Classes import NanoSocratesBPE
|
||||
from ..Classes import NanoSocratesSpecial
|
||||
|
||||
from ..Utils import special_regex_maker
|
||||
from ..Enums import TokenType
|
||||
from ..Enums import SpecialToken
|
||||
|
||||
|
||||
class TokeNanoCore:
    """Tokenizer combining a BPE encoder with a special-token encoder."""

    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
        # special_vocabulary: dict[str, int]
    ):
        """
        Args:
            bpe_vocabulary (dict[tuple[int, int], int]): BPE merges vocabulary
            special_token_list (list[str]): special tokens, in id order
        """
        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)

        SPECIAL_REGEX = special_regex_maker(special_token_list)
        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size

        # The BPE vocabulary size is both the splitter threshold and the
        # offset from which the special ids start
        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
        self.__special_encoder = NanoSocratesSpecial(
            BPE_VOCABULARY_SIZE, special_token_list
        )

    @property
    def vocabulary_size(self):
        """Total number of ids: BPE vocabulary plus special vocabulary."""
        BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
        SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
        return BPE_VOC_SIZE + SPECIAL_VOC_SIZE

    def __encode_pieces(self, corpus: str) -> list[int]:
        """Split ``corpus`` and encode each piece with the matching encoder."""
        output: list[int] = []

        for piece, token_type in self.__splitter.split_text(corpus):

            if token_type == TokenType.SPECIAL:
                output.extend(self.__special_encoder.encode(piece))

            elif token_type == TokenType.BPE:
                output.extend(self.__bpe_encoder.encode(piece))

        return output

    def encode(self, corpus: str) -> list[int]:
        """Encode ``corpus`` into token ids.

        Args:
            corpus (str): text to encode

        Returns:
            list[int]: ids of every piece produced by the splitter
        """
        return self.__encode_pieces(corpus)

    def encode_incomplete_string(self, corpus: str) -> list[int]:
        """Encode a string which does not end with a special token.

        The corpus-end special token is appended before encoding and the last
        produced id is dropped afterwards.
        NOTE(review): this assumes the corpus-end token belongs to the special
        token list given at construction — confirm against callers.

        Args:
            corpus (str): text to encode

        Returns:
            list[int]: the encoded ids without the final one
        """
        corpus = corpus + SpecialToken.CORPUS_END.value
        return self.__encode_pieces(corpus)[:-1]

    def decode(self, corpus: list[int]) -> str:
        """Decode token ids back into text.

        Args:
            corpus (list[int]): token ids to decode

        Returns:
            str: concatenation of every decoded piece
        """
        pieces: list[str] = []

        # the splitter always yields a list of ids: a single-element list for
        # a special token, a run of ids for BPE
        for token, token_type in self.__splitter.split_tokens(corpus):

            if token_type == TokenType.SPECIAL:
                pieces.append(self.__special_encoder.decode(token))

            elif token_type == TokenType.BPE:
                pieces.append(self.__bpe_encoder.decode(token))

        return "".join(pieces)
|
||||
20
Project_Model/Libs/BPE/Classes/__init__.py
Normal file
20
Project_Model/Libs/BPE/Classes/__init__.py
Normal file
@ -0,0 +1,20 @@
|
||||
from .NanoSocratesChunker import NanoSocratesChunker
|
||||
from .NanoSocratesSplitter import NanoSocratesSplitter
|
||||
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
|
||||
from .NanoSocraTrainer import NanoSocraTrainer
|
||||
from .NanoSocraTraineRam import NanoSocraTraineRam
|
||||
from .NanoSocraTrainerPool import NanoSocraTrainerPool
|
||||
from .NanoSocratesSpecial import NanoSocratesSpecial
|
||||
from .TokeNanoCore import TokeNanoCore
|
||||
from .TokeNano import TokeNano
|
||||
|
||||
# Public API of the package: every class imported above is re-exported, so
# that `from .Classes import *` exposes all of them.
__all__ = [
    "NanoSocratesChunker",
    "NanoSocratesSplitter",
    "NanoSocratesBPE",
    "NanoSocratesBatchMemoryBPE",
    "NanoSocraTrainer",
    "NanoSocraTraineRam",
    "NanoSocraTrainerPool",
    "NanoSocratesSpecial",
    "TokeNanoCore",
    "TokeNano",
]
|
||||
27
Project_Model/Libs/BPE/Enums/SpecialToken.py
Normal file
27
Project_Model/Libs/BPE/Enums/SpecialToken.py
Normal file
@ -0,0 +1,27 @@
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class SpecialToken(Enum):
    """Special tokens, stored as their literal text.

    The declaration order is significant for anything that iterates over the
    enum, so members must not be reordered.
    """

    # NOTE: declaring the class as (Enum, str) throws an error

    # --- Triples / abstract markers
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"

    # --- Task tokens
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"

    # --- BPE training / NanoSocrates
    START = "<START>"
    CORPUS_END = "<END>"
    START_OF_SEQUENCE = "<SOS>"
    END_OF_SEQUENCE = "<EOS>"
    PAD = "<PAD>"
|
||||
|
||||
6
Project_Model/Libs/BPE/Enums/TokenType.py
Normal file
6
Project_Model/Libs/BPE/Enums/TokenType.py
Normal file
@ -0,0 +1,6 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
class TokenType(Enum):
    """Kind of piece produced while splitting: special token or BPE text."""

    # a special token, handled by the special-token encoder
    SPECIAL = auto()
    # ordinary text / ids, handled by the BPE encoder
    BPE = auto()
|
||||
6
Project_Model/Libs/BPE/Enums/__init__.py
Normal file
6
Project_Model/Libs/BPE/Enums/__init__.py
Normal file
@ -0,0 +1,6 @@
|
||||
from .TokenType import TokenType
|
||||
from .SpecialToken import SpecialToken
|
||||
|
||||
# Both enums imported above are part of the public API; without "TokenType"
# here, `from .Enums import *` would not re-export it.
__all__ = [
    "TokenType",
    "SpecialToken",
]
|
||||
@ -0,0 +1,4 @@
|
||||
class DelimiterNotFoundException(Exception):
    """Error type for a delimiter that could not be found.

    It takes the same positional arguments as ``Exception``.
    """
|
||||
4
Project_Model/Libs/BPE/Errors/DuplicateWordException.py
Normal file
4
Project_Model/Libs/BPE/Errors/DuplicateWordException.py
Normal file
@ -0,0 +1,4 @@
|
||||
class DuplicateWordException(Exception):
    """Error type for a duplicated word.

    It takes the same positional arguments as ``Exception``.
    """
|
||||
@ -0,0 +1,4 @@
|
||||
class OutOfDictionaryException(Exception):
    """Error type for a word or id that is not part of a vocabulary.

    It takes the same positional arguments as ``Exception``.
    """
|
||||
@ -0,0 +1,4 @@
|
||||
class SentenceTooLongException(Exception):
    """Error type for a sentence exceeding an allowed length.

    It takes the same positional arguments as ``Exception``.
    """
|
||||
11
Project_Model/Libs/BPE/Errors/__init__.py
Normal file
11
Project_Model/Libs/BPE/Errors/__init__.py
Normal file
@ -0,0 +1,11 @@
|
||||
from .DelimiterNotFoundException import DelimiterNotFoundException
|
||||
from .OutOfDictionaryException import OutOfDictionaryException
|
||||
from .DuplicateWordException import DuplicateWordException
|
||||
from .SentenceTooLongException import SentenceTooLongException
|
||||
|
||||
__all__ = [
|
||||
"DelimiterNotFoundException",
|
||||
"OutOfDictionaryException",
|
||||
"DuplicateWordException",
|
||||
"SentenceTooLongException"
|
||||
]
|
||||
15
Project_Model/Libs/BPE/Utils/__init__.py
Normal file
15
Project_Model/Libs/BPE/Utils/__init__.py
Normal file
@ -0,0 +1,15 @@
|
||||
from .special_regex_maker import special_regex_maker
|
||||
from .lag_checker_iterator import iterator_with_checks
|
||||
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
|
||||
from .json_utils import save_json, load_json
|
||||
from .special_regex_maker import special_regex_maker
|
||||
from .default_special_tokens import default_special_tokens
|
||||
|
||||
__all__ = [
|
||||
"special_regex_maker",
|
||||
"iterator_with_checks",
|
||||
"save_nanos_vocabulary",
|
||||
"load_nanos_vocabulary",
|
||||
"save_json", "load_json",
|
||||
"default_special_tokens"
|
||||
]
|
||||
4
Project_Model/Libs/BPE/Utils/default_special_tokens.py
Normal file
4
Project_Model/Libs/BPE/Utils/default_special_tokens.py
Normal file
@ -0,0 +1,4 @@
|
||||
from ..Enums import SpecialToken
|
||||
|
||||
def default_special_tokens() -> list[str]:
    """Return the text of every ``SpecialToken`` member, in declaration order."""
    values: list[str] = []
    for member in SpecialToken:
        values.append(member.value)
    return values
|
||||
18
Project_Model/Libs/BPE/Utils/json_utils.py
Normal file
18
Project_Model/Libs/BPE/Utils/json_utils.py
Normal file
@ -0,0 +1,18 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def save_json(dictionary: dict, path: Path):
    """Serialize ``dictionary`` as JSON and write it to ``path``.

    Args:
        dictionary (dict): JSON-serializable mapping to store
        path (Path): destination file, overwritten if it already exists
    """
    json_string = json.dumps(dictionary)

    # `with` guarantees the handle is closed even if the write fails
    with open(path, "w", encoding="utf-8") as FILE:
        FILE.write(json_string)
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict:
    """Read the file at ``path`` and parse its content as JSON.

    Args:
        path (Path): file holding a JSON document

    Returns:
        dict: the parsed document
    """
    # `with` guarantees the handle is closed even if the read fails
    with open(path, "r", encoding="utf-8") as FILE:
        json_string = FILE.read()

    return json.loads(json_string)
|
||||
27
Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
Normal file
27
Project_Model/Libs/BPE/Utils/lag_checker_iterator.py
Normal file
@ -0,0 +1,27 @@
|
||||
from collections import deque
|
||||
from typing import Generator, TypeVar
|
||||
|
||||
T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")


def iterator_with_checks(
    generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:
    """Yield every element together with a flag telling if it is the last one.

    The wrapped generator is read one element ahead, so that the last element
    can be recognised when it is yielded.

    Args:
        generator (Generator[T1, T2, T3]): the generator to wrap

    Yields:
        tuple[T1, bool]: the element and ``True`` only for the final element.
            Nothing is yielded for an empty generator.
    """
    # A StopIteration escaping from inside a generator body is converted into
    # a RuntimeError (PEP 479), so the empty case must be handled explicitly.
    try:
        last_element = next(generator)
    except StopIteration:
        return  # type: ignore[return-value]

    # Fetch the following element BEFORE yielding the current one
    for element in generator:
        yield (last_element, False)
        last_element = element

    yield (last_element, True)
|
||||
15
Project_Model/Libs/BPE/Utils/special_regex_maker.py
Normal file
15
Project_Model/Libs/BPE/Utils/special_regex_maker.py
Normal file
@ -0,0 +1,15 @@
|
||||
import re
|
||||
|
||||
|
||||
def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
    """Compile a regex matching any of the special tokens literally.

    Args:
        special_tokens (list[str]): the list of special tokens

    Returns:
        re.Pattern: alternation of the tokens
    """
    # Longest tokens first: in an alternation the first matching branch wins,
    # so a token that is a prefix of another one must come after it.
    ORDERED_TOKENS = sorted(special_tokens, key=len, reverse=True)

    # Tokens are plain text, not patterns: escape regex metacharacters
    REGEX_STR = "|".join(re.escape(token) for token in ORDERED_TOKENS)

    return re.compile(REGEX_STR)
|
||||
49
Project_Model/Libs/BPE/Utils/vocabulary.py
Normal file
49
Project_Model/Libs/BPE/Utils/vocabulary.py
Normal file
@ -0,0 +1,49 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from ..Errors import OutOfDictionaryException
|
||||
|
||||
|
||||
def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:
    """Serialize a BPE vocabulary to a JSON string.

    JSON keys must be strings, so each tuple key is stored as its textual
    form (e.g. ``"(1, 2)"``).

    Args:
        vocabulary (dict[tuple[int, int], int]): pair of ids -> merged id

    Returns:
        str: the JSON document
    """
    serializable: dict[str, int] = {
        str(pair): token_id for pair, token_id in vocabulary.items()
    }
    return json.dumps(serializable)
|
||||
|
||||
|
||||
def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:
    """Rebuild a BPE vocabulary from its JSON string form.

    Every key is expected to be the text of a 2-tuple of integers, such as
    ``"(1, 2)"``.

    Args:
        json_string (str): JSON document mapping tuple text -> id

    Returns:
        dict[tuple[int, int], int]: pair of ids -> merged id

    Raises:
        OutOfDictionaryException: if a key does not hold exactly two numbers
    """
    raw_entries: dict[str, int] = json.loads(json_string)
    vocabulary: dict[tuple[int, int], int] = {}

    for text_key, token_id in raw_entries.items():
        # drop the first and last character (the surrounding parentheses)
        inner = text_key[1:-1]
        pair = tuple(int(number) for number in inner.split(","))

        if len(pair) != 2:
            raise OutOfDictionaryException()

        # the length was validated just above
        vocabulary[pair] = token_id  # type: ignore

    return vocabulary
|
||||
|
||||
|
||||
def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):
    """Write a BPE vocabulary to ``path`` as JSON.

    Args:
        vocabulary (dict[tuple[int, int], int]): pair of ids -> merged id
        path (Path): destination file, overwritten if it already exists
    """
    json_string = nanos_vocabulary2json_str(vocabulary)

    # `with` guarantees the handle is closed even if the write fails
    with open(path, "w") as FILE:
        FILE.write(json_string)
|
||||
|
||||
|
||||
def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
    """Read a BPE vocabulary from the JSON file at ``path``.

    Args:
        path (Path): file holding the JSON form of a vocabulary

    Returns:
        dict[tuple[int, int], int]: pair of ids -> merged id
    """
    # `with` guarantees the handle is closed even if the read fails
    with open(path, "r") as FILE:
        json_string = FILE.read()

    return nanos_json_str2vocabulary(json_string)
|
||||
9
Project_Model/Libs/BPE/__init__.py
Normal file
9
Project_Model/Libs/BPE/__init__.py
Normal file
@ -0,0 +1,9 @@
|
||||
from .Classes import *
|
||||
from .Enums import *
|
||||
from .Errors import *
|
||||
from .Utils import *
|
||||
|
||||
from . import Classes
|
||||
from . import Enums
|
||||
from . import Errors
|
||||
from . import Utils
|
||||
11
Project_Model/Libs/Batch/Classes/BatchEmbedder.py
Normal file
11
Project_Model/Libs/Batch/Classes/BatchEmbedder.py
Normal file
@ -0,0 +1,11 @@
|
||||
from ....Libs.Embedder.Classes.NanoSocratesEmbedder import NanoSocratesEmbedder
|
||||
import torch
|
||||
|
||||
class BatchEmbedder(torch.nn.Module):
    """Module wrapping a ``NanoSocratesEmbedder`` for batched inputs."""

    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        """
        Args:
            vocabulary_size (int): forwarded to the wrapped embedder
            embedding_size (int): forwarded to the wrapped embedder
        """
        super().__init__()
        self.__embedder = NanoSocratesEmbedder(vocabulary_size, embedding_size)

    def forward(self, tokenized_batch: torch.Tensor) -> torch.Tensor:
        """Embed a batch of token ids with the wrapped embedder.

        NOTE(review): the original method was left unfinished
        (``def forward(self, )`` with no body, which is a syntax error).
        This minimal delegation keeps the module importable — confirm the
        intended batching logic.
        """
        return self.__embedder(tokenized_batch)
|
||||
104
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
104
Project_Model/Libs/Batch/Classes/Batcher.py
Normal file
@ -0,0 +1,104 @@
|
||||
import random
|
||||
from typing import Generator
|
||||
import pandas as pd
|
||||
|
||||
import Project_Model.Libs.BPE as BPE
|
||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||
from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
|
||||
from TokenCompletation import TokenCompletationTransformer
|
||||
from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
|
||||
|
||||
class Batcher:
    """Build shuffled multi-task (X, Y) batches from a CSV dataset.

    The CSV is read in chunks; each chunk is tokenized and expanded into four
    task views (rdf2text, text2rdf, masking, completion) which are then
    concatenated and shuffled.
    """

    def __init__(self, dataset_path: str, batch_size: int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
        """
        Args:
            dataset_path (str): CSV file with "Abstract" and "RDFs" columns
            batch_size (int): number of rows per yielded batch, spread over
                the four tasks
            tokenizer (BPE.TokeNanoCore): tokenizer used on both columns
            masker (SpannedMasker): provides ``mask_sequence`` for masking

        Raises:
            ValueError: if a required special token does not encode to
                exactly one id
        """
        # ABSTRACT, TRIPLE
        # tasks:
        # rdf2text: X: TRIPLE, Y: ABSTRACT
        # text2rdf: X: ABSTRACT, Y: TRIPLE
        # masking (call masker): X: incomplete_triple, Y: complete_triple
        # completion: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
        self._dataset_path = dataset_path
        self._batch_size = batch_size
        self._tokenizer = tokenizer
        self._masker = masker

        # The completion transformer compares and appends single ids, while
        # `encode` returns a list of ids: unwrap the single id.
        sotl = self.__single_token_id(SpecialToken.START_TRIPLE_LIST.value)
        eos = self.__single_token_id(SpecialToken.END_OF_SEQUENCE.value)
        self._token_completation = TokenCompletationTransformer(sotl, eos)

    def __single_token_id(self, special_token: str) -> int:
        """Encode ``special_token`` and return its single id."""
        ids = self._tokenizer.encode(special_token)
        if len(ids) != 1:
            raise ValueError(
                f"{special_token} must encode to exactly one id, got {ids}"
            )
        return ids[0]

    def get_batch(self) -> Generator[pd.DataFrame]:
        """Yield shuffled DataFrames with columns "X" and "Y".

        Yields:
            pd.DataFrame: the four task views of one CSV chunk, concatenated
                and shuffled
        """
        # Four tasks are produced per row; never ask pandas for an empty
        # chunk (chunksize must be >= 1)
        CHUNK_SIZE = max(1, self._batch_size // 4)

        for batch in pd.read_csv(self._dataset_path, chunksize=CHUNK_SIZE):

            tokenized_batch = pd.DataFrame()
            tokenized_batch[["Abstract", "RDFs"]] = (
                batch[["Abstract", "RDFs"]]
                .map(self._tokenizer.encode)
            )

            rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
            txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
            mask_batch = self.__masking_trasformation(tokenized_batch)
            completation_batch = self.__token_completation_task(tokenized_batch)

            output = pd.concat(
                [rdf2txt_batch, txt2rdf_batch, mask_batch, completation_batch],
                ignore_index=True,
            )
            # shuffle the rows so that tasks are interleaved
            output = output.sample(frac=1).reset_index(drop=True)
            yield output

    def __random_subset_rdfs(self, batch: pd.DataFrame, seed=0):
        """WIP: split each "RDFs" string into its triples (modifies ``batch``).

        NOTE(review): unfinished and not called anywhere in this class; the
        random generator is created but not used yet and nothing is returned.
        """
        rng = random.Random(seed)

        def to_list(x):
            # drop what precedes the first start-of-triple token
            return x.split(SpecialToken.START_TRIPLE.value)[1:]

        batch["RDFs"] = batch["RDFs"].map(to_list)

    def __rdf2txt_transformation(self, batch: pd.DataFrame):
        """Task view: X = RDFs, Y = Abstract."""
        batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
        return batch[["X", "Y"]]

    def __txt2rdf_transformation(self, batch: pd.DataFrame):
        """Task view: X = Abstract, Y = RDFs."""
        batch = batch.rename(columns={"Abstract": "X", "RDFs": "Y"})
        return batch[["X", "Y"]]

    def __expand_xy(self, batch: pd.DataFrame, transformation):
        """Apply ``transformation`` to each "RDFs" cell and expand its (X, Y) result."""
        xy_tuples = batch["RDFs"].apply(transformation)  # Series of (X, Y)

        output = batch.copy()
        # Expand into two columns preserving the original index
        output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
        return output[["X", "Y"]]

    def __masking_trasformation(self, batch: pd.DataFrame):
        """Task view built by the masker on each "RDFs" sequence."""
        # mask_sequence: List[int] -> Tuple[List[int], List[int]]
        return self.__expand_xy(batch, self._masker.mask_sequence)

    def __token_completation_task(self, batch: pd.DataFrame):
        """Task view built by the completion transformer on each "RDFs" sequence."""
        return self.__expand_xy(
            batch, self._token_completation.get_completation_tuple
        )
|
||||
|
||||
|
||||
|
||||
"""
|
||||
DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
|
||||
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
|
||||
|
||||
from pathlib import Path
|
||||
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
|
||||
SPECIAL_LIST = BPE.default_special_tokens()
|
||||
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
|
||||
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))
|
||||
|
||||
MASKER = SpannedMasker(TOKENANO.vocabulary_size,SPECIAL_TOKENS)
|
||||
|
||||
prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
|
||||
print(TOKENANO.encode(prova))
|
||||
batcher = Batcher(DATASET_PATH,8,TOKENANO,MASKER)
|
||||
for batch in batcher.get_batch():
|
||||
print(batch)
|
||||
"""
|
||||
33
Project_Model/Libs/Batch/Classes/TokenCompletation.py
Normal file
33
Project_Model/Libs/Batch/Classes/TokenCompletation.py
Normal file
@ -0,0 +1,33 @@
|
||||
|
||||
class TokenCompletationTransformer:
    """Cut a token sequence in two parts to build a completion task (X, Y)."""

    def __init__(self, SOTL_token, EOS_token, input_percent: float = 0.5) -> None:
        """
        Args:
            SOTL_token: id of the token used as cut marker
            EOS_token: id appended at the end of the input part
            input_percent (float): share of the markers kept in the input
        """
        self.__SOTL_token = SOTL_token
        self.__EOS_token = EOS_token
        self.__input_percent = input_percent

    def get_completation_tuple(
        self,
        token_sequence: list[int],
    ) -> tuple[list[int], list[int]]:
        """Split ``token_sequence`` at one of its marker tokens.

        The cut falls on the k-th marker, where k is the number of markers
        multiplied by ``input_percent`` and truncated to an integer.

        Args:
            token_sequence (list[int]): the sequence to split

        Returns:
            tuple[list[int], list[int]]: (X, Y), where X is what precedes the
                cut followed by the EOS id, and Y is the sequence from the
                cut onwards
        """
        markers_to_consume = int(
            token_sequence.count(self.__SOTL_token) * self.__input_percent
        )

        # Walk the sequence until the requested number of markers was seen
        cursor = 0
        while markers_to_consume > 0:
            if token_sequence[cursor] == self.__SOTL_token:
                markers_to_consume -= 1
            cursor += 1

        # step back onto the last marker seen
        # NOTE(review): when no marker is consumed this index is -1, so the
        # cut falls before the last element — confirm this is intended.
        cut_index = cursor - 1

        input_part = token_sequence[:cut_index]
        input_part.append(self.__EOS_token)
        target_part = token_sequence[cut_index:]
        return (input_part, target_part)
|
||||
|
||||
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
8
Project_Model/Libs/Batch/Enums/TaskType.py
Normal file
@ -0,0 +1,8 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
class TaskType(Enum):
    """Kinds of task a batch row can belong to."""

    RDF2TXT = auto()       # RDF triples -> text
    TEXT2RDF = auto()      # text -> RDF triples
    MASKING = auto()       # masked sequence reconstruction
    COMPLETATION = auto()  # sequence completion
|
||||
23
Project_Model/Libs/Embedder/Classes/NanoSocratesEmbedder.py
Normal file
23
Project_Model/Libs/Embedder/Classes/NanoSocratesEmbedder.py
Normal file
@ -0,0 +1,23 @@
|
||||
import torch
|
||||
from ..Utils import fixed_positional_encoding
|
||||
|
||||
|
||||
# WIP FOR BATCHING
|
||||
class NanoSocratesEmbedder(torch.nn.Module):
    """Token embedding followed by the addition of fixed positional encodings."""

    def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
        """
        Args:
            vocabulary_size (int): number of rows of the embedding table
            embedding_size (int): size of each embedding vector
        """
        super().__init__()
        self.__embedder = torch.nn.Embedding(vocabulary_size, embedding_size)

    def forward(self, tokenized_sentence: torch.Tensor) -> torch.Tensor:
        """Embed token ids and add positional encodings.

        Args:
            tokenized_sentence (torch.Tensor): token ids whose embeddings
                have exactly three dimensions (the shape is unpacked as
                batch, sentence length, embedding size)

        Returns:
            torch.Tensor: embeddings plus positional encodings
        """
        computed_embeddings: torch.Tensor = self.__embedder(tokenized_sentence)

        _, SENTENCE_LENGTH, EMBEDDING_SIZE = computed_embeddings.shape  # for batching

        POSITIONAL_ENCODINGS = fixed_positional_encoding(
            SENTENCE_LENGTH, EMBEDDING_SIZE
        )

        # The encodings are created on the default device: move them where
        # the embeddings live, otherwise the sum fails on accelerators.
        POSITIONAL_ENCODINGS = POSITIONAL_ENCODINGS.to(computed_embeddings.device)

        # broadcast over the batch dimension
        computed_embeddings = computed_embeddings + POSITIONAL_ENCODINGS
        return computed_embeddings
|
||||
5
Project_Model/Libs/Embedder/Classes/__init__.py
Normal file
5
Project_Model/Libs/Embedder/Classes/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from .NanoSocratesEmbedder import NanoSocratesEmbedder
|
||||
|
||||
__all__ = [
|
||||
"NanoSocratesEmbedder"
|
||||
]
|
||||
5
Project_Model/Libs/Embedder/Utils/__init__.py
Normal file
5
Project_Model/Libs/Embedder/Utils/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from .fixed_positional_encoding import fixed_positional_encoding
|
||||
|
||||
__all__ = [
|
||||
"fixed_positional_encoding"
|
||||
]
|
||||
@ -0,0 +1,28 @@
|
||||
import torch
|
||||
|
||||
|
||||
def fixed_positional_encoding(
    sentence_dimension: int,
    embedding_dimension: int,
) -> torch.Tensor:
    """Build sinusoidal positional encodings.

    Even embedding positions hold a sine, odd positions a cosine; each pair
    (2k, 2k + 1) shares the divisor ``10000 ** (2k / embedding_dimension)``.

    Args:
        sentence_dimension (int): number of positions to encode
        embedding_dimension (int): size of each encoding vector

    Returns:
        torch.Tensor: tensor of shape (sentence_dimension, embedding_dimension)
    """
    base = int(1e4)
    positions = torch.tensor(list(range(sentence_dimension)))

    columns: list[torch.Tensor] = []

    for column_index in range(embedding_dimension):
        # The exponent uses the index of the (sin, cos) pair, i.e.
        # column_index // 2, so that both members share the same frequency
        pair_exponent = (2 * (column_index // 2)) / embedding_dimension
        angles = positions / (base ** pair_exponent)

        wave = torch.sin if column_index % 2 == 0 else torch.cos
        columns.append(wave(angles))

    # columns were stacked as rows: swap the axes to get (position, feature)
    return torch.stack(columns).transpose(0, 1)
|
||||
7
Project_Model/Libs/Embedder/__init__.py
Normal file
7
Project_Model/Libs/Embedder/__init__.py
Normal file
@ -0,0 +1,7 @@
|
||||
from .Utils import *
|
||||
from .Classes import *
|
||||
|
||||
from . import Utils
|
||||
from . import Classes
|
||||
|
||||
|
||||
5
Project_Model/Libs/TorchShims/Utils/__init__.py
Normal file
5
Project_Model/Libs/TorchShims/Utils/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from .get_default_device import get_default_device
|
||||
|
||||
__all__ = [
|
||||
"get_default_device"
|
||||
]
|
||||
17
Project_Model/Libs/TorchShims/Utils/get_default_device.py
Normal file
17
Project_Model/Libs/TorchShims/Utils/get_default_device.py
Normal file
@ -0,0 +1,17 @@
|
||||
import torch
|
||||
|
||||
def get_default_device() -> torch.device:
    """Return the first available device among cuda, xpu, mps; cpu otherwise.

    Backends that are missing from the installed torch build are skipped
    instead of raising ``AttributeError``.

    Returns:
        torch.device: the selected device
    """
    # Cuda or ROCm
    if torch.cuda.is_available():
        return torch.device("cuda")

    # Intel GPUs (the `torch.xpu` module may be absent from the build)
    XPU = getattr(torch, "xpu", None)
    if XPU is not None and XPU.is_available():
        return torch.device("xpu")

    # Apple GPUs (the `torch.backends.mps` module may be absent from the build)
    MPS = getattr(torch.backends, "mps", None)
    if MPS is not None and MPS.is_available():
        return torch.device("mps")

    return torch.device("cpu")
|
||||
7
Project_Model/Libs/TorchShims/__init__.py
Normal file
7
Project_Model/Libs/TorchShims/__init__.py
Normal file
@ -0,0 +1,7 @@
|
||||
from .Utils import *
|
||||
|
||||
from .Utils import get_default_device
|
||||
|
||||
__all__ = [
|
||||
"get_default_device"
|
||||
]
|
||||
41
Project_Model/Libs/Training/learning_rade_shedulers.py
Normal file
41
Project_Model/Libs/Training/learning_rade_shedulers.py
Normal file
@ -0,0 +1,41 @@
|
||||
import numpy as np
|
||||
# custom LR from attention is all you need
|
||||
class Custom_lr():
    """Learning-rate schedule with warmup from "Attention Is All You Need".

    ``lr = d_model ** -0.5 * min(step ** -0.5, step * warmup_step ** -1.5)``
    """

    def __init__(self, d_model: int, warmup_step: int) -> None:
        """
        Args:
            d_model (int): model dimension used to scale the rate
            warmup_step (int): number of warmup steps
        """
        self.__d_model = d_model
        self.__warmup_step = warmup_step
        self.__epoch = 0

    def step(self) -> float:
        """Advance the internal counter by one and return the new rate.

        Returns:
            float: learning rate for the step just reached
        """
        self.__epoch += 1

        DECAY_TERM = self.__epoch ** -0.5
        WARMUP_TERM = self.__epoch * (self.__warmup_step ** -1.5)

        return (self.__d_model ** -0.5) * min(DECAY_TERM, WARMUP_TERM)
|
||||
|
||||
# OTHER LR
|
||||
|
||||
# Learning rate schedules (matching visualization parameters)
|
||||
def step_lr(epoch, lr):
    """Halve ``lr`` on every positive multiple of 20 epochs, else keep it.

    StepLR with step_size=20 and gamma=0.5 (from visualization).
    """
    if epoch > 0 and epoch % 20 == 0:
        return lr * 0.5
    return lr
|
||||
|
||||
def exp_lr(epoch, lr):
    """Return ``lr`` decayed by a factor 0.95; ``epoch`` is not used.

    ExponentialLR with gamma=0.95 (from visualization).
    """
    decayed = lr * 0.95
    return decayed
|
||||
|
||||
def cosine_lr(epoch, lr):
    """Cosine-annealed rate for ``epoch``; the incoming ``lr`` is not used.

    CosineAnnealingLR with lr_min=0.001, lr_max=0.1, max_epochs=100
    (from visualization).
    """
    floor_lr, peak_lr = 0.001, 0.1
    horizon = 100
    cosine_term = np.cos(epoch * np.pi / horizon)
    return floor_lr + 0.5 * (peak_lr - floor_lr) * (1 + cosine_term)
|
||||
|
||||
def cyclical_lr(epoch, lr):
    """Triangular cyclical rate for ``epoch``; the incoming ``lr`` is not used.

    CyclicalLR with base_lr=0.001, max_lr=0.1, step_size=20
    (from visualization).
    """
    lower, upper, half_cycle = 0.001, 0.1, 20

    # index of the current cycle, then the distance from the cycle's peak
    cycle_number = np.floor(1 + epoch / (2 * half_cycle))
    distance = np.abs(epoch / half_cycle - 2 * cycle_number + 1)

    return lower + (upper - lower) * max(0, (1 - distance))
|
||||
42
Project_Model/Libs/Training/logistic_collector.py
Normal file
42
Project_Model/Libs/Training/logistic_collector.py
Normal file
@ -0,0 +1,42 @@
|
||||
import torch
|
||||
|
||||
class LogitsCollector:
    """Accumulate per-step logits and turn them into greedy token sequences."""

    def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
        """
        Args:
            pad_token (int): id skipped while building sequences
            end_token (int): id at which a sequence is cut (not included)
            tokenizer: object exposing ``decode(list[int]) -> str``
        """
        self.__pad_token = pad_token  # used to skip PAD
        self.__end_token = end_token  # used to stop at END
        self.__tokenizer = tokenizer  # exposes .decode(list[int]) -> str
        self.__steps: list[torch.Tensor] = []  # list of per-step logits [B,V]

    def reset(self) -> None:
        """Forget every collected step."""
        self.__steps.clear()

    def add(self, logits_step: torch.Tensor) -> None:
        """Store the logits of one step.

        Args:
            logits_step (torch.Tensor): either [B,V], or a 3-dimensional
                tensor of which only the last position of dim 1 is kept
        """
        if logits_step.dim() == 3:
            logits_step = logits_step[:, -1, :]  # -> [B,V]
        self.__steps.append(logits_step.detach())  # store raw logits (detached)

    def tokens(self) -> list[list[int]]:
        """Return the greedy ids of every batch row.

        Each row is cut at the first END id (excluded) and PAD ids are
        skipped.

        Returns:
            list[list[int]]: one id sequence per batch row; empty list when
                no step was collected
        """
        if not self.__steps:
            return []

        stack = torch.stack(self.__steps, dim=0)  # [T,B,V]

        # Softmax is monotonic, so the argmax of the raw logits is the greedy
        # choice; skipping it also avoids ties created by saturation.
        ids = stack.argmax(dim=-1).transpose(0, 1)  # greedy ids -> [B,T]

        out: list[list[int]] = []
        for row in ids.tolist():
            seq: list[int] = []
            for tok in row:
                if tok == self.__end_token:  # stop on END
                    break
                if tok == self.__pad_token:  # skip PAD
                    continue
                seq.append(tok)
            out.append(seq)
        return out

    def print_decoded(self) -> None:
        """Print every sequence decoded by the tokenizer, one per line."""
        for i, seq in enumerate(self.tokens()):
            try:
                text = self.__tokenizer.decode(seq)  # decode tokens to string
            except Exception:
                # best effort: printing must not fail on undecodable ids
                text = str(seq)  # fallback to ids
            print(f"[{i}] {text}")
|
||||
0
Project_Model/Libs/Training/training.py
Normal file
0
Project_Model/Libs/Training/training.py
Normal file
19
Project_Model/Libs/Transformer/Classes/DeToken.py
Normal file
19
Project_Model/Libs/Transformer/Classes/DeToken.py
Normal file
@ -0,0 +1,19 @@
|
||||
import torch
|
||||
|
||||
|
||||
class DeToken(torch.nn.Module):
    """Linear projection from the embedding space to the vocabulary space."""

    def __init__(self, embedding_size: int, vocabulary_size: int) -> None:
        """
        Args:
            embedding_size (int): size of the incoming vectors
            vocabulary_size (int): size of the produced vectors
        """
        super().__init__()
        self.__linear = torch.nn.Linear(embedding_size, vocabulary_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` onto the vocabulary space.

        The raw output of the linear layer is returned: no softmax is
        applied here.
        """
        vocabulary_scores = self.__linear(x)
        return vocabulary_scores
|
||||
103
Project_Model/Libs/Transformer/Classes/Decoder.py
Normal file
103
Project_Model/Libs/Transformer/Classes/Decoder.py
Normal file
@ -0,0 +1,103 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from .FeedForwardNetwork import FeedForwardNetwork
|
||||
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
|
||||
from ..Utils.attention_mask import get_causal_attention_mask
|
||||
|
||||
# B, L(T), E_D
|
||||
|
||||
|
||||
class Decoder(nn.Module):
    """Decoder layer: masked self-attention, cross attention, feed-forward.

    Every sub-layer is followed by a residual connection and a layer
    normalization. Input and output are the same 5-tuple.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        """
        Args:
            embedding_dimension (int): size of the token representations
            feed_forward_hidden_layer_dimension (int): hidden size of the
                feed-forward network
            number_of_attention_heads (int): heads of both attention blocks
        """
        super().__init__()

        # self-attention over the decoder input
        self.__masked_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        # attention over the encoder output
        self.__cross_attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        # NOTE: created but not applied in forward (the dropout steps there
        # are currently disabled)
        self.__dropout = nn.Dropout(0.1)

        self.__feed_forward_network = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)

    def forward(
        self,
        args: tuple[
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor,
            torch.Tensor
        ]
    ):
        """Run the layer on ``(x, k_x, v_x, padding_mask, encoder_padding_mask)``.

        Everything travels in a single tuple so that layers can be chained
        sequentially; only ``x`` is changed in the returned tuple.

        Args:
            args: decoder input ``x``, keys ``k_x`` and values ``v_x`` for
                the cross attention, key padding mask of the decoder input,
                key padding mask of the encoder side

        Returns:
            tuple: the same five elements, with ``x`` replaced by the output
        """
        x, k_x, v_x, padding_mask, encoder_padding_mask = args

        # causal mask sized on dimension 1 of the input
        causal_mask = get_causal_attention_mask(x.size(1))

        # Masked self-attention + residual connection + layer normalization
        self_attended = self.__masked_attention(
            x, x, x, key_padding_mask=padding_mask, attention_mask=causal_mask
        )
        x = self.__layer_norm_1(x + self_attended)

        # Encoder-decoder (cross) attention + residual + layer normalization
        cross_attended = self.__cross_attention(
            x, k_x, v_x, key_padding_mask=encoder_padding_mask
        )
        x = self.__layer_norm_2(x + cross_attended)

        # Position-wise feed-forward + residual + layer normalization
        transformed = self.__feed_forward_network(x)
        x = self.__layer_norm_3(x + transformed)

        return (x, k_x, v_x, padding_mask, encoder_padding_mask)
|
||||
|
||||
|
||||
# call .eval() on the module to disable dropout, etc.
|
||||
73
Project_Model/Libs/Transformer/Classes/Encoder.py
Normal file
73
Project_Model/Libs/Transformer/Classes/Encoder.py
Normal file
@ -0,0 +1,73 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
|
||||
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
|
||||
TorchMultiHeadAttention as MultiHeadAttention,
|
||||
)
|
||||
|
||||
|
||||
class Encoder(nn.Module):
    """One transformer encoder layer (post-norm).

    Subclassing ``nn.Module`` exposes the usual training primitives.

    Data flow: self-attention -> residual add -> layer norm ->
    feed-forward -> residual add -> layer norm.
    """

    def __init__(
        self,
        embedding_dimension: int,
        feed_forward_hidden_layer_dimension: int,
        number_of_attention_heads: int,
    ) -> None:
        """Build the attention block, the feed-forward block and both norms."""
        super().__init__()

        self.__attention = MultiHeadAttention(
            embedding_dimension, number_of_attention_heads, dropout=0.1
        )
        # Norm of the first "Add and Normalize" step.
        self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)

        self.__feed_forward = FeedForwardNetwork(
            embedding_dimension, feed_forward_hidden_layer_dimension
        )
        # Norm of the second "Add and Normalize" step.
        self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)

        # Registered but not applied in forward() (the calls are disabled).
        self.__dropout = nn.Dropout(0.1)

    def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
        """Run the layer on ``(x, padding_mask)`` and return the same pair.

        The tuple form is required so that layers can be chained inside
        ``torch.nn.Sequential``; ``padding_mask`` is passed through untouched.
        """
        hidden, padding_mask = args

        # Self-attention sub-layer: residual connection, then layer norm.
        attended = self.__attention(
            hidden, hidden, hidden, key_padding_mask=padding_mask
        )
        hidden = self.__layer_norm_1(hidden + attended)

        # Feed-forward sub-layer: residual connection, then layer norm.
        transformed = self.__feed_forward(hidden)
        hidden = self.__layer_norm_2(hidden + transformed)

        return (hidden, padding_mask)
|
||||
|
||||
|
||||
# call .eval() on the module to disable dropout, etc.
|
||||
43
Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py
Normal file
43
Project_Model/Libs/Transformer/Classes/FeedForwardNetwork.py
Normal file
@ -0,0 +1,43 @@
|
||||
# it is position wise!
|
||||
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
|
||||
|
||||
# Why do we need a fixed size
|
||||
# https://ai.stackexchange.com/questions/37624/why-do-transformers-have-a-fixed-input-length
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    The first linear layer expands to the hidden dimension and the second one
    projects back to the model dimension.
    """

    def __init__(self, embedding_size: int, feed_forward_hidden_layer_dimension: int):
        """Create both linear layers, the activation and the dropout."""
        super().__init__()
        hidden_size = feed_forward_hidden_layer_dimension

        # Expand into the higher dimension.
        self.__fully_connected_1 = nn.Linear(embedding_size, hidden_size)
        self.__relu = nn.ReLU()
        # Active while training; switched off by eval().
        self.__dropout = nn.Dropout(0.1)
        # Return to the model dimension.
        self.__fully_connected_2 = nn.Linear(hidden_size, embedding_size)

    def forward(self, x):
        """Apply the block to ``x`` and return a tensor of the same shape."""
        activated = self.__relu(self.__fully_connected_1(x))
        return self.__fully_connected_2(self.__dropout(activated))
|
||||
23
Project_Model/Libs/Transformer/Classes/NanoSocrates.py
Normal file
23
Project_Model/Libs/Transformer/Classes/NanoSocrates.py
Normal file
@ -0,0 +1,23 @@
|
||||
import torch
|
||||
from NanoSocratesCore import NanoSocratesCore
|
||||
|
||||
class NanoSocrates(torch.nn.Module):
    """Thin wrapper that owns a ``NanoSocratesCore`` instance."""

    def __init__(
        self,
        embedded_size: int,
        feed_forward_dim: int,
        encoder_layers: int,
        decoder_layers: int,
        attention_heads: int,
        vocab_size: int,
        sentence_length: int = 256,
    ) -> None:
        """Build the wrapped core model.

        Args:
            embedded_size: Model (embedding) dimension.
            feed_forward_dim: Hidden size of the feed-forward blocks; must be
                a positive multiple of ``embedded_size`` because the core
                takes a multiplier rather than an absolute size.
            encoder_layers: Number of encoder layers.
            decoder_layers: Number of decoder layers.
            attention_heads: Number of attention heads per layer.
            vocab_size: Vocabulary size.
            sentence_length: Fixed decoder sequence length required by the
                core. NOTE(review): the default is a placeholder added so the
                previous call signature keeps working; confirm the value.

        Raises:
            ValueError: If ``feed_forward_dim`` is not a positive multiple of
                ``embedded_size``.
        """
        super().__init__()

        if embedded_size <= 0 or feed_forward_dim <= 0:
            raise ValueError("embedded_size and feed_forward_dim must be positive")
        if feed_forward_dim % embedded_size != 0:
            raise ValueError("feed_forward_dim must be a multiple of embedded_size")

        # Pass everything by keyword: the core's positional order is
        # (sentence_length, vocab_size, embedding_size, ...), which does not
        # match this constructor's parameter order.
        self._model = NanoSocratesCore(
            sentence_length=sentence_length,
            vocab_size=vocab_size,
            embedding_size=embedded_size,
            feed_forward_multiplier=feed_forward_dim // embedded_size,
            num_encoder_layers=encoder_layers,
            num_decoder_layers=decoder_layers,
            num_attention_heads=attention_heads,
        )

    def forward(self, encoder_input, decoder_input, encoder_padding_mask):
        """Delegate to the wrapped core model and return its result."""
        return self._model(encoder_input, decoder_input, encoder_padding_mask)
|
||||
|
||||
109
Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
Normal file
109
Project_Model/Libs/Transformer/Classes/NanoSocratesCore.py
Normal file
@ -0,0 +1,109 @@
|
||||
from ..Utils.task_type import TaskType
|
||||
from .Decoder import Decoder
|
||||
from .Encoder import Encoder
|
||||
from ....Libs.Embedder import NanoSocratesEmbedder
|
||||
import torch
|
||||
|
||||
|
||||
class NanoSocratesCore(torch.nn.Module):
    """Encoder-decoder transformer with a greedy, inference-only forward pass."""

    def __init__(
        self,
        sentence_length: int,
        vocab_size: int,
        embedding_size: int = 256,
        feed_forward_multiplier: int = 4,
        num_encoder_layers: int = 2,
        num_decoder_layers: int = 2,
        num_attention_heads: int = 4,
        pad_token: int = 0,
    ) -> None:
        """Build embedders, encoder/decoder stacks and the output projection.

        Args:
            sentence_length: Length of every decoder row; decoding fills
                positions ``1 .. sentence_length - 1``.
            vocab_size: Vocabulary size (embedding rows and output scores).
            embedding_size: Model (embedding) dimension.
            feed_forward_multiplier: Feed-forward hidden size expressed as a
                multiple of ``embedding_size``.
            num_encoder_layers: Number of stacked encoder layers.
            num_decoder_layers: Number of stacked decoder layers.
            num_attention_heads: Attention heads per layer.
            pad_token: Token id treated as padding in the decoder input.
        """
        super().__init__()
        self.__pad_token = pad_token
        feed_forward_dim = embedding_size * feed_forward_multiplier

        self.__sentence_length = sentence_length

        # A fresh layer object per position, so layers do not share weights.
        self.__encoder_sequence = torch.nn.Sequential(
            *[
                Encoder(embedding_size, feed_forward_dim, num_attention_heads)
                for _ in range(num_encoder_layers)
            ]
        )

        self.__decoder_sequence = torch.nn.Sequential(
            *[
                Decoder(embedding_size, feed_forward_dim, num_attention_heads)
                for _ in range(num_decoder_layers)
            ]
        )

        self.__linear = torch.nn.Linear(embedding_size, vocab_size)

        self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
        self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)

    @torch.no_grad()  # inference only
    def forward(
        self,
        encoder_input: list[list[int]],
        decoder_input: list[list[int]],  # must start with <SOS> and PAD elsewhere
        encoder_padding_mask: list[list[bool]],  # True where encoder is PAD
    ):
        """Greedily decode one token per step until the rows are full.

        Args:
            encoder_input: Encoder token ids, one row per batch element.
            decoder_input: Decoder token ids; every row must have
                ``sentence_length`` entries. The rows are copied, not mutated.
            encoder_padding_mask: ``True`` where the encoder input is padding
                (nested lists or a tensor).

        Returns:
            A list with one tensor of token probabilities per decoding step
            (softmax over the vocabulary for the whole decoder sequence).
            The list is empty when ``sentence_length`` is 1 or less.
        """
        # 1) Embed the encoder input.
        # NOTE(review): assumes the embedder accepts nested lists of ids and
        # returns a tensor - confirm against NanoSocratesEmbedder.
        encoder_embedding = self.__input_embeder(encoder_input)
        device = encoder_embedding.device

        # The attention layers need tensor masks, not Python lists.
        encoder_mask = torch.as_tensor(
            encoder_padding_mask, dtype=torch.bool, device=device
        )

        # 2) Encode once; the result is reused at every decoding step.
        encoder_output, encoder_mask = self.__encoder_sequence(
            (encoder_embedding, encoder_mask)
        )
        del encoder_embedding

        # 3) Autoregressive (greedy) decoding.
        probabilities_history: list[torch.Tensor] = []
        decoder_token_list = [list(row) for row in decoder_input]

        # Step k reads position k and writes the prediction to position k + 1.
        for step_idx in range(self.__sentence_length - 1):

            # 3.1) Key padding mask from the current tokens (True where PAD).
            decoder_mask = torch.tensor(
                [
                    [token == self.__pad_token for token in row]
                    for row in decoder_token_list
                ],
                dtype=torch.bool,
                device=device,
            )

            # 3.2) Embed the full decoder sequence (the decoder layers build
            # the causal mask themselves).
            decoder_embedding = self.__output_embedder(decoder_token_list)

            # 3.3) Decode. Each decoder layer returns a 5-tuple; only the
            # first element (the hidden states) is needed here.
            decoder_output, *_ = self.__decoder_sequence(
                (
                    decoder_embedding,
                    encoder_output,
                    encoder_output,
                    decoder_mask,
                    encoder_mask,
                )
            )
            del decoder_embedding

            # 3.4) Project to the vocabulary and normalise.
            token_probabilities = torch.softmax(
                self.__linear(decoder_output), dim=-1
            )
            del decoder_output
            probabilities_history.append(token_probabilities)

            # 3.5) Greedy pick at the current position.
            next_tokens = token_probabilities[:, step_idx, :].argmax(dim=-1).tolist()

            # 3.6) Feed the prediction into the next (so far PAD) slot.
            for row, token in zip(decoder_token_list, next_tokens):
                row[step_idx + 1] = token

        return probabilities_history
|
||||
213
Project_Model/Libs/Transformer/Classes/SpannedMasker.py
Normal file
213
Project_Model/Libs/Transformer/Classes/SpannedMasker.py
Normal file
@ -0,0 +1,213 @@
|
||||
import math
|
||||
import random
|
||||
import sys
|
||||
|
||||
|
||||
class SpannedMasker:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
max_vocabulary: int,
|
||||
forbidden_tokens: set[int],
|
||||
change_token_probability: float = 0.15,
|
||||
average_span: int = 1,
|
||||
seed: int = random.randint(0, sys.maxsize),
|
||||
|
||||
) -> None:
|
||||
|
||||
if change_token_probability < 0 or change_token_probability > 1:
|
||||
raise ValueError("received a value that is not between 0 or 1")
|
||||
|
||||
self.__change_token_probability = change_token_probability
|
||||
self.__average_span = average_span
|
||||
self.__rng = random.Random(seed)
|
||||
self.__max_vocabulary = max_vocabulary
|
||||
self.__forbidden_tokens = forbidden_tokens
|
||||
|
||||
|
||||
def mask_sequence(
|
||||
self,
|
||||
token_sequence: list[int],
|
||||
) -> tuple[list[int], list[int]]:
|
||||
|
||||
MASK = self.__create_mask(token_sequence, self.__forbidden_tokens)
|
||||
MASKED = self.__create_masked_input(token_sequence, MASK, self.__max_vocabulary)
|
||||
TARGET = self.__create_target(token_sequence, MASK, self.__max_vocabulary)
|
||||
|
||||
return (MASKED, TARGET)
|
||||
|
||||
|
||||
|
||||
|
||||
def __number_of_spans(self, legal_token_number: int):
|
||||
EXPECTED_NUM_OF_CORRUPTED_TOKENS = self.__number_of_corrupted_tokens(legal_token_number)
|
||||
|
||||
return math.ceil(EXPECTED_NUM_OF_CORRUPTED_TOKENS / self.__average_span)
|
||||
|
||||
def __number_of_corrupted_tokens(self, legal_token_number: int):
|
||||
EXPECTED_NUM_OF_CORRUPTED_TOKENS = math.ceil(
|
||||
legal_token_number * self.__change_token_probability
|
||||
)
|
||||
|
||||
return EXPECTED_NUM_OF_CORRUPTED_TOKENS
|
||||
|
||||
def __create_mask(self, sequence: list[int], forbidden_tokens: set[int]) -> list[bool]:
|
||||
|
||||
SEQ_LEN = len(sequence)
|
||||
LEGAL_TOKENS = self.__count_legal_tokens(sequence, forbidden_tokens)
|
||||
NUM_OF_CORRUPTIONS = self.__number_of_corrupted_tokens(LEGAL_TOKENS)
|
||||
NUM_OF_SPANS = self.__number_of_spans(LEGAL_TOKENS)
|
||||
MASK = [False] * SEQ_LEN
|
||||
|
||||
mask_index = 0
|
||||
number_of_spans = 0
|
||||
exit_loop = False
|
||||
|
||||
while not exit_loop:
|
||||
|
||||
TOKEN = sequence[mask_index]
|
||||
MASKED = MASK[mask_index]
|
||||
SHOULD_MASK = self.__random_mask()
|
||||
skip = False
|
||||
|
||||
|
||||
if self.__is_illegal_token(TOKEN, forbidden_tokens):
|
||||
skip = True
|
||||
|
||||
if MASKED:
|
||||
skip = True
|
||||
|
||||
if not SHOULD_MASK:
|
||||
skip = True
|
||||
|
||||
if skip:
|
||||
mask_index = (mask_index + 1) % SEQ_LEN
|
||||
continue
|
||||
|
||||
|
||||
CANDIDATE_SPAN = self.__random_span(
|
||||
self.__average_span
|
||||
)
|
||||
|
||||
REMAINING_MASK = SEQ_LEN - (mask_index + 1)
|
||||
|
||||
SPAN_LENGTH = min(CANDIDATE_SPAN, REMAINING_MASK)
|
||||
|
||||
for _ in range(0, SPAN_LENGTH):
|
||||
INNER_TOKEN = sequence[mask_index]
|
||||
|
||||
if self.__is_illegal_token(INNER_TOKEN, forbidden_tokens):
|
||||
continue
|
||||
|
||||
MASK[mask_index] = True
|
||||
mask_index += 1
|
||||
|
||||
number_of_spans += 1
|
||||
mask_index += 1
|
||||
|
||||
if number_of_spans == NUM_OF_SPANS:
|
||||
exit_loop = True
|
||||
continue
|
||||
|
||||
if mask_index >= SEQ_LEN - 1:
|
||||
exit_loop = True
|
||||
continue
|
||||
|
||||
return MASK
|
||||
|
||||
def __create_masked_input(self, sequence: list[int], mask: list[bool], max_voc: int) -> list[int]:
|
||||
|
||||
OUT: list[int] = []
|
||||
mask_token_id = max_voc + 1
|
||||
index = 0
|
||||
while index < len(sequence):
|
||||
|
||||
TOKEN = sequence[index]
|
||||
MASKED = mask[index]
|
||||
|
||||
if not MASKED:
|
||||
OUT.append(
|
||||
TOKEN
|
||||
)
|
||||
index += 1
|
||||
continue
|
||||
|
||||
MASK_TOKEN = mask_token_id
|
||||
OUT.append(
|
||||
MASK_TOKEN
|
||||
)
|
||||
|
||||
while mask[index]:
|
||||
index += 1
|
||||
|
||||
mask_token_id += 1
|
||||
|
||||
return OUT
|
||||
|
||||
def __create_target(self, sequence: list[int], mask: list[bool], max_voc: int) -> list[int]:
|
||||
|
||||
OUT: list[int] = []
|
||||
mask_token_id = max_voc + 1
|
||||
index = 0
|
||||
while index < len(sequence):
|
||||
|
||||
TOKEN = sequence[index]
|
||||
MASKED = mask[index]
|
||||
|
||||
if MASKED:
|
||||
OUT.append(
|
||||
TOKEN
|
||||
)
|
||||
index += 1
|
||||
continue
|
||||
|
||||
MASK_TOKEN = mask_token_id
|
||||
OUT.append(
|
||||
MASK_TOKEN
|
||||
)
|
||||
|
||||
while index < len(mask) and not mask[index]:
|
||||
index += 1
|
||||
|
||||
mask_token_id += 1
|
||||
|
||||
|
||||
return OUT
|
||||
|
||||
def __is_illegal_token(self, token: int, illegal_voc: set[int]) -> bool:
|
||||
if token in illegal_voc:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
def __count_legal_tokens(self, sequence: list[int], illegal_voc: set[int]) -> int:
|
||||
legal_count = 0
|
||||
|
||||
for token in sequence:
|
||||
if self.__is_illegal_token(token, illegal_voc):
|
||||
continue
|
||||
legal_count += 1
|
||||
|
||||
return legal_count
|
||||
|
||||
def __random_mask(self) -> bool:
|
||||
|
||||
if self.__random_probability() > self.__change_token_probability:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def __random_probability(self) -> float:
|
||||
return self.__rng.random()
|
||||
|
||||
def __random_token(self, max_vocabulary: int) -> int:
|
||||
return self.__rng.randint(0, max_vocabulary)
|
||||
|
||||
def __random_int_range(self, min: int, max: int) -> int:
|
||||
return self.__rng.randint(min, max)
|
||||
|
||||
def __random_span(self, average: int) -> int:
|
||||
candidate_span = self.__rng.gauss(mu=average)
|
||||
candidate_span = max(1, candidate_span)
|
||||
candidate_span = round(candidate_span)
|
||||
return candidate_span
|
||||
77
Project_Model/Libs/Transformer/Classes/TokenMasker.py
Normal file
77
Project_Model/Libs/Transformer/Classes/TokenMasker.py
Normal file
@ -0,0 +1,77 @@
|
||||
import random
|
||||
import sys
|
||||
|
||||
|
||||
class TokenMasker:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
change_token_probability: float = 0.15,
|
||||
mask_token_probability: float = 0.8,
|
||||
random_token_prob: float = 0.1,
|
||||
seed: int = random.randint(0, sys.maxsize),
|
||||
) -> None:
|
||||
|
||||
if change_token_probability < 0 or change_token_probability > 1:
|
||||
raise ValueError("received a value that is not between 0 or 1")
|
||||
|
||||
if mask_token_probability < 0 or mask_token_probability > 1:
|
||||
raise ValueError("received a value that is not between 0 or 1")
|
||||
|
||||
if random_token_prob < 0 or random_token_prob > 1:
|
||||
raise ValueError("received a value that is not between 0 or 1")
|
||||
|
||||
if mask_token_probability + random_token_prob > 1:
|
||||
raise ValueError("The sum of probabilities is over 1")
|
||||
|
||||
self.__change_token_probability = change_token_probability
|
||||
self.__mask_token_probability = mask_token_probability
|
||||
self.__random_token_prob = random_token_prob
|
||||
self.__rng = random.Random(seed)
|
||||
|
||||
def mask_sequence(
|
||||
self, token_sequence: list[int], max_vocabulary: int, mask_id: int
|
||||
) -> list[int]:
|
||||
|
||||
if mask_id <= max_vocabulary:
|
||||
raise ValueError("mask_id is a value of vocabulary")
|
||||
|
||||
MASKED_SEQUENCE: list[int] = []
|
||||
|
||||
for token in token_sequence:
|
||||
|
||||
if token > max_vocabulary:
|
||||
MASKED_SEQUENCE.append(token)
|
||||
continue
|
||||
|
||||
MASKED_TOKEN = self.__mask(token, max_vocabulary, mask_id)
|
||||
MASKED_SEQUENCE.append(MASKED_TOKEN)
|
||||
|
||||
return MASKED_SEQUENCE
|
||||
|
||||
def __mask(self, token: int, max_vocabulary: int, mask_id: int) -> int:
|
||||
|
||||
if self.__random_probability() > self.__change_token_probability:
|
||||
return token
|
||||
|
||||
MASK_TOKEN_TRESH = self.__mask_token_probability
|
||||
RANDOM_TOKEN_TRESH = MASK_TOKEN_TRESH + self.__random_token_prob
|
||||
CHANCE_PROBABILITY = self.__random_probability()
|
||||
|
||||
# It's over both probabilities, return same token
|
||||
if CHANCE_PROBABILITY > RANDOM_TOKEN_TRESH:
|
||||
return token
|
||||
|
||||
# It's over masking treshold, but lower than random
|
||||
# return random token
|
||||
if CHANCE_PROBABILITY > MASK_TOKEN_TRESH:
|
||||
return self.__random_token(max_vocabulary)
|
||||
|
||||
# It's below masking treshold, mask token
|
||||
return mask_id
|
||||
|
||||
def __random_probability(self) -> float:
|
||||
return self.__rng.random()
|
||||
|
||||
def __random_token(self, max_vocabulary: int) -> int:
|
||||
return self.__rng.randint(0, max_vocabulary)
|
||||
@ -0,0 +1,47 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Optional
|
||||
|
||||
class TorchMultiHeadAttention(nn.Module):
    """Batch-first wrapper around ``torch.nn.MultiheadAttention``.

    ``forward`` returns only the attention output; the attention weights are
    not computed.
    """

    def __init__(
        self,
        embedding_dimension: int,
        number_of_attention_heads: int,
        dropout: float = 0.0,
    ):
        """Create the wrapped attention module with ``batch_first=True``."""
        super().__init__()
        self.attention = nn.MultiheadAttention(
            embed_dim=embedding_dimension,
            num_heads=number_of_attention_heads,
            dropout=dropout,
            batch_first=True,
        )

    def forward(
        self,
        x_q: torch.Tensor,
        x_k: torch.Tensor,
        x_v: torch.Tensor,
        key_padding_mask=None,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Attend from ``x_q`` over ``x_k`` / ``x_v`` and return the output.

        The wrapped module applies its own query/key/value projections.
        ``attention_mask`` is forwarded as ``attn_mask`` and
        ``key_padding_mask`` under the same name.
        """
        result = self.attention(
            query=x_q,
            key=x_k,
            value=x_v,
            key_padding_mask=key_padding_mask,
            need_weights=False,
            attn_mask=attention_mask,
        )
        # The wrapped module returns (output, weights); keep the output only.
        return result[0]
|
||||
|
||||
|
||||
# batch_first=False (historical default)
# Layout: (L, N, E)
# L = sequence length (time steps / positions)
# N = batch size
# E = model dimension d_model (embedding size)
# batch_first=True
# Layout: (N, L, E) (more natural for many models)
|
||||
16
Project_Model/Libs/Transformer/Classes/__init__.py
Normal file
16
Project_Model/Libs/Transformer/Classes/__init__.py
Normal file
@ -0,0 +1,16 @@
|
||||
from .Decoder import Decoder
|
||||
from .Encoder import Encoder
|
||||
from .FeedForwardNetwork import FeedForwardNetwork
|
||||
# from .MultiHeadAttention import MultiheadAttention
|
||||
from .TorchMultiHeadAttention import TorchMultiHeadAttention
|
||||
from .SpannedMasker import SpannedMasker
|
||||
from .DeToken import DeToken
|
||||
|
||||
__all__ = [
|
||||
"Decoder",
|
||||
"Encoder",
|
||||
"FeedForwardNetwork",
|
||||
"TorchMultiHeadAttention",
|
||||
"SpannedMasker",
|
||||
"DeToken"
|
||||
]
|
||||
0
Project_Model/Libs/Transformer/Enums/__init__.py
Normal file
0
Project_Model/Libs/Transformer/Enums/__init__.py
Normal file
72
Project_Model/Libs/Transformer/Models/TrainingModel.py
Normal file
72
Project_Model/Libs/Transformer/Models/TrainingModel.py
Normal file
@ -0,0 +1,72 @@
|
||||
import torch
|
||||
import Project_Model.Libs.Embedder as Embedder
|
||||
from ..Classes import Encoder, Decoder, DeToken
|
||||
|
||||
|
||||
class TrainingModel(torch.nn.Module):
    """Encoder-decoder model that scores the next token of a decoder prefix."""

    def __init__(
        self,
        vocabulary_size: int,
        latent_space: int = 256,
        feed_forward_multiplier: int = 4,
        attention_heads: int = 4,
        layer_number: int = 2,
    ) -> None:
        """Build embedders, both layer stacks and the output projection.

        The feed-forward hidden size is ``latent_space * feed_forward_multiplier``
        and ``layer_number`` applies to the encoder and the decoder alike.
        """
        super().__init__()

        hidden_size = latent_space * feed_forward_multiplier

        self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
            vocabulary_size, latent_space
        )
        self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
            vocabulary_size, latent_space
        )

        # One fresh object per layer, so layers never share weights.
        encoder_stack = []
        for _ in range(layer_number):
            encoder_stack.append(Encoder(latent_space, hidden_size, attention_heads))

        decoder_stack = []
        for _ in range(layer_number):
            decoder_stack.append(Decoder(latent_space, hidden_size, attention_heads))

        self.__encoder = torch.nn.Sequential(*encoder_stack)
        self.__decoder = torch.nn.Sequential(*decoder_stack)

        self.__detokener = DeToken(latent_space, vocabulary_size)

    def forward(
        self,
        args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
    ):
        """Return the scores for the last decoder position only.

        ``args`` is ``(encoder_tokens, encoder_padding_mask, decoder_prefix,
        decoder_padding_mask)``; both masks are ``True`` at padding positions.
        """
        encoder_tokens, encoder_padding_mask, decoder_prefix, decoder_padding_mask = args

        # 1) Embeddings
        source = self.__encoder_embedder(encoder_tokens)
        prefix = self.__decoder_embedder(decoder_prefix)

        # 2) Encode
        memory, _ = self.__encoder((source, encoder_padding_mask))

        # 3) Decode (the causal mask is built inside the decoder layers)
        decoder_inputs = (
            prefix,
            memory,
            memory,
            decoder_padding_mask,
            encoder_padding_mask,
        )
        decoded, _, _, _, _ = self.__decoder(decoder_inputs)

        # 4) Project the last time step only, then drop the length-1 axis.
        last_step_scores = self.__detokener(decoded[:, -1:, :])
        return last_step_scores[:, -1, :]
|
||||
5
Project_Model/Libs/Transformer/Models/__init__.py
Normal file
5
Project_Model/Libs/Transformer/Models/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from .TrainingModel import TrainingModel
|
||||
|
||||
__all__ = [
|
||||
"TrainingModel"
|
||||
]
|
||||
17
Project_Model/Libs/Transformer/Utils/__init__.py
Normal file
17
Project_Model/Libs/Transformer/Utils/__init__.py
Normal file
@ -0,0 +1,17 @@
|
||||
from .attention_mask import get_causal_attention_mask,get_causal_attention_mask_batched
|
||||
from .task_type import TaskType
|
||||
from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask
|
||||
from .inference_masking import inference_masking
|
||||
from .truncate_rdf_list import truncate_rdf_list
|
||||
|
||||
__all__ = [
|
||||
"TaskType",
|
||||
"get_causal_attention_mask",
|
||||
"get_causal_attention_mask_batched",
|
||||
"truncate_sequence",
|
||||
"pad_sequence",
|
||||
"create_padding_mask",
|
||||
"normalize_sequence",
|
||||
"inference_masking",
|
||||
"truncate_rdf_list"
|
||||
]
|
||||
11
Project_Model/Libs/Transformer/Utils/attention_mask.py
Normal file
11
Project_Model/Libs/Transformer/Utils/attention_mask.py
Normal file
@ -0,0 +1,11 @@
|
||||
import torch
|
||||
|
||||
def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
|
||||
return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
|
||||
|
||||
|
||||
# there is no need for this since MultiHeadAttention of torch does this internally
|
||||
def get_causal_attention_mask_batched(seq_len: int, batch_size: int) -> torch.Tensor:
    """Return the causal mask repeated along a new leading batch axis.

    The result has shape ``(batch_size, seq_len, seq_len)``; every slice
    along the first axis is the same 2-D causal mask (an expanded view, not
    a copy).
    """
    causal = get_causal_attention_mask(seq_len)
    # Prepend a length-1 axis, then broadcast it to batch_size.
    return causal[None, :, :].expand(batch_size, -1, -1)
|
||||
13
Project_Model/Libs/Transformer/Utils/inference_masking.py
Normal file
13
Project_Model/Libs/Transformer/Utils/inference_masking.py
Normal file
@ -0,0 +1,13 @@
|
||||
def inference_masking(sequence: list[int], mask_token: int, max_vocabulary: int) -> list[int]:
    """Replace every ``mask_token`` with a distinct sentinel id, in place.

    Sentinels start at ``max_vocabulary + 1`` and increase by one per
    occurrence, left to right. The list is modified in place and returned.
    """
    next_sentinel = max_vocabulary + 1

    for position, token in enumerate(sequence):
        if token == mask_token:
            sequence[position] = next_sentinel
            next_sentinel += 1

    return sequence
|
||||
56
Project_Model/Libs/Transformer/Utils/post_tokenization.py
Normal file
56
Project_Model/Libs/Transformer/Utils/post_tokenization.py
Normal file
@ -0,0 +1,56 @@
|
||||
def truncate_sequence(
    sequence: list[int], truncate_at: int, end_token: int
) -> list[int]:
    """Return a copy of ``sequence`` ending with ``end_token``.

    The result is at most ``truncate_at`` tokens long. When there is room,
    ``end_token`` is appended; otherwise the sequence is cut to
    ``truncate_at - 1`` tokens and ``end_token`` takes the last slot.
    The input list is never modified.

    Args:
        sequence: Token ids.
        truncate_at: Maximum length of the result, end token included.
        end_token: Token id that terminates the result.

    Raises:
        ValueError: If ``truncate_at`` is smaller than 1 (no room for the
            end token).
    """
    if truncate_at < 1:
        raise ValueError("truncate_at must be at least 1")

    # Room left for the end token: keep every token. This includes the
    # length == truncate_at - 1 case, which used to lose its last token.
    if len(sequence) < truncate_at:
        return [*sequence, end_token]

    return [*sequence[: truncate_at - 1], end_token]
|
||||
|
||||
|
||||
def pad_sequence(sequence: list[int], pad_until: int, pad_token: int) -> list[int]:
    """Right-pad ``sequence`` with ``pad_token`` up to ``pad_until`` items.

    A sequence that is already long enough is returned as is (the same
    object); otherwise a new, padded list is returned.
    """
    missing = pad_until - len(sequence)

    if missing <= 0:
        return sequence

    return sequence + [pad_token] * missing
|
||||
|
||||
def create_padding_mask(sequence: list[int], pad_token: int) -> list[bool]:
    """Return a mask that is ``True`` exactly where ``sequence`` holds ``pad_token``."""
    return [token == pad_token for token in sequence]
|
||||
|
||||
|
||||
def normalize_sequence(
    sequence: list[int],
    max_length: int,
    pad_token: int,
    end_token: int,
) -> tuple[list[int], list[bool]]:
    """Truncate, pad and mask a sequence in one call.

    The sequence is first passed to ``truncate_sequence``, then padded to
    ``max_length`` with ``pad_sequence``. The returned pair is the resulting
    sequence and its padding mask from ``create_padding_mask``.
    """
    fitted = pad_sequence(
        truncate_sequence(sequence, max_length, end_token),
        max_length,
        pad_token,
    )

    return (fitted, create_padding_mask(fitted, pad_token))
|
||||
6
Project_Model/Libs/Transformer/Utils/task_type.py
Normal file
6
Project_Model/Libs/Transformer/Utils/task_type.py
Normal file
@ -0,0 +1,6 @@
|
||||
from enum import Enum, auto
|
||||
|
||||
class TaskType(Enum):
    """Tasks handled by the model.

    The values are written out explicitly; they are the ones ``auto()``
    assigns (1, 2, 3 in declaration order).
    """

    RDF2TEXT = 1
    MASK = 2
    COMPLETATION = 3
|
||||
65
Project_Model/Libs/Transformer/Utils/truncate_rdf_list.py
Normal file
65
Project_Model/Libs/Transformer/Utils/truncate_rdf_list.py
Normal file
@ -0,0 +1,65 @@
|
||||
from collections import deque
|
||||
import random
|
||||
import sys
|
||||
|
||||
|
||||
def truncate_rdf_list(
|
||||
sequence: list[int],
|
||||
truncation_probability: float,
|
||||
continue_triple_token: int,
|
||||
end_of_triple_token: int,
|
||||
seed: int = random.randint(0, sys.maxsize),
|
||||
) -> tuple[list[int], list[int]]:
|
||||
|
||||
if truncation_probability < 0 or truncation_probability > 1:
|
||||
raise ValueError("A probability must be between 0 and 1")
|
||||
|
||||
RNG = random.Random(seed)
|
||||
|
||||
END_OF_TRIPLES: deque[int] = deque()
|
||||
|
||||
for i in range(0, len(sequence)):
|
||||
|
||||
TOKEN = sequence[i]
|
||||
if TOKEN != end_of_triple_token:
|
||||
continue
|
||||
|
||||
END_OF_TRIPLES.append(i + 1)
|
||||
|
||||
TRIPLES_TOKENS: list[int] = []
|
||||
TARGET_TRIPLES: list[int] = []
|
||||
|
||||
start_of_triple = 0
|
||||
exit_loop = False
|
||||
|
||||
while not exit_loop:
|
||||
|
||||
EOT = END_OF_TRIPLES.popleft()
|
||||
|
||||
TRIPLE = sequence[start_of_triple:EOT]
|
||||
TRIPLES_TOKENS.extend(TRIPLE)
|
||||
|
||||
start_of_triple = EOT
|
||||
|
||||
if RNG.random() < truncation_probability:
|
||||
exit_loop = True
|
||||
|
||||
if len(END_OF_TRIPLES) == 1:
|
||||
exit_loop = True
|
||||
|
||||
TRIPLES_TOKENS.append(
|
||||
continue_triple_token
|
||||
)
|
||||
|
||||
while len(END_OF_TRIPLES) > 0:
|
||||
|
||||
EOT = END_OF_TRIPLES.popleft()
|
||||
|
||||
TRIPLE = sequence[start_of_triple:EOT]
|
||||
TARGET_TRIPLES.extend(TRIPLE)
|
||||
|
||||
start_of_triple = EOT
|
||||
|
||||
|
||||
return (TRIPLES_TOKENS, TARGET_TRIPLES)
|
||||
|
||||
7
Project_Model/Libs/Transformer/__init__.py
Normal file
7
Project_Model/Libs/Transformer/__init__.py
Normal file
@ -0,0 +1,7 @@
|
||||
from .Classes import *
|
||||
from .Utils import *
|
||||
from .Models import *
|
||||
|
||||
from . import Classes
|
||||
from . import Utils
|
||||
from . import Models
|
||||
4
Project_Model/Libs/__init__.py
Normal file
4
Project_Model/Libs/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
from . import BPE
|
||||
from . import Embedder
|
||||
from . import Transformer
|
||||
from . import TorchShims
|
||||
74
Project_Model/Tests/bpe_test.py
Normal file
74
Project_Model/Tests/bpe_test.py
Normal file
@ -0,0 +1,74 @@
|
||||
from Project_Model.Libs.BPE.Enums import TokenType
|
||||
import Project_Model.Libs.BPE as BPE
|
||||
|
||||
import re
|
||||
|
||||
|
||||
class TestBPE:
    """Encoding and decoding checks for ``BPE.NanoSocratesBPE``.

    All cases use the same three merge rules:
    ab -> 256, (256, 256) -> 257, (257, 257) -> 258.
    """

    @staticmethod
    def _merges():
        """Return a fresh copy of the merge table used by every test."""
        return {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}

    @staticmethod
    def _assert_same_items(actual, expected):
        """Assert equal length and pairwise-equal items."""
        assert len(actual) == len(expected)

        for got, wanted in zip(actual, expected):
            assert got == wanted

    def test_bpe_encoding_simple(self):
        """"abababab" collapses into the single token 258."""
        bpe = BPE.NanoSocratesBPE(self._merges())

        self._assert_same_items(bpe.encode("abababab"), [258])

    def test_bpe_decoding_simple(self):
        """Token 258 expands back to "abababab"."""
        bpe = BPE.NanoSocratesBPE(self._merges())

        self._assert_same_items(bpe.decode([258]), "abababab")

    def test_bpe_decoding_edge_1(self):
        """A merged token followed by a plain byte decodes to both parts."""
        bpe = BPE.NanoSocratesBPE(self._merges())

        self._assert_same_items(bpe.decode([258, ord("c")]), "ababababc")
|
||||
|
||||
# Useful to debug weird cases
|
||||
if __name__ == "__main__":
    # Run one case directly (no pytest) so it can be stepped through in a
    # debugger; swap which call is commented out to pick the case.
    suite = TestBPE()
    # suite.test_bpe_decoding_simple()
    suite.test_bpe_encoding_simple()
|
||||
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
@ -0,0 +1,77 @@
|
||||
from pathlib import Path
|
||||
from Project_Model.Libs.BPE.Enums import TokenType
|
||||
import Project_Model.Libs.BPE as BPE
|
||||
|
||||
import re
|
||||
|
||||
CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")
|
||||
|
||||
class TestTrainBPE:
    """Training checks for ``BPE.NanoSocraTrainerPool``.

    Both tests train from fixture files under
    ``Project_Model/Tests/trainer_files`` and pass the module-level
    ``CACHE_DIR_PATH`` as the trainer's cache location.
    """

    def test_bpe_train_encoding_simple(self):
        """Train on the simple fixture and encode "abababab" to one token.

        The expected id of 258 corresponds to three successive merges:
            ab       -> 256
            256, 256 -> 257
            257, 257 -> 258
        """
        # NOTE(review): the first argument is presumably the maximum
        # vocabulary size and the second the special tokens - confirm
        # against NanoSocraTrainerPool's signature.
        trainer = BPE.NanoSocraTrainerPool(
            int(32E3),
            ["<SOT>", "<EOT>"]
        )

        text = "abababab"
        # NOTE(review): this fixture presumably holds the same "ab" pattern
        # as `text`, otherwise the merges above would not be learnt - confirm.
        text_path = Path("Project_Model/Tests/trainer_files/train_simple.txt")
        expected_ids = [258]

        encoder = trainer.trainBPE(
            text_path,
            CACHE_DIR_PATH
        )

        encoded_ids = encoder.encode(text)

        assert len(encoded_ids) == len(expected_ids)

        for encoded, expected in zip(encoded_ids, expected_ids):
            assert encoded == expected

    def test_bpe_train_encoding_and_decoding(self):
        """Train on a fixture, then check encode followed by decode is lossless.

        The vocabulary learnt by the trainer is handed to
        ``BPE.TokeNanoCore`` together with the special-token list; decoding
        the encoded fixture text must reproduce it exactly.
        """
        special_tokens = ["<ABS>", "<SOTL>"]
        trainer = BPE.NanoSocraTrainerPool(
            int(32E3),
            special_tokens
        )

        text_path = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")

        # The context manager closes the handle even if read() raises.
        # Encoding is left at the platform default, as before.
        with open(text_path) as fixture:
            text = fixture.read()

        expected_text = text

        encoder = trainer.trainBPE(
            text_path,
            CACHE_DIR_PATH
        )
        vocabulary = encoder.vocabulary
        tokenizer = BPE.TokeNanoCore(vocabulary, special_tokens)

        encoded_ids = tokenizer.encode(text)
        decoded_text = tokenizer.decode(encoded_ids)

        assert len(decoded_text) == len(expected_text)

        for decoded, expected in zip(decoded_text, expected_text):
            assert decoded == expected
|
||||
|
||||
# Useful to debug weird cases
|
||||
if __name__ == "__main__":
    # Run one case directly (no pytest) so it can be stepped through in a
    # debugger; swap which call is commented out to pick the case.
    suite = TestTrainBPE()
    # suite.test_bpe_train_encoding_simple()
    suite.test_bpe_train_encoding_and_decoding()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user