Compare commits

...

259 Commits

Author SHA1 Message Date
GassiGiuseppe
1de2cc59db doctor and model test 2025-10-08 22:51:36 +02:00
GassiGiuseppe
b805dc538e learning sheduler as torch one 2025-10-08 16:05:22 +02:00
Christian Risi
c2e13bc9c6 Quick fix to architecture 2025-10-08 12:34:09 +02:00
GassiGiuseppe
14c3914571 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-08 12:14:05 +02:00
Christian Risi
b9273b95e2 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-08 12:13:46 +02:00
GassiGiuseppe
c263e2cf13 Custom learning rate sheduler from Attention is all you need 2025-10-08 12:13:02 +02:00
GassiGiuseppe
c9a50d50b7 typo in Batcher 2025-10-08 11:39:08 +02:00
GassiGiuseppe
9b0c57c238 Batcher ended, attention it returns list of tokenId, which later needs to be embedded 2025-10-08 11:26:47 +02:00
Christian Risi
24ea4d3ba4 Added a training model for NanoSocrates 2025-10-08 11:18:05 +02:00
GassiGiuseppe
e353c200d7 updated special token 2025-10-08 11:02:18 +02:00
GassiGiuseppe
159266a603 WIP Batcher added class to fourth task 2025-10-08 00:39:16 +02:00
GassiGiuseppe
7027414342 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-07 23:16:20 +02:00
GassiGiuseppe
fc44929a7b moved spanned mask variables in init for better reliability, also tested 2025-10-07 23:15:50 +02:00
Christian Risi
0560bc439a Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-07 20:45:10 +02:00
Christian Risi
8adacdb08c Added new playgrounds 2025-10-07 20:45:04 +02:00
Christian Risi
533347ee22 Added new special token 2025-10-07 20:44:54 +02:00
Christian Risi
d1ff88da82 Added small dataset 2025-10-07 20:44:40 +02:00
Christian Risi
3f465991f0 Added toy dataset 2025-10-07 20:44:11 +02:00
GassiGiuseppe
96cbf4eabb wip Batcher 2025-10-07 20:09:51 +02:00
GassiGiuseppe
f801afe0e4 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-07 17:42:21 +02:00
GassiGiuseppe
b4ee8362a2 WIP training Batching 2025-10-07 17:41:53 +02:00
Christian Risi
3021a51961 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-07 16:38:12 +02:00
Christian Risi
99b5198c9a WIP 2025-10-07 16:38:08 +02:00
Christian Risi
b97282179d Fixed a bug about sequence normalizations 2025-10-07 16:37:43 +02:00
Christian Risi
fdece42462 Made model Batch ready 2025-10-07 16:37:20 +02:00
Christian Risi
109ad9f36b Changed Imports 2025-10-07 16:36:59 +02:00
Christian Risi
fef933da9d Added <PAD> and moved <END> Token 2025-10-07 16:36:45 +02:00
Christian Risi
c65f5e66fe Uploaded all playgrounds 2025-10-07 16:36:26 +02:00
Christian Risi
f9545aca1d Deleted MultiHeadAttention 2025-10-07 16:36:11 +02:00
GassiGiuseppe
490edcfd53 WIP Batcher 2025-10-07 15:36:51 +02:00
Christian Risi
9b5bb6d5f8 Added support for batches 2025-10-07 12:15:03 +02:00
GassiGiuseppe
14b810c451 WIP NanoSocratesEmbedder for batching 2025-10-06 21:41:45 +02:00
GassiGiuseppe
56d438f01a WIP NanoSocratesCore 2025-10-06 18:21:27 +02:00
GassiGiuseppe
745424a978 new special token for start sequence in decoder 2025-10-06 18:21:10 +02:00
GassiGiuseppe
e1549d4458 Modified decoder and decoder for sequential architecture 2025-10-06 18:20:46 +02:00
Christian Risi
456ce724fe Added capability of returning target after truncating 2025-10-06 17:43:01 +02:00
Christian Risi
44307cd917 Added util to create padding mask 2025-10-06 17:29:05 +02:00
Christian Risi
ffdb312d58 Added a util to create truncated RDF lists 2025-10-06 17:22:13 +02:00
Christian Risi
0007c38212 Added a util to make masked inference 2025-10-06 17:02:06 +02:00
Christian Risi
9c1043e0ba Added post tokenization utils 2025-10-06 17:01:18 +02:00
Christian Risi
ee8e56798c Added new utils 2025-10-06 17:00:55 +02:00
Christian Risi
1797571bb2 Added test to see if illegal tokens were included in target 2025-10-06 16:17:12 +02:00
Christian Risi
e93710af08 Fixed illegal tokens being added in target output 2025-10-06 16:16:47 +02:00
Christian Risi
d3bba9b944 Added actual test 2025-10-06 16:06:17 +02:00
Christian Risi
b1e7af0607 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-06 15:55:44 +02:00
Christian Risi
d3b1f7da91 Added testing for spanned masking 2025-10-06 15:55:40 +02:00
Christian Risi
c217f5dec9 Added 2 types of masking 2025-10-06 15:45:45 +02:00
Christian Risi
49f0beb6ea Updated imports 2025-10-06 15:45:28 +02:00
GassiGiuseppe
05bb460999 file to test batch attention mask 2025-10-06 13:03:20 +02:00
GassiGiuseppe
948c3fd7ac update to batch attention mask 2025-10-06 13:03:03 +02:00
GassiGiuseppe
87409fecd5 added method fot batched attention_mask 2025-10-06 12:00:11 +02:00
GassiGiuseppe
7e40a36701 wip: NanoSocratesCore 2025-10-05 22:58:06 +02:00
GassiGiuseppe
d48815cca2 added task_type and updated init 2025-10-05 18:58:42 +02:00
GassiGiuseppe
0f243eaac2 added padding_mask entry to decoder and encoder 2025-10-05 18:46:06 +02:00
GassiGiuseppe
9c83d9fa71 Merge branch 'dev.embedder' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.embedder 2025-10-05 18:45:33 +02:00
Christian Risi
a693cbb77e A set of utils for our pipeline 2025-10-05 18:37:43 +02:00
GassiGiuseppe
6f219f634f Added attention_mask 2025-10-05 17:49:01 +02:00
GassiGiuseppe
b303affd18 updated uml of the model 2025-10-05 16:40:19 +02:00
Christian Risi
53c4decac7 Added playgrounds for the architecture 2025-10-05 16:30:23 +02:00
Christian Risi
c60da8ba82 Refactoring 2025-10-05 15:40:29 +02:00
Christian Risi
3b5e6c099c Merge branch 'dev' into dev.embedder 2025-10-05 11:17:09 +02:00
Christian Risi
ba3a718480 Merge branch 'dev.etl' into dev 2025-10-05 11:16:54 +02:00
GassiGiuseppe
69fba7c3e9 new utility to generate a csv debug file of the output of the pipeline 2025-10-04 21:33:09 +02:00
GassiGiuseppe
76200d936d added first classes (Encoder, Decoder, Attention) for the model 2025-10-04 21:07:58 +02:00
Christian Risi
9b656e7918 Added a playground to test the embedding phase 2025-10-04 19:43:42 +02:00
Christian Risi
9a797a0485 Added embedder code for "Attention is all you need" 2025-10-04 19:43:25 +02:00
Christian Risi
3b274ad807 Added a way to take the default special token list 2025-10-04 19:43:02 +02:00
Christian Risi
8f5e2f2f0d Modifications 2025-10-04 19:42:45 +02:00
Christian Risi
da0bdf703b Added a way to see vocabulary size 2025-10-04 19:42:29 +02:00
Christian Risi
03cdca1f00 Modified imports for BPE 2025-10-04 19:42:02 +02:00
Christian Risi
7188c8678a Added imports for Embedder 2025-10-04 19:41:48 +02:00
Christian Risi
1eef25a697 Merge branch 'dev' into dev.embedder 2025-10-04 19:04:03 +02:00
Christian Risi
e9165fb146 Merge branch 'dev.bpe' into dev 2025-10-04 19:03:09 +02:00
GassiGiuseppe
bbadd4c521 update cleaning pipeline with a new method to filter also by number of films,
also updated the signature of the pipeline
2025-10-04 19:00:05 +02:00
GassiGiuseppe
c2f9344c82 little test file 2025-10-04 18:58:20 +02:00
GassiGiuseppe
25f3a5d221 Logic to test BPE 2025-10-04 18:58:04 +02:00
Christian Risi
e8ff82c40a Updated with tasks architectures 2025-10-04 10:57:12 +02:00
Christian Risi
23d1eaf99e Fixed a rare bug over training multiple times 2025-10-04 10:47:39 +02:00
Christian Risi
25a6ad1254 Added model high level architecture 2025-10-03 23:37:16 +02:00
Christian Risi
460d4f5188 Renamed directory to Playgrounds 2025-10-03 22:59:43 +02:00
Christian Risi
c6ac6df2c2 Added stubs for other libraries 2025-10-03 20:28:23 +02:00
Christian Risi
15baba54ab Sanity check to autodetect Device 2025-10-03 20:16:01 +02:00
Christian Risi
87f24878f4 Added shims for utils on using Pytorch 2025-10-03 20:11:14 +02:00
Christian Risi
999141f886 Merge branch 'dev' into dev.embedder 2025-10-03 18:08:34 +02:00
Christian Risi
8e095ebb7a Added papers stub 2025-10-03 18:02:27 +02:00
Christian Risi
149deb407d added cache directories 2025-10-03 18:01:05 +02:00
Christian Risi
8a21cb1b73 added python analysis 2025-10-03 18:00:52 +02:00
Christian Risi
d2a3dfe90f Fixed bug 2025-10-03 17:59:46 +02:00
GassiGiuseppe
0f95aeb122 toy dictionary for bpe implemeted 2025-10-03 16:26:01 +02:00
Christian Risi
0ee6e48004 Fixed the same bug as before, but this time is correct 2025-10-03 16:09:53 +02:00
Christian Risi
55e0d2ac23 Fixed a encoding bug 2025-10-03 16:08:11 +02:00
Christian Risi
9c5f42153f fixed typos 2025-10-03 15:17:44 +02:00
Christian Risi
c74689d01d Fixed tests to reflect new version of tokenizer 2025-10-03 13:27:38 +02:00
Christian Risi
51f491d033 fixed typos 2025-10-03 13:27:17 +02:00
Christian Risi
c5c0c61f79 Fix of bugs and semantics 2025-10-03 13:26:58 +02:00
Christian Risi
6b9cb7cd35 Modified imports 2025-10-03 13:26:42 +02:00
Christian Risi
e8894504c6 Fixed a bug where a token (int) was yielded instead of a list of int 2025-10-03 11:44:44 +02:00
GassiGiuseppe
845d645348 added some stubs on special_regex_maker 2025-10-03 10:38:35 +02:00
GassiGiuseppe
09f7b39512 test files updated 2025-10-03 01:04:47 +02:00
GassiGiuseppe
070dc1b744 implemented token nano for the BPE encoding/decoding 2025-10-03 01:04:06 +02:00
GassiGiuseppe
8121c75a09 Updated NanoSocratesSplitter to split also token in decode phase 2025-10-03 01:00:36 +02:00
GassiGiuseppe
a5b8692a77 Updated NanoSocratesSpecial to work with TokeNano 2025-10-03 00:59:15 +02:00
GassiGiuseppe
7c935d2700 Update NanoSocratesBPE: corrected a minor bug about dictionary lenght,
added some comment to make the code more clear
2025-10-03 00:57:19 +02:00
Christian Risi
a1d143187d corrected test to reflect changes in BPE trainer 2025-10-02 20:11:43 +02:00
GassiGiuseppe
0eef2148a9 in NanoSocratesBPE: encode() method rewritten and tested 2025-10-02 12:12:44 +02:00
Christian Risi
856bd8909c Added treshold 2025-10-02 11:02:03 +02:00
Christian Risi
2e595a3a23 Changed training phase to take directly data instead of its encode 2025-10-02 09:56:44 +02:00
Christian Risi
2194cc7b4f Changed test to use pool trainer 2025-10-02 09:56:05 +02:00
Christian Risi
1eae8582b2 Fixed decoding phase 2025-10-02 09:33:58 +02:00
Christian Risi
eadba1fb82 Corrected test to reflect changes in NanoSocratesBPE 2025-10-02 09:33:47 +02:00
Christian Risi
aa765b4555 Added time checking 2025-10-02 08:48:45 +02:00
Christian Risi
17d82f0a4e Added support to resume workload 2025-10-02 08:48:28 +02:00
Christian Risi
0975c19e69 added nwew method to encode from list of tokens 2025-10-02 08:48:13 +02:00
Christian Risi
3fe4e45ceb Fixed a bug while joining frequencies 2025-10-02 01:50:37 +02:00
Christian Risi
d19426fa62 added multithreaded training to package 2025-10-02 01:31:05 +02:00
Christian Risi
63baf29805 Added multithreaded training 2025-10-02 01:30:24 +02:00
Christian Risi
b80b4e4112 Fixed returning type hints 2025-10-02 01:29:57 +02:00
Christian Risi
7cfaf601b4 Refactored to remove tokens that can't be compressed anymore 2025-10-01 19:42:22 +02:00
Christian Risi
fbbe6226bb Finished uploading stubs for TokeNano 2025-10-01 18:56:53 +02:00
Christian Risi
b3d444979f Added flag to resume work correctly 2025-10-01 12:22:09 +02:00
Christian Risi
66bcf6e55f Added a way to recover iteration work 2025-10-01 12:21:42 +02:00
Christian Risi
dbf1d99408 Added json utils to save and load json files 2025-10-01 12:20:59 +02:00
Christian Risi
97bac464f3 Fixed JSON incompatibility 2025-10-01 00:32:43 +02:00
Christian Risi
9a8e726d74 Added cdebug configuration 2025-10-01 00:22:22 +02:00
Christian Risi
7ab9b0358e Added script to run BPE 2025-09-30 23:59:09 +02:00
Christian Risi
30c2938d29 Fixed typing 2025-09-30 23:58:54 +02:00
Christian Risi
76f24d4eb0 Renamed file 2025-09-30 23:58:43 +02:00
Christian Risi
89a0a1f4bb Fixed bug for utf-8 conversion 2025-09-30 23:58:31 +02:00
GassiGiuseppe
64e355e80c Added regex to delete new lines and * from ObjectURI 2025-09-30 15:00:07 +02:00
GassiGiuseppe
397e29742a minor update of settings 2025-09-30 13:58:20 +02:00
Christian Risi
ccacea18d8 Created files to test BPE training 2025-09-30 13:33:54 +02:00
Christian Risi
b09bd4acba Created trainer to train BPE 2025-09-30 13:33:40 +02:00
Christian Risi
c9032cab09 Added fit method 2025-09-30 13:33:28 +02:00
Christian Risi
7020c9e683 Added utils to make regexps and iterators that check for last element 2025-09-30 13:33:12 +02:00
Christian Risi
2fe1ce9e9a Updated Inits 2025-09-30 13:32:37 +02:00
Christian Risi
18fc2ba9d8 Added Exceptions 2025-09-30 13:32:24 +02:00
Christian Risi
5acee1d1a5 Merge branch 'dev' into dev.bpe 2025-09-30 11:35:27 +02:00
2e36753da4 Merge pull request 'dev.etl' (#5) from dev.etl into dev
Reviewed-on: #5
2025-09-30 11:28:57 +02:00
GassiGiuseppe
007f1e9554 minor updates 2025-09-29 18:53:33 +02:00
GassiGiuseppe
c319398ca0 little update to UML pipeline 2025-09-29 17:03:31 +02:00
GassiGiuseppe
255d8a072d First implementation of the cleaning pipeline UML 2025-09-29 16:59:52 +02:00
GassiGiuseppe
8167c9d435 Added Toy Dataset entry point into the Pipeline class
Before it was forced into the sql_endpoint,
now all the pipeline can be managed in the Pipeline class
2025-09-29 16:03:49 +02:00
GassiGiuseppe
bd72ad3571 Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00
GassiGiuseppe
6ddb7de9da Added sqlAlchemy to requirements 2025-09-29 15:19:19 +02:00
Christian Risi
564b0d712e Modified UML diagram 2025-09-28 18:05:03 +02:00
Christian Risi
e433941405 Added BPE
TODO:
- complete the fit method
2025-09-28 18:04:44 +02:00
Christian Risi
b46df4f91a Added Special Encoder 2025-09-28 18:03:47 +02:00
Christian Risi
d179e01971 Added Splitter to divide tokens from text 2025-09-28 18:03:16 +02:00
Christian Risi
b071145f6e Added Chunker 2025-09-28 18:02:06 +02:00
Christian Risi
ed0255e99b Updated imports 2025-09-28 18:01:35 +02:00
Christian Risi
3e8b5c5579 Added test for chunker 2025-09-26 18:50:32 +02:00
Christian Risi
8db35732f9 Added Chunker to restrict our domains 2025-09-26 18:50:23 +02:00
Christian Risi
9552d61f8d Added Excetption for when we don't find a delimiter 2025-09-26 18:49:56 +02:00
Christian Risi
be8a87ce01 Modified the architecture for BPE 2025-09-26 18:49:29 +02:00
Christian Risi
5801a819e9 Added vars to make it easier to work here 2025-09-26 18:49:06 +02:00
Christian Risi
3f48b5c428 Added text files to test a chunker 2025-09-26 18:48:44 +02:00
Christian Risi
9972ab8a51 Added imports 2025-09-26 18:48:23 +02:00
GassiGiuseppe
650b37c586 Added vscode setting to execute jupyternotebook from root dir 2025-09-26 11:24:34 +02:00
Christian Risi
90012285b5 UML Diagram to explain bpe workflows 2025-09-25 20:18:21 +02:00
Christian Risi
1bbb4a0999 Added new paper 2025-09-25 20:17:48 +02:00
GassiGiuseppe
e521b0704e deleted TODO in path_splitter_tree, as it was already resolved 2025-09-25 19:19:11 +02:00
Christian Risi
ee0aa583d5 Added Docs for BPE research 2025-09-25 19:10:45 +02:00
Christian Risi
0a698e9837 Added schema to extract from DB for BPE 2025-09-25 19:09:52 +02:00
GassiGiuseppe
9440a562f2 Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl 2025-09-25 18:33:51 +02:00
Christian Risi
5eda131aac Fixed creation query to be unique even with movieID in RDFs 2025-09-25 17:58:09 +02:00
GassiGiuseppe
57884eaf2e CSV support added to path_splitter_tree
Also resolved a minor bug to print also leaf nodes
2025-09-25 17:57:46 +02:00
Christian Risi
4548a683c2 Fixed DB 2025-09-25 17:57:45 +02:00
GassiGiuseppe
3eec49ffa5 WIP: added test file: clean_relationship.jupyter
to create a first cleaning pipeline
2025-09-25 16:28:24 +02:00
Christian Risi
0bc7f4b227 Fixed Typos 2025-09-25 12:37:52 +02:00
Christian Risi
f28952b0a2 Added todo 2025-09-25 12:00:26 +02:00
Christian Risi
0b626a8e09 Modified query to take all data 2025-09-25 11:53:12 +02:00
Christian Risi
b254098532 Added views to count for subjects and objects 2025-09-25 11:40:44 +02:00
Christian Risi
ee88ffe4cf Added View to filter over relationship counts 2025-09-25 11:32:03 +02:00
Christian Risi
70b4bd8645 Added Complex query 2025-09-25 11:31:34 +02:00
Christian Risi
6316d2bfc4 Added queries to take data from SQL for dataset 2025-09-25 11:27:19 +02:00
Christian Risi
87ca748f45 Updated DB to reflect new changes 2025-09-24 19:29:57 +02:00
Christian Risi
4315d70109 Merged abbreviation_datawarehouse into datawarehouse 2025-09-24 19:29:43 +02:00
Christian Risi
9a5d633b5e Fixed Typos 2025-09-24 19:29:07 +02:00
Christian Risi
a6760cd52d Updated SQL Queries to support parsing in DB 2025-09-24 19:28:55 +02:00
GassiGiuseppe
a7eb92227d Moved all db queries file in their own folder 2025-09-24 16:44:55 +02:00
GassiGiuseppe
9f221e31cd Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl 2025-09-24 16:32:52 +02:00
GassiGiuseppe
47197194d5 WIP abbrevietion_datawarehouse to creat an abbreviation system 2025-09-24 16:32:09 +02:00
Christian Risi
0cdbf6f624 Added query to retrieve a dirty dataset from SQLite DB 2025-09-24 16:15:47 +02:00
Christian Risi
3e30489f86 Updated Queries for DB 2025-09-24 14:44:53 +02:00
Christian Risi
8a22e453e4 Fixed csv 2025-09-24 14:44:25 +02:00
Christian Risi
7feb4eb857 Fixed URI generation 2025-09-24 14:44:07 +02:00
Christian Risi
70af19d356 Removed unused imports and added trailing slashes 2025-09-24 14:04:48 +02:00
Christian Risi
a4b44ab2ee Fixed Typos 2025-09-24 14:04:27 +02:00
Christian Risi
74b6b609dd Fixed typos 2025-09-24 13:59:19 +02:00
Christian Risi
59796c37cb Added script to take dbpedia uris 2025-09-24 13:49:29 +02:00
Christian Risi
f696f5950b Added uri-abbreviations 2025-09-24 13:48:53 +02:00
Christian Risi
605b496da7 Added barebone UML diagram for a Cleaning Pipeline 2025-09-23 19:49:01 +02:00
Christian Risi
7d693964dd Added new directories to tree structure 2025-09-23 19:47:56 +02:00
Christian Risi
25f401b577 Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00
Christian Risi
14c5ade230 Added CLI functionalities 2025-09-23 17:57:38 +02:00
4c9c51f902 Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
GassiGiuseppe
63c1a4a160 added little snippet to rebuild db from db_creation.sql 2025-09-22 17:52:23 +02:00
GassiGiuseppe
51114af853 DataRetrivial deleted since it does the same thing as datawarehouse.py 2025-09-22 17:51:35 +02:00
GassiGiuseppe
3a6dca0681 Infos about Dataset contruction from csv moved
from python file to markdown
2025-09-22 17:39:44 +02:00
GassiGiuseppe
346098d2b7 Added query.sql , file with the query used to populate the Dataset 2025-09-22 17:21:32 +02:00
GassiGiuseppe
64f9b41378 Built datawarehouse.py which populate the dataset 2025-09-22 17:17:22 +02:00
GassiGiuseppe
ac1ed42c49 Folder DataCleaning renamed to DatasetMerging since it doesn't clean nothing
and instead Build the dataset
2025-09-22 17:11:49 +02:00
GassiGiuseppe
edd01a2c83 Dataset updated, the new one is built with the new method
( 50 new rows found ... upon 13 milion )
2025-09-22 16:57:06 +02:00
GassiGiuseppe
5aa9e3fcf3 Added in DBPEDIA the query to get Film \ wiki page ID
plus some editing
2025-09-22 15:42:57 +02:00
GassiGiuseppe
0970cabf92 reverse.csv grammar correction of the header
it seemed to have missplaced the header also in the middle of the csv
2025-09-22 13:47:20 +02:00
GassiGiuseppe
a26d92750f Update movie-pageid.csv : grammar correction of the header 2025-09-22 12:59:35 +02:00
GassiGiuseppe
34c4782232 Dataset.db update. it seems to be correct 2025-09-20 23:33:56 +02:00
GassiGiuseppe
c5439533e6 DataRetrivial update, without df 2025-09-20 23:32:08 +02:00
GassiGiuseppe
8819b8e87f DataRetrivial populate the db from csv 2025-09-20 19:56:24 +02:00
Christian Risi
1076dc8aa6 Run /Scripts/DataCleaning/SQL_Queries/db_creation.sql 2025-09-20 16:39:16 +02:00
Christian Risi
3d15e03b09 Renamed file to fix spelling 2025-09-20 16:38:38 +02:00
Christian Risi
0ee2ec6fcd Spelling corrections 2025-09-20 16:37:57 +02:00
Christian Risi
95cfa5486c Added instructions to create databse schema 2025-09-20 16:30:08 +02:00
GassiGiuseppe
0d30e90ee0 Created file for the db DatawareHouse
Also decided firsts schema models into DBMerger
2025-09-20 15:53:32 +02:00
GassiGiuseppe
faaba17a98 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-20 14:34:25 +02:00
Christian Risi
854e5f1d98 Updated file to gather data from wikipedia 2025-09-20 14:32:30 +02:00
GassiGiuseppe
242d7f674f wikipedia summary file uploaded
Dataset composed of PageId and wikipedia Summary
2025-09-20 14:32:25 +02:00
Christian Risi
de8c2afceb Added reconciliation 2025-09-19 22:22:09 +02:00
Christian Risi
f89dffff75 Created script to gather wikipedia abstracts 2025-09-19 19:01:38 +02:00
GassiGiuseppe
e39bad8348 Added Troubleshooting section to README
where are corrected some potential issue with git and big files
2025-09-19 13:39:56 +02:00
GassiGiuseppe
7a1a221017 update of the database of movie-pageid
which has subject has film uri and object wikipage id
2025-09-19 13:37:56 +02:00
Christian Risi
fafe6ae0f9 Modified tree structure with more TMP directories 2025-09-19 12:46:31 +02:00
Christian Risi
e32444df75 Updated fetchdata to be used in terminal
Changes:
  - now you can use it as if it were a cli command

Missing:
  - documentation
2025-09-19 12:35:15 +02:00
Christian Risi
b74b7ac4f0 Added new directories to make experiments and updated .gitignore
Changes:
  - Added /Scripts/Experiments/Queries to keep track
      of important queries, once set
  - Added /Scripts/Experiments/Tmp to run quick experiments
      when still unsure while explorating datasets
2025-09-19 08:43:54 +02:00
Christian Risi
22134391d9 Added Scripts/Experiment directory
This directory is to place files to make experiments
2025-09-19 08:41:46 +02:00
Christian Risi
82c9023849 Ignoring Scripts/Experiments files and always tracking .gitkeep files 2025-09-19 08:39:47 +02:00
Christian Risi
00b87e01ea Moved fetchdata.py to reflect working tree
old - ${Proj}/Scripts/fetchdata.py
new - ${Proj}/Scripts/DataGathering/fetchdata.py
2025-09-19 08:37:04 +02:00
Christian Risi
ce3d4bf6c5 Renamed dir from Script to Scripts 2025-09-19 08:31:00 +02:00
GassiGiuseppe
c415b175a0 added reverse.csv with the reletion incoming to films 2025-09-18 20:26:51 +02:00
GassiGiuseppe
ec81ea7930 Added file to gather wikipedia abstract from url 2025-09-18 20:26:11 +02:00
GassiGiuseppe
4bb03f86b3 Added file to study the most frequent relationship into a csv triplet 2025-09-18 20:25:25 +02:00
GassiGiuseppe
e5f201f3db DEVELOPMENT file makrdown created 2025-09-18 20:24:54 +02:00
GassiGiuseppe
1c715dc569 Typo correction in the markdown 2025-09-18 20:24:11 +02:00
GassiGiuseppe
6686b47328 Added SQL to obtain wikipedia url with movies 2025-09-18 20:23:10 +02:00
GassiGiuseppe
9a5a7d84fd Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-18 19:20:26 +02:00
GassiGiuseppe
9678ece9c0 Requirements changed
added Pandas and some other
2025-09-18 19:07:38 +02:00
Christian Risi
67bcd732b5 Updated movies 2025-09-18 18:36:52 +02:00
Christian Risi
1a4f900500 Updated git attributes 2025-09-18 18:36:42 +02:00
Christian Risi
ca8729b67c Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-18 18:36:30 +02:00
GassiGiuseppe
9dbffc52ed Added dataset of movies and their wikipedia's page link 2025-09-18 18:16:51 +02:00
Christian Risi
b7f504942a Created Dataset 2025-09-18 17:24:08 +02:00
Christian Risi
7f0c5ce8d3 Updated File for fetching 2025-09-18 17:23:56 +02:00
Christian Risi
9838e287a4 Updated file 2025-09-18 12:03:09 +02:00
Christian Risi
ca6143ea3c Updated Query histories 2025-09-18 11:46:32 +02:00
Christian Risi
16e7ab4d9f Modified Datasets 2025-09-17 17:30:51 +02:00
Christian Risi
28723ab662 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-17 17:06:16 +02:00
Christian Risi
3e59efcf33 Generated datasets 2025-09-17 17:06:14 +02:00
Christian Risi
7c04309cc1 Added script to fetch data from DBPedia 2025-09-17 17:05:27 +02:00
Christian Risi
db87295890 Added history of queries 2025-09-17 17:04:58 +02:00
GassiGiuseppe
61568200a8 README update with setup chapter
where are scripted the command to manage conda and pip
2025-09-17 16:50:50 +02:00
Christian Risi
8df2736b97 Added environments 2025-09-17 16:16:58 +02:00
Christian Risi
eb5b7f629a Conda env 2025-09-17 15:53:17 +02:00
Christian Risi
79232b391e First SparQL query 2025-09-17 14:26:37 +02:00
Christian Risi
72eb937b47 Fixed Markdown violations 2025-09-17 12:51:14 +02:00
Christian Risi
cececa14ce Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-17 12:48:34 +02:00
Christian Risi
2487d44abd Added SparQL 2025-09-17 12:48:33 +02:00
GassiGiuseppe
553b86cac2 Resources file updated with Byte-Pair Encoding
a technique we will use to tokenize the engress' words
2025-09-17 12:06:01 +02:00
Christian Risi
12bd781fd3 Added workspace recommendations 2025-09-17 11:38:23 +02:00
Christian Risi
463f4907b8 Added Resources documentation 2025-09-17 11:36:02 +02:00
157 changed files with 17796 additions and 2 deletions

1
.gitattributes vendored
View File

@ -1,2 +1,3 @@
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
Assets/** filter=lfs diff=lfs merge=lfs -text
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text

7
.gitignore vendored
View File

@ -189,7 +189,8 @@ ipython_config.py
.LSOverride
# Icon must end with two \r
Icon
Icon
# Thumbnails
._*
@ -251,3 +252,7 @@ $RECYCLE.BIN/
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# ---> Custom
**/Tmp/**
**/cache/**
!**/.gitkeep

14
.vscode/extensions.json vendored Normal file
View File

@ -0,0 +1,14 @@
{
"recommendations": [
"bierner.github-markdown-preview",
"bierner.markdown-checkbox",
"bierner.markdown-emoji",
"bierner.markdown-footnotes",
"bierner.markdown-mermaid",
"bierner.markdown-preview-github-styles",
"bierner.markdown-yaml-preamble",
"davidanson.vscode-markdownlint",
"kejun.markdown-alert",
"yzhang.markdown-all-in-one"
]
}

16
.vscode/launch.json vendored Normal file
View File

@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": "${command:pickArgs}"
}
]
}

55
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,55 @@
{
// Always treat the project root as the working dir for Jupyter
"jupyter.notebookFileRoot": "${workspaceFolder}",
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
"python.terminal.executeInFileDir": false,
// Start new integrated terminals at the project root
"terminal.integrated.cwd": "${workspaceFolder}",
// Make pytest run from the root without needing a pytest.ini
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}",
"python.testing.pytestArgs": [
"src/test"
],
// Help Pylance resolve imports like `from src...` without red squiggles
"python.analysis.extraPaths": [
"${workspaceFolder}"
],
// For linux
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
// For OSX
"terminal.integrated.env.osx": {
"PYTHONPATH": "${workspaceFolder}"
},
// For Windows
"terminal.integrated.env.windows": {
"PYTHONPATH": "${workspaceFolder}"
},
"python.analysis.typeCheckingMode": "standard"
}
// {
// // Always treat the project root as the working dir for Jupyter
// "jupyter.notebookFileRoot": "${workspaceFolder}",
//
// // When you click "Run Python File in Terminal", DON'T cd into the file's folder
// "python.terminal.executeInFileDir": false,
//
// // Start new integrated terminals at the project root
// "terminal.integrated.cwd": "${workspaceFolder}",
//
// // Ensure Python can import from the project root no matter which file you run
// // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
// "terminal.integrated.env.windows": {
// "PYTHONPATH": "${workspaceFolder}"
// },
//
// // Make pytest run from the root without needing a pytest.ini
// "python.testing.pytestEnabled": true,
// "python.testing.cwd": "${workspaceFolder}",
// "python.testing.pytestArgs": ["src/test"],
//
// // Help Pylance resolve imports like `from src...` without red squiggles
// "python.analysis.extraPaths": ["${workspaceFolder}"]
// }

BIN
Assets/Dataset/1-hop/curated/corpus.txt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
Assets/Dataset/1-hop/dataset.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:331d8ef4e99c5200f1323e7149bd8aade39dc17ee5778b553bb32c593ff601cf
3 size 2443211793

BIN
Assets/Dataset/1-hop/movie-pageid.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:88e387ed1338bdfd34ded22f3f8bebb2be5127857bf36fcffc266b35c534587c
3 size 10148507

BIN
Assets/Dataset/1-hop/movies.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8d81c8801ea79bd46747769a288cd0c507b3b94b2fb4bbb9605e282776ca5efb
3 size 8808636

BIN
Assets/Dataset/1-hop/reverse.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:b4878aed66c382e73982b19fa02129d5b3c3e3e8690c28e4dd662257e1d9b119
3 size 32343972

BIN
Assets/Dataset/1-hop/small/corpus.txt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
Assets/Dataset/1-hop/small/rdf_completation.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:80da574017b251c9f07ecbce837d9d36a9ee8183a2a3bdbe0a2e31e22226ab79
3 size 12773126

BIN
Assets/Dataset/1-hop/small/rdf_text.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:41b30ab739a01482036c40b6560adfe751c5905ae80aafef6ee0f1a716849c68
3 size 13222824

BIN
Assets/Dataset/1-hop/toy/corpus.txt (Stored with Git LFS) Normal file

Binary file not shown.

BIN
Assets/Dataset/1-hop/toy/rdf_completation.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:39012a1e59eaa740d01515aa6b9744267dbb3ae13941b28558060795a94d90e0
3 size 86122

BIN
Assets/Dataset/1-hop/toy/rdf_mask.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8f31602eba47f7daff3b13bb243abaf429ff5900a8d26ae854ba790fda47d287
3 size 517642

BIN
Assets/Dataset/1-hop/toy/rdf_text.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:1189e04d3ba9d9138a4e216200313f5842b8a49de1745bb553ba2e3abf18d818
3 size 102533

BIN
Assets/Dataset/1-hop/uri-abbreviations.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:c1fcb1ad61a69145145c45c639ab42b36ffc63caa0ef9832eb81491197883ff4
3 size 8086

BIN
Assets/Dataset/1-hop/wikipedia-movie.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:1730dc111c0290b16d094a4b6a6577d966978d97ee9ef4202e86148cc9d8e8e8
3 size 17445736

BIN
Assets/Dataset/1-hop/wikipedia-summary.csv (Stored with Git LFS) Normal file

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:ef7b680257f16b193a9b4ea2914564b58c676955809e6b9d58058adaab7855c1
3 size 73089553

BIN
Assets/Dataset/DatawareHouse/dataset.db (Stored with Git LFS) Normal file

Binary file not shown.

View File

BIN
Assets/Model/toy_10/README.md (Stored with Git LFS) Normal file

Binary file not shown.

BIN
Assets/Model/toy_10/toy_dictionary.json (Stored with Git LFS) Normal file

Binary file not shown.

196
Playgrounds/doctor.ipynb Normal file

File diff suppressed because one or more lines are too long

125
Playgrounds/doctor.py Normal file
View File

@ -0,0 +1,125 @@
import random
import torch
import pandas as pd
from pathlib import Path
import Project_Model.Libs.Embedder as Embedder
import Project_Model.Libs.BPE as BPE
import Project_Model.Libs.Transformer as Transformer
import Project_Model.Libs.TorchShims as torch_shims
from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr
from Project_Model.Libs.Training.logistic_collector import LogitsCollector # import the external collector
# set a fixed seed
torch.manual_seed(0)
random.seed(0)
DEVICE = torch_shims.get_default_device()
torch.set_default_device(DEVICE)
# BPE Init
VOCABULARY_PATH = Path("Assets/Model/toy_10/toy_dictionary.json")
SPECIAL_VOC = BPE.default_special_tokens()
VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)
# Constants
TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1
EMBEDDED_SIZE = 256
FEED_FORWARD_MULTIPLIER = 4
ATTENTION_HEADS = 4
SENTENCE_LENGTH = 256
NUMBER_OF_BLOCKS = 2
MAX_EPOCHS = int(1e3)
PAD_TOKEN = TOKENANO.encode("<PAD>")[0]
END_TOKEN = TOKENANO.encode("<END>")[0]
# Load CSV
TOY_DATASET_PATH = Path("Assets/Dataset/1-hop/toy/rdf_text.csv")
TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)
TOY_BATCH_INPUT_LIST: list[list[int]] = []
TOY_BATCH_PADDING_LIST: list[list[bool]] = []
TOY_BATCH_TARGET_LIST: list[list[int]] = []
TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []
for index, row in TOY_DATASET.iterrows():
RDFs: str = row["RDFs"]
Abstract: str = row["Abstract"]
input_tokens = TOKENANO.encode(RDFs) # encoder input ids
output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)
decoder_default_tokens = TOKENANO.encode("<SOS>") # decoder input starts with <SOS>
input_tokens, padding = Transformer.normalize_sequence(
input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
) # pad/trim + end token
output_tokens, _ = Transformer.normalize_sequence(
output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN
) # pad/trim + end token
decoder_default_tokens = Transformer.pad_sequence(
decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN
) # pad with PAD up to SENTENCE_LENGTH
TOY_BATCH_INPUT_LIST.append(input_tokens)
TOY_BATCH_PADDING_LIST.append(padding)
TOY_BATCH_TARGET_LIST.append(output_tokens)
TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)
# Training loop
LOSS_HISTORY = []
NANOSOCRATES = Transformer.TrainingModel(
TOKEN_SPACE_SIZE,
EMBEDDED_SIZE,
FEED_FORWARD_MULTIPLIER,
ATTENTION_HEADS,
NUMBER_OF_BLOCKS,
)
collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes
NANOSOCRATES.train()
cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)
optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())
scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step
current_epoch = 0
BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize
while current_epoch < MAX_EPOCHS:
# simple fixed mini-batch from the top; later you can shuffle/slice
enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids
pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present
tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)
# decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step
dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]
total_loss = 0.0
collector.reset() # start fresh for this epoch
T = tgt.size(1) # sequence length
for t in range(T):
optimizer.zero_grad(set_to_none=True) # clear grads for this token step
prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix
dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix
# one-step logits given prefix (trainer model expects 4 args now)
logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t
collector.add(logits_t) # store logits for decoding later
loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored
loss_t.backward() # backprop for this step
optimizer.step() # update params
scheduler.step() # Noam/warmup: step per optimizer step
total_loss = float(loss_t.detach()) # keep last step loss for logging
# teacher forcing: reveal the correct token for next position
if t < T - 1:
dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot
current_epoch += 1
print(f"EPOCH {current_epoch}\n\tLoss: {total_loss:.6f}") # simple log
collector.print_decoded() # print decoded predictions for the batch

182
Playgrounds/embedder.ipynb Normal file

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,308 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "7a311d4b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7712]]\n",
"3\n",
"Embedder Tensor: torch.Size([3, 16, 256])\n",
"Values:\n",
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
"\n",
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]],\n",
"\n",
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [ 1.4284, -0.4654, 0.1394, ..., 1.6520, 0.6728, 1.3851]]],\n",
" grad_fn=<AddBackward0>)\n",
"ENCODER Tensor: torch.Size([3, 1, 256])\n",
"Values:\n",
"tensor([[[ 8.0069e-01, 4.0532e-01, -1.8316e+00, -1.3902e+00, -1.1784e+00,\n",
" 1.3667e+00, -9.7890e-01, 6.0696e-01, -1.4899e+00, 5.5765e-01,\n",
" 4.5991e-02, 5.1214e-01, 3.1901e-01, 4.7577e-01, -2.9585e-01,\n",
" -1.0811e+00, -1.5281e+00, -6.3773e-01, -9.5954e-01, 1.8497e+00,\n",
" -1.1789e+00, -9.7387e-01, 1.1931e-01, -7.2703e-01, 5.3108e-01,\n",
" -6.4877e-01, -4.5188e-01, 1.5185e+00, -8.3408e-01, 3.2824e-01,\n",
" -1.8166e+00, 1.9548e+00, -5.2419e-01, -1.0693e+00, -1.8510e+00,\n",
" 1.5440e+00, -3.2370e-01, -1.3990e+00, -4.6940e-01, 6.5840e-02,\n",
" -9.2057e-01, 1.2513e+00, -5.9168e-01, 7.8198e-01, -1.3121e+00,\n",
" 1.1492e+00, -2.3695e-01, -1.8935e+00, 1.1639e+00, -5.8169e-01,\n",
" 2.5051e-01, -8.1654e-01, -1.0328e+00, 1.4285e+00, -8.1485e-01,\n",
" 1.0614e+00, -3.3834e-01, -4.1667e-02, -1.1920e-01, 3.1383e-01,\n",
" -5.9857e-01, 1.7327e-01, -1.6854e+00, -1.5174e+00, -2.6508e-01,\n",
" -6.0082e-01, 5.1468e-01, 2.7909e-01, -2.5296e-01, -1.4670e+00,\n",
" -1.3587e+00, -8.8864e-02, 3.2825e-01, 1.0950e+00, -1.0371e+00,\n",
" 1.1744e+00, 5.2984e-01, 4.1751e-01, -9.8803e-01, 3.5631e-01,\n",
" 4.7484e-01, 2.2435e-01, 1.4022e+00, 1.2242e+00, 1.1447e+00,\n",
" -5.4052e-01, -9.1786e-01, -1.2299e+00, 1.1656e+00, 9.1570e-01,\n",
" 1.8956e+00, 7.4344e-01, 4.2187e-01, -9.5426e-02, -3.2428e-01,\n",
" 9.6364e-01, -2.3252e-01, 2.9036e-01, -2.4432e+00, 9.8019e-01,\n",
" -4.6697e-02, 8.3910e-01, -4.3541e-01, -7.1915e-01, -7.5638e-01,\n",
" 9.0217e-01, 2.0919e+00, -7.9533e-01, -1.5413e-01, -6.9260e-01,\n",
" -1.3086e+00, 7.8925e-01, 1.8855e-01, 7.4043e-01, -3.8834e-01,\n",
" 1.0272e-02, 1.0763e+00, 4.2142e-01, 6.6520e-01, 4.5996e-01,\n",
" -8.5060e-01, -9.0101e-01, -4.2090e-01, 2.5596e-01, -1.4946e+00,\n",
" 1.0925e-01, -7.5359e-01, -3.0447e-01, 1.0679e+00, 1.9398e+00,\n",
" 8.1472e-01, 1.3498e+00, 1.1107e+00, 6.3288e-01, 3.1149e-01,\n",
" -1.9333e+00, -1.5274e+00, 2.1794e-01, -3.1895e-02, 1.0756e+00,\n",
" 1.0215e+00, 1.6938e+00, -1.0939e+00, 2.2690e+00, -7.0921e-01,\n",
" 6.4212e-01, -6.5468e-01, 1.6839e+00, 5.7296e-01, -1.4031e+00,\n",
" 3.9133e-01, -5.3541e-01, 4.3439e-01, -1.6785e+00, 5.2030e-03,\n",
" 4.5155e-01, -7.0953e-01, -1.9656e-01, -3.8671e-02, -1.0927e+00,\n",
" -3.0405e-01, -1.3818e-02, -3.7748e-01, 1.4412e+00, -1.4254e-01,\n",
" 7.9939e-01, -8.5402e-01, -1.0330e+00, 1.7661e+00, -3.6084e-01,\n",
" 1.5622e+00, 1.0240e+00, 1.9056e-01, -4.1480e-01, 6.9056e-01,\n",
" 1.7204e+00, -9.9218e-01, -1.6504e-01, -1.1807e+00, 1.0827e+00,\n",
" 1.5973e+00, 1.4849e-01, -2.2867e+00, 7.7322e-01, -6.8401e-01,\n",
" -6.0493e-01, 1.0616e+00, -1.8034e-01, -1.8828e+00, 1.1031e-01,\n",
" 2.5452e-01, -4.2489e-02, 8.1171e-01, 1.3429e+00, -6.5058e-01,\n",
" -1.3531e+00, -1.2263e+00, 1.1226e+00, 1.2407e+00, -9.7453e-01,\n",
" 9.4696e-01, 6.6186e-01, -5.0804e-01, 1.2647e-01, -1.1777e+00,\n",
" 6.8443e-02, -1.3043e-01, 2.9595e-01, -1.5330e+00, -6.5733e-01,\n",
" 1.1291e+00, 6.9629e-01, 4.4690e-01, 8.0151e-01, -1.2406e+00,\n",
" 2.6085e+00, -2.0310e-01, -1.0226e+00, -6.9182e-02, 7.6600e-01,\n",
" -9.9842e-01, 2.0896e+00, 2.6334e-01, -1.1559e-01, -6.6876e-01,\n",
" -6.6295e-01, -1.6461e-01, 2.8270e+00, 3.2727e-01, 1.3724e+00,\n",
" -1.0749e+00, 3.7782e-01, -1.5472e+00, 3.0822e-01, 5.7273e-02,\n",
" 3.9136e-01, 8.2948e-01, 2.1438e-01, -9.8623e-01, 5.6053e-01,\n",
" -1.5617e+00, -3.9595e-01, 1.0451e-02, -1.1860e+00, -1.4994e-01,\n",
" 1.6566e+00, 2.0369e+00, -4.3995e-01, -4.4262e-01, -3.1014e-01,\n",
" 5.9083e-01, -1.0765e+00, -5.2906e-01, 4.6039e-02, -1.0154e+00,\n",
" 5.9942e-01]],\n",
"\n",
" [[ 1.2683e+00, -4.3200e-01, -1.3333e+00, -3.6705e-01, -5.8895e-01,\n",
" 9.9266e-01, -4.2914e-01, 9.2765e-01, -1.0935e+00, 1.4975e+00,\n",
" -5.3739e-01, -2.8332e-01, 9.1166e-01, 1.5010e+00, -2.1787e-01,\n",
" -1.4258e+00, -2.7524e-01, -1.2602e+00, 2.0117e-01, 2.3906e+00,\n",
" -9.6397e-01, -7.5872e-01, 3.3948e-01, -7.9353e-01, 9.1668e-01,\n",
" 8.7734e-04, -3.0271e-01, 1.7087e+00, -1.0273e+00, 1.5174e+00,\n",
" -2.6405e-02, 1.4236e+00, -9.9093e-01, 5.4787e-01, -1.0904e+00,\n",
" 5.2156e-01, -6.3470e-01, -7.7688e-01, -1.2538e+00, -3.9307e-01,\n",
" -7.6707e-01, 1.3733e+00, -7.2709e-01, 1.1185e+00, -1.5860e+00,\n",
" -2.6148e-01, -3.7984e-01, -1.3604e+00, 9.2864e-02, -7.9642e-01,\n",
" 1.0956e+00, 3.1202e-01, -4.1234e-01, 3.6488e-02, -1.4639e+00,\n",
" 1.0947e+00, -7.9230e-01, 4.6913e-01, -2.3407e-01, 4.1768e-02,\n",
" -1.5921e+00, 6.9743e-01, -7.0222e-01, -5.4705e-01, -6.5663e-01,\n",
" -4.1810e-01, 2.7744e-01, 7.9178e-01, 7.5886e-01, -7.6302e-01,\n",
" -1.2204e+00, -1.1103e+00, -1.3646e-01, 1.9589e+00, -1.3637e+00,\n",
" 9.0804e-01, 2.3094e-01, -5.5953e-02, -6.7626e-01, 1.4242e+00,\n",
" 1.0167e+00, 1.0705e+00, 2.2947e+00, 9.1274e-01, 1.2281e+00,\n",
" -7.0638e-01, -1.2249e+00, -8.9208e-02, 1.1016e+00, 1.1940e+00,\n",
" 3.5834e-01, 1.2961e+00, -4.6674e-01, 3.4572e-01, -4.3458e-01,\n",
" 1.1008e+00, 3.7783e-01, -6.5841e-01, -2.3127e+00, 1.4617e+00,\n",
" -1.2826e-01, 1.3463e-01, -8.5268e-01, -8.4144e-01, -1.8594e+00,\n",
" 1.9260e-01, 1.6432e+00, -2.0640e-02, -5.0030e-01, -1.5334e-01,\n",
" -6.1072e-01, -1.3694e-01, -3.7308e-01, 1.6603e+00, 1.1246e-01,\n",
" 6.0823e-02, 7.8749e-01, -1.7002e-01, 1.2058e+00, 8.5615e-01,\n",
" 1.2525e-01, -1.0584e+00, -4.7931e-01, 1.4088e-01, -1.8149e+00,\n",
" 1.4654e+00, -1.0936e+00, 5.3182e-01, 9.5694e-01, 3.2472e+00,\n",
" 3.4877e-01, 1.8491e+00, -1.5184e-01, 1.4711e+00, -7.6064e-01,\n",
" -2.2144e+00, -1.8952e+00, -4.9502e-01, -6.6836e-01, 1.4946e+00,\n",
" 6.7616e-01, 1.1501e+00, -9.4747e-01, 1.1009e+00, -1.4211e+00,\n",
" 3.9528e-01, -9.5220e-01, 1.4886e+00, 7.1784e-01, -1.9941e+00,\n",
" 6.7901e-02, -1.3109e-01, 1.1695e+00, 1.2861e-01, -2.8123e-01,\n",
" -6.1611e-01, 1.5513e-01, -3.9289e-01, -4.5543e-02, -2.8628e-01,\n",
" 2.6118e-01, 2.2623e-01, -6.3705e-01, 7.3591e-01, -7.8799e-01,\n",
" 2.5053e-01, -1.5923e-01, -4.9584e-01, 1.9009e+00, -2.3263e-01,\n",
" 1.2213e+00, 1.0313e+00, 2.0177e-02, -6.2209e-01, -3.5161e-01,\n",
" 1.5143e+00, -7.2332e-02, 2.3909e-02, -2.1261e+00, 8.5199e-01,\n",
" 1.9084e+00, 4.6845e-02, -2.3554e+00, 1.3735e+00, -7.3909e-01,\n",
" -8.3949e-01, -3.9314e-01, -4.3324e-01, -9.6804e-01, -5.3124e-01,\n",
" -6.5091e-01, -1.1738e+00, 1.3315e+00, 6.5606e-01, -1.4131e-01,\n",
" -1.7712e+00, -1.1628e+00, 9.6813e-01, 8.7314e-01, -9.8027e-01,\n",
" 6.9376e-01, 5.3878e-01, -1.6169e+00, 2.2860e-01, -6.2179e-01,\n",
" -1.1043e-01, -3.9658e-01, 2.8712e-01, 8.2201e-02, 2.0888e-01,\n",
" -5.9884e-01, 7.3092e-01, 6.9128e-01, 5.3977e-01, -1.5728e+00,\n",
" 1.6878e+00, -8.2669e-01, -9.8076e-01, -3.4203e-01, 4.6939e-02,\n",
" -1.3158e-01, 2.1923e+00, -6.6483e-02, -4.0687e-01, -1.2715e+00,\n",
" -8.1549e-01, -1.2047e+00, 1.3547e+00, -4.2072e-01, 1.1674e+00,\n",
" -5.1421e-01, 1.3055e+00, -1.1277e+00, 1.8372e+00, -1.1215e+00,\n",
" 1.4797e+00, 2.8354e-01, -6.3974e-01, -1.2869e+00, -2.7897e-01,\n",
" -1.0397e+00, 1.8622e-01, -5.0397e-02, -4.4865e-02, -7.6067e-01,\n",
" 1.7715e+00, 1.5040e+00, -2.6854e-01, -5.2802e-01, -5.3407e-01,\n",
" 2.0313e-02, -2.6276e-01, -7.0748e-01, -8.7328e-01, -3.4108e-01,\n",
" 1.4313e+00]],\n",
"\n",
" [[ 7.7464e-01, -4.2187e-01, -2.0571e+00, -8.6709e-01, -1.5722e+00,\n",
" 4.9540e-01, -1.5270e+00, 1.0499e+00, -1.9579e+00, -2.5298e-02,\n",
" 4.3419e-01, 5.8822e-01, 1.3392e+00, 6.9604e-01, -9.7883e-01,\n",
" -9.1354e-01, -9.1852e-01, -6.0951e-01, -6.6255e-02, 1.3907e+00,\n",
" -6.2912e-01, -2.7524e-01, 1.9520e-02, -2.7154e-01, 1.5162e-01,\n",
" 1.3318e-02, -8.9196e-01, 9.0976e-01, -1.3544e+00, 2.4276e-01,\n",
" -7.4038e-01, 9.7062e-01, 3.2011e-01, 3.4486e-01, -2.3374e+00,\n",
" 1.3311e+00, -3.1871e-02, -1.4468e+00, -1.5968e+00, 3.0418e-01,\n",
" -7.7136e-01, 1.3427e+00, -1.2493e+00, 1.4114e+00, -1.2475e+00,\n",
" 7.0239e-01, -9.6120e-02, -4.4365e-01, 5.3238e-01, -1.4933e+00,\n",
" 5.4476e-01, -1.8490e-02, -5.9936e-01, 1.0878e+00, -1.8892e+00,\n",
" 1.2810e+00, -1.0747e+00, 5.3514e-01, 1.7422e-01, 1.1354e+00,\n",
" -7.4837e-01, 4.0327e-01, -1.8950e+00, -7.2336e-01, 2.4441e-01,\n",
" -1.3650e-01, -4.8344e-01, 3.3921e-02, 5.0889e-01, -1.3769e+00,\n",
" -2.5907e-01, -2.7549e-01, -1.9128e-01, 1.9751e+00, -7.1191e-01,\n",
" 5.1910e-01, 1.0902e-01, 2.9995e-01, -3.5180e-01, -6.2139e-01,\n",
" 7.2905e-01, -5.3177e-01, 4.3340e-01, 1.0071e+00, 1.7586e+00,\n",
" -3.9963e-01, -2.5139e-01, -9.4213e-01, 9.2847e-01, 1.1298e+00,\n",
" 7.8545e-01, 1.3188e+00, 3.7466e-01, 9.0773e-01, -4.0454e-02,\n",
" 1.3444e+00, 6.0301e-01, 8.9929e-02, -2.0754e+00, 4.8614e-01,\n",
" -9.7160e-01, 8.2446e-01, -1.1813e+00, -9.6185e-01, -9.2922e-02,\n",
" 6.0154e-01, 1.6640e+00, -1.0461e+00, 1.5868e-01, -5.7239e-01,\n",
" -6.2726e-01, 3.2848e-01, 5.9609e-01, 1.5563e+00, -4.0883e-01,\n",
" 4.4902e-01, 1.4004e+00, 2.2426e-01, 3.8314e-01, -2.0641e-01,\n",
" -1.6465e-01, -6.4645e-01, 1.5772e-01, 6.8907e-01, -1.2703e+00,\n",
" 1.8914e-01, -6.2678e-01, 3.0179e-01, 1.2687e+00, 1.6849e+00,\n",
" 1.5690e+00, 1.0999e+00, 1.5820e+00, -6.4808e-01, 5.1003e-01,\n",
" -1.6674e+00, -1.2224e+00, 1.9769e-01, -1.3883e-01, 1.2179e+00,\n",
" 1.2971e+00, 4.6259e-01, -5.8717e-01, 1.4532e+00, -1.0540e+00,\n",
" 2.8689e-01, -1.3895e+00, 1.4014e+00, -4.0430e-01, -2.6099e+00,\n",
" -1.0293e+00, -1.1097e+00, 8.6266e-01, -1.0535e+00, 7.1789e-01,\n",
" 6.0642e-01, -1.2493e+00, -3.7762e-01, -4.1281e-02, -7.3049e-01,\n",
" -7.2913e-04, -7.3122e-02, -2.3850e-01, 1.2546e+00, 1.8802e-01,\n",
" 1.3135e+00, -5.0367e-01, 1.2456e-01, 2.7475e+00, -1.2486e+00,\n",
" 1.4441e+00, 8.7469e-01, -5.6901e-01, -1.2145e-01, 3.1091e-01,\n",
" 1.9406e+00, -8.1891e-01, 3.1316e-02, -1.2867e+00, 8.0780e-01,\n",
" 7.0041e-01, 2.8903e-01, -1.6387e+00, 6.6553e-01, -1.3696e+00,\n",
" -7.9454e-01, 3.3899e-01, -5.5822e-01, -8.1969e-01, -1.2410e-01,\n",
" -3.7024e-01, -7.2536e-01, 7.5648e-01, 1.6899e+00, -1.7404e-01,\n",
" -1.7191e+00, -7.2603e-01, 1.5046e+00, 8.3216e-01, -1.5304e+00,\n",
" -1.8264e-01, 3.3451e-01, -5.6636e-02, 6.1099e-01, -9.8517e-01,\n",
" 4.4856e-01, -8.6275e-01, 6.9264e-02, -1.1572e+00, 2.3373e-01,\n",
" 5.9896e-01, 1.2384e-01, 1.0309e+00, 1.4273e+00, -8.4776e-01,\n",
" 2.6236e+00, -9.0133e-01, -4.0009e-01, -4.9727e-01, 3.7945e-01,\n",
" -9.0712e-01, 1.5725e+00, 1.6298e-01, 1.1544e-01, -4.3125e-01,\n",
" -8.7131e-01, -2.5880e-01, 2.9032e+00, 2.7154e-01, 1.3677e+00,\n",
" -8.8544e-01, 5.6083e-01, -1.8256e+00, 9.4832e-01, -1.0762e+00,\n",
" 7.5421e-01, 6.5008e-01, -8.6361e-01, -1.4911e+00, -7.5930e-02,\n",
" -1.6896e+00, 1.5223e-02, -1.5283e-01, -1.8741e+00, 1.1400e-01,\n",
" 1.8822e+00, 2.6615e+00, 2.1607e-01, -5.6243e-01, 3.6730e-01,\n",
" 4.0374e-01, -1.1973e+00, -5.3006e-01, -3.4750e-01, -4.4187e-01,\n",
" 7.4358e-01]]], grad_fn=<NativeLayerNormBackward0>)\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"TEXT = (\n",
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
")\n",
"OUT_TEXT = \"<START>\"\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
"\n",
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"BATCH_LEN = 3\n",
"\n",
"INPUT_TOKENIZATION = [\n",
" EN_IN\n",
"] * BATCH_LEN\n",
"OUTPUT_TOKENIZATION = [\n",
" DEC_IN\n",
"] * BATCH_LEN\n",
"\n",
"\n",
"print(INPUT_TOKENIZATION)\n",
"\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"encoder_tensor: torch.Tensor = EMBEDDER(INPUT_TOKENIZATION)\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"decoder_tensor: torch.Tensor = EMBEDDER(OUTPUT_TOKENIZATION)\n",
"DECODER = torch.nn.Sequential(\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"print(len(INPUT_TOKENIZATION))\n",
"print(f\"Embedder Tensor: {encoder_tensor.shape}\")\n",
"print(f\"Values:\\n{encoder_tensor}\")\n",
"\n",
"BATCH_SIZE, TOKENS, DIMENSIONS = encoder_tensor.shape\n",
"PAD_MASK = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
"\n",
"encoder_out, _ = ENCODER((encoder_tensor, PAD_MASK))\n",
"tensor: torch.Tensor\n",
"tensor, _, _, _ = DECODER((decoder_tensor, encoder_out, encoder_out, None))\n",
"\n",
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
"print(f\"Values:\\n{tensor}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

131
Playgrounds/encoder.ipynb Normal file
View File

@ -0,0 +1,131 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "c64b0e24",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700], [7706, 290, 756, 4270, 7357, 115, 351, 1507, 1213, 410, 3382, 317, 497, 4740, 2784, 7700]]\n",
"2\n",
"Embedder Tensor: torch.Size([2, 16, 256])\n",
"Values:\n",
"tensor([[[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]],\n",
"\n",
" [[-0.6981, 0.0804, -2.1672, ..., 0.3919, 0.3341, 1.0794],\n",
" [ 2.5818, -0.2308, 0.6001, ..., -0.0500, -0.0408, -0.9852],\n",
" [-0.6967, 0.8109, 1.3108, ..., 2.1693, 1.4143, -0.1236],\n",
" ...,\n",
" [ 2.1226, 2.5695, -1.6178, ..., -0.0652, -0.0802, 0.1103],\n",
" [ 0.8770, -2.4782, 0.8536, ..., 2.0471, -1.5702, 0.7387],\n",
" [-0.0495, -1.8601, 0.0405, ..., 2.3944, -0.4297, 1.1141]]],\n",
" grad_fn=<AddBackward0>)\n",
"ENCODER Tensor: torch.Size([2, 16, 256])\n",
"Values:\n",
"tensor([[[-1.6325, 0.4094, -2.1403, ..., 0.4654, 0.5993, 0.9683],\n",
" [ 1.8236, 0.4025, -0.6972, ..., 0.2430, 0.2536, -1.0889],\n",
" [-0.0587, 0.1618, -0.2335, ..., 1.7609, 1.2664, -0.4452],\n",
" ...,\n",
" [ 2.0337, 1.3184, -1.3165, ..., -0.3303, 0.6572, 0.0884],\n",
" [ 0.5752, -2.5594, -0.2393, ..., 1.3318, -1.4236, 0.4686],\n",
" [ 1.0075, -2.4273, -0.4593, ..., 1.6660, 0.0359, 0.2927]],\n",
"\n",
" [[-1.8300, -0.3079, -1.6585, ..., 0.4859, 0.5652, 0.8072],\n",
" [ 1.5461, -0.5666, -0.0330, ..., 0.5651, 0.2974, -1.0879],\n",
" [-0.9060, 0.2700, -0.4585, ..., 2.0363, 1.2657, -0.7060],\n",
" ...,\n",
" [ 1.6688, 1.7038, -1.9549, ..., -0.2052, 0.6270, 0.4598],\n",
" [ 0.0482, -2.3951, -0.4351, ..., 1.6230, -1.3662, -0.0390],\n",
" [ 0.8146, -2.6169, -0.6188, ..., 1.4525, 0.0507, 0.5177]]],\n",
" grad_fn=<NativeLayerNormBackward0>)\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"TEXT = \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
"\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(\n",
" VOCABULARY,\n",
" SPECIAL_VOC\n",
")\n",
"\n",
"TOKENIZATION = [TOKENANO.encode(TEXT), TOKENANO.encode(TEXT)]\n",
"print(TOKENIZATION)\n",
"\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"tensor: torch.Tensor = EMBEDDER(TOKENIZATION)\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"print(len(TOKENIZATION))\n",
"print(f\"Embedder Tensor: {tensor.shape}\")\n",
"print(f\"Values:\\n{tensor}\")\n",
"\n",
"BATCH_SIZE, TOKENS, DIMENSIONS = tensor.shape\n",
"PAD_MASK = torch.tensor([[True] * TOKENS] * BATCH_SIZE, dtype=torch.bool)\n",
"tensor, _ = ENCODER((tensor, PAD_MASK))\n",
"print(f\"ENCODER Tensor: {tensor.shape}\")\n",
"print(f\"Values:\\n{tensor}\")\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,221 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "c8741a8f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EPOCH 1\n",
"\tLoss: 7.424792\n",
"[0] \n",
"[1] \n",
"[2] \n",
"[3] \n",
"[4] \n",
"[5] \n",
"[6] \n",
"[7] \n",
"[8] \n",
"[9] \n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"from Project_Model.Libs.Training.learning_rade_shedulers import Custom_lr\n",
"\n",
"import torch\n",
"\n",
"class LogitsCollector:\n",
" def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:\n",
" self.__pad_token = pad_token # used to skip PAD\n",
" self.__end_token = end_token # used to stop at END\n",
" self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str\n",
" self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]\n",
"\n",
" def reset(self) -> None:\n",
" self.__steps.clear() # clear history\n",
"\n",
" def add(self, logits_step: torch.Tensor) -> None:\n",
" if logits_step.dim() == 3: # handle [B,1,V]\n",
" logits_step = logits_step[:, -1, :] # -> [B,V]\n",
" self.__steps.append(logits_step.detach()) # store raw logits (detached)\n",
"\n",
" def tokens(self) -> list[list[int]]:\n",
" if not self.__steps:\n",
" return []\n",
" stack = torch.stack(self.__steps, dim=0) # [T,B,V]\n",
" probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]\n",
" ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]\n",
" out: list[list[int]] = []\n",
" for row in ids.tolist():\n",
" seq: list[int] = []\n",
" for tok in row:\n",
" if tok == self.__end_token: # stop on END\n",
" break\n",
" if tok == self.__pad_token: # skip PAD\n",
" continue\n",
" seq.append(tok)\n",
" out.append(seq)\n",
" return out\n",
"\n",
" def print_decoded(self) -> None:\n",
" for i, seq in enumerate(self.tokens()):\n",
" try:\n",
" text = self.__tokenizer.decode(seq) # decode tokens to string\n",
" except Exception:\n",
" text = str(seq) # fallback to ids\n",
" print(f\"[{i}] {text}\") # simple print\n",
"\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]] = []\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs) # encoder input ids\n",
" output_tokens = TOKENANO.encode(Abstract)[1:] # decoder target ids (shifted left)\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\") # decoder input starts with <SOS>\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" ) # pad/trim + end token\n",
" decoder_default_tokens = Transformer.pad_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN\n",
" ) # pad with PAD up to SENTENCE_LENGTH\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS,\n",
")\n",
"\n",
"collector = LogitsCollector(PAD_TOKEN, END_TOKEN, TOKENANO) # collects logits and decodes\n",
"\n",
"NANOSOCRATES.train()\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = Custom_lr(EMBEDDED_SIZE, 4000) # step each optimizer step\n",
"\n",
"current_epoch = 0\n",
"BATCH_SIZE = min(32, len(TOY_BATCH_INPUT_LIST)) # small batch to stabilize\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
" # simple fixed mini-batch from the top; later you can shuffle/slice\n",
" enc = torch.tensor(TOY_BATCH_INPUT_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] encoder token ids\n",
" pad = torch.tensor(TOY_BATCH_PADDING_LIST[:BATCH_SIZE], dtype=torch.bool) # [B,T] True where encoder PAD is present\n",
" tgt = torch.tensor(TOY_BATCH_TARGET_LIST[:BATCH_SIZE], dtype=torch.long) # [B,T] decoder targets (ground-truth)\n",
"\n",
" # decoder prefix buffer: <SOS> at pos 0, PAD elsewhere (no shift here) # we will fill it step by step\n",
" dec = torch.tensor(TOY_BATCH_DECODER_DEFAULT[:BATCH_SIZE], dtype=torch.long) # [B,T]\n",
"\n",
" total_loss = 0.0\n",
" collector.reset() # start fresh for this epoch\n",
"\n",
" T = tgt.size(1) # sequence length\n",
" for t in range(T):\n",
" optimizer.zero_grad(set_to_none=True) # clear grads for this token step\n",
"\n",
" prefix = dec[:, : t + 1] # [B, t+1] current decoder prefix\n",
" dec_pad_mask = prefix.eq(PAD_TOKEN) # [B, t+1] True where PAD inside prefix\n",
"\n",
" # one-step logits given prefix (trainer model expects 4 args now)\n",
" logits_t: torch.Tensor = NANOSOCRATES((enc, pad, prefix, dec_pad_mask)) # [B,V] logits for step t\n",
" collector.add(logits_t) # store logits for decoding later\n",
"\n",
" loss_t = cross_entropy(logits_t, tgt[:, t]) # CE expects raw logits; PAD ignored\n",
" loss_t.backward() # backprop for this step\n",
" optimizer.step() # update params\n",
" scheduler.step() # Noam/warmup: step per optimizer step\n",
"\n",
" total_loss = float(loss_t.detach()) # keep last step loss for logging\n",
"\n",
" # teacher forcing: reveal the correct token for next position\n",
" if t < T - 1:\n",
" dec[:, t + 1] = tgt[:, t] # write ground-truth into next slot\n",
"\n",
" current_epoch += 1\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {total_loss:.6f}\") # simple log\n",
" collector.print_decoded() # print decoded predictions for the batch\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,205 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0afbf498",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"EPOCH 1\n",
"\tLoss: 9.174470901489258\n",
"EPOCH 2\n",
"\tLoss: 9.20919132232666\n",
"EPOCH 3\n",
"\tLoss: 9.227106094360352\n",
"EPOCH 4\n",
"\tLoss: 9.172086715698242\n",
"EPOCH 5\n",
"\tLoss: 9.180150985717773\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[31m---------------------------------------------------------------------------\u001b[39m",
"\u001b[31mKeyboardInterrupt\u001b[39m Traceback (most recent call last)",
"\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[1]\u001b[39m\u001b[32m, line 116\u001b[39m\n\u001b[32m 113\u001b[39m step_target = target_logits[:, i] \u001b[38;5;66;03m# [B]\u001b[39;00m\n\u001b[32m 115\u001b[39m loss = cross_entropy(step_logits,step_target) \u001b[38;5;66;03m# now loss is without softmax\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m116\u001b[39m \u001b[43mloss\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\u001b[39;00m\n\u001b[32m 117\u001b[39m last_loss = loss\n\u001b[32m 118\u001b[39m optimizer.step()\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:638\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 595\u001b[39m \u001b[38;5;250m\u001b[39m\u001b[33mr\u001b[39m\u001b[33;03m\"\"\"Computes the gradient of current tensor wrt graph leaves.\u001b[39;00m\n\u001b[32m 596\u001b[39m \n\u001b[32m 597\u001b[39m \u001b[33;03mThe graph is differentiated using the chain rule. If the tensor is\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 635\u001b[39m \u001b[33;03m used to compute the :attr:`tensors`. Defaults to ``None``.\u001b[39;00m\n\u001b[32m 636\u001b[39m \u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m--> \u001b[39m\u001b[32m638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mhandle_torch_function\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 639\u001b[39m \u001b[43m \u001b[49m\u001b[43mTensor\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 640\u001b[39m \u001b[43m \u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 641\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[32m 642\u001b[39m \u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m=\u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 643\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 644\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m=\u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 645\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 646\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 647\u001b[39m torch.autograd.backward(\n\u001b[32m 648\u001b[39m \u001b[38;5;28mself\u001b[39m, gradient, retain_graph, create_graph, inputs=inputs\n\u001b[32m 649\u001b[39m )\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/overrides.py:1725\u001b[39m, in \u001b[36mhandle_torch_function\u001b[39m\u001b[34m(public_api, relevant_args, *args, **kwargs)\u001b[39m\n\u001b[32m 1721\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m _is_torch_function_mode_enabled():\n\u001b[32m 1722\u001b[39m \u001b[38;5;66;03m# if we're here, the mode must be set to a TorchFunctionStackMode\u001b[39;00m\n\u001b[32m 1723\u001b[39m \u001b[38;5;66;03m# this unsets it and calls directly into TorchFunctionStackMode's torch function\u001b[39;00m\n\u001b[32m 1724\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m _pop_mode_temporarily() \u001b[38;5;28;01mas\u001b[39;00m mode:\n\u001b[32m-> \u001b[39m\u001b[32m1725\u001b[39m result = \u001b[43mmode\u001b[49m\u001b[43m.\u001b[49m\u001b[43m__torch_function__\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpublic_api\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtypes\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1726\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mNotImplemented\u001b[39m:\n\u001b[32m 1727\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/utils/_device.py:103\u001b[39m, in \u001b[36mDeviceContext.__torch_function__\u001b[39m\u001b[34m(self, func, types, args, kwargs)\u001b[39m\n\u001b[32m 101\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m _device_constructors() \u001b[38;5;129;01mand\u001b[39;00m kwargs.get(\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m) \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[32m 102\u001b[39m kwargs[\u001b[33m\"\u001b[39m\u001b[33mdevice\u001b[39m\u001b[33m\"\u001b[39m] = \u001b[38;5;28mself\u001b[39m.device\n\u001b[32m--> \u001b[39m\u001b[32m103\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/_tensor.py:647\u001b[39m, in \u001b[36mTensor.backward\u001b[39m\u001b[34m(self, gradient, retain_graph, create_graph, inputs)\u001b[39m\n\u001b[32m 637\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m has_torch_function_unary(\u001b[38;5;28mself\u001b[39m):\n\u001b[32m 638\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m handle_torch_function(\n\u001b[32m 639\u001b[39m Tensor.backward,\n\u001b[32m 640\u001b[39m (\u001b[38;5;28mself\u001b[39m,),\n\u001b[32m (...)\u001b[39m\u001b[32m 645\u001b[39m inputs=inputs,\n\u001b[32m 646\u001b[39m )\n\u001b[32m--> \u001b[39m\u001b[32m647\u001b[39m \u001b[43mtorch\u001b[49m\u001b[43m.\u001b[49m\u001b[43mautograd\u001b[49m\u001b[43m.\u001b[49m\u001b[43mbackward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 648\u001b[39m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mgradient\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m=\u001b[49m\u001b[43minputs\u001b[49m\n\u001b[32m 649\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/__init__.py:354\u001b[39m, in \u001b[36mbackward\u001b[39m\u001b[34m(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)\u001b[39m\n\u001b[32m 349\u001b[39m retain_graph = create_graph\n\u001b[32m 351\u001b[39m \u001b[38;5;66;03m# The reason we repeat the same comment below is that\u001b[39;00m\n\u001b[32m 352\u001b[39m \u001b[38;5;66;03m# some Python versions print out the first line of a multi-line function\u001b[39;00m\n\u001b[32m 353\u001b[39m \u001b[38;5;66;03m# calls in the traceback and some print out the last line\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m354\u001b[39m \u001b[43m_engine_run_backward\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 355\u001b[39m \u001b[43m \u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 356\u001b[39m \u001b[43m \u001b[49m\u001b[43mgrad_tensors_\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 357\u001b[39m \u001b[43m \u001b[49m\u001b[43mretain_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 358\u001b[39m \u001b[43m \u001b[49m\u001b[43mcreate_graph\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 359\u001b[39m \u001b[43m \u001b[49m\u001b[43minputs_tuple\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 360\u001b[39m \u001b[43m \u001b[49m\u001b[43mallow_unreachable\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 361\u001b[39m \u001b[43m \u001b[49m\u001b[43maccumulate_grad\u001b[49m\u001b[43m=\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 362\u001b[39m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n",
"\u001b[36mFile \u001b[39m\u001b[32m~/miniconda3/envs/deep_learning/lib/python3.13/site-packages/torch/autograd/graph.py:829\u001b[39m, in \u001b[36m_engine_run_backward\u001b[39m\u001b[34m(t_outputs, *args, **kwargs)\u001b[39m\n\u001b[32m 827\u001b[39m unregister_hooks = _register_logging_hooks_on_whole_graph(t_outputs)\n\u001b[32m 828\u001b[39m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[32m--> \u001b[39m\u001b[32m829\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mVariable\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_execution_engine\u001b[49m\u001b[43m.\u001b[49m\u001b[43mrun_backward\u001b[49m\u001b[43m(\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;66;43;03m# Calls into the C++ engine to run the backward pass\u001b[39;49;00m\n\u001b[32m 830\u001b[39m \u001b[43m \u001b[49m\u001b[43mt_outputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43m*\u001b[49m\u001b[43m*\u001b[49m\u001b[43mkwargs\u001b[49m\n\u001b[32m 831\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# Calls into the C++ engine to run the backward pass\u001b[39;00m\n\u001b[32m 832\u001b[39m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[32m 833\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m attach_logging_hooks:\n",
"\u001b[31mKeyboardInterrupt\u001b[39m: "
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# set a default device\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
"\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
"\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs)\n",
" output_tokens = TOKENANO.encode(Abstract)[1:]\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS\n",
")\n",
"\n",
"NANOSOCRATES.train() # nothing important, activates dropout etc \n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
"\n",
"last_loss = 0\n",
"\n",
"current_epoch = 0\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]]) # Transform target into logits\n",
"\n",
" optimizer.zero_grad() # to clear gradient\n",
"\n",
" last_loss = 0.0\n",
"\n",
" for i in range(0, SENTENCE_LENGTH):\n",
"\n",
" # optimizer.zero_grad()\n",
" # forward \n",
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
" # probabilities = torch.softmax(logits,2)\n",
" \n",
"\n",
" step_logits = logits[:, i, :] # [B, V]\n",
" step_target = target_logits[:, i] # [B]\n",
"\n",
" loss = cross_entropy(step_logits,step_target) # now loss is without softmax\n",
" loss.backward() # DAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMN\n",
" last_loss = loss\n",
" optimizer.step()\n",
" optimizer.zero_grad()\n",
" scheduler.step()\n",
" \n",
" probabilities = torch.softmax(logits,2)\n",
" most_probable_tokens = torch.argmax(probabilities, 2) \n",
" if i < SENTENCE_LENGTH - 1:\n",
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
"\n",
"\n",
" current_epoch += 1\n",
"\n",
" if current_epoch % 1 == 0:\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,157 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "f5762da9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([3, 17, 7714])\n",
"torch.Size([3, 17])\n",
"tensor([[2034, 6523, 5406, 3985, 5406, 6523, 2034, 2034, 5745, 643, 5406, 7405,\n",
" 6523, 6230, 6419, 5745, 657],\n",
" [2458, 830, 5745, 5745, 5406, 3741, 2034, 5745, 6302, 6419, 5406, 2411,\n",
" 719, 830, 5745, 3189, 2775],\n",
" [2034, 5745, 5327, 4696, 6523, 643, 6419, 1671, 6302, 4406, 5745, 643,\n",
" 643, 1901, 1914, 1914, 719]])\n"
]
}
],
"source": [
"import random\n",
"import torch\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_DIM = EMBEDDED_SIZE * 4\n",
"\n",
"\n",
"# Model Init\n",
"ENCODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"DECODER_EMBEDDER = Embedder.NanoSocratesEmbedder(TOKEN_SPACE_SIZE, EMBEDDED_SIZE)\n",
"\n",
"ENCODER = torch.nn.Sequential(\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Encoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"DECODER = torch.nn.Sequential(\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
" Transformer.Decoder(EMBEDDED_SIZE, FEED_FORWARD_DIM, 4),\n",
")\n",
"\n",
"DETOKENER = Transformer.DeToken(\n",
" EMBEDDED_SIZE,\n",
" TOKEN_SPACE_SIZE\n",
")\n",
"\n",
"\n",
"# Data\n",
"TEXT = (\n",
" \"<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>\"\n",
")\n",
"OUT_TEXT = \"<START>\"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"ENCODER_INPUT = TOKENANO.encode(TEXT)\n",
"DECODER_INPUT = TOKENANO.encode(OUT_TEXT)\n",
"MAX_LEN = len(ENCODER_INPUT) + 1\n",
"\n",
"EN_IN, PAD_MASK = Transformer.normalize_sequence(ENCODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"DEC_IN, _ = Transformer.normalize_sequence(DECODER_INPUT, MAX_LEN, PAD_TOKEN, END_TOKEN)\n",
"\n",
"BATCH_LEN = 3\n",
"\n",
"INPUT_TOKENIZATION = [\n",
" EN_IN\n",
"] * BATCH_LEN\n",
"OUTPUT_TOKENIZATION = [\n",
" DEC_IN\n",
"] * BATCH_LEN\n",
"\n",
"encoder_tensor_input = ENCODER_EMBEDDER(INPUT_TOKENIZATION)\n",
"encoder_padding_mask = torch.tensor([PAD_MASK] * BATCH_LEN)\n",
"\n",
"encoder_output, _ = ENCODER((encoder_tensor_input, encoder_padding_mask))\n",
"\n",
"decoder_tensor_input = DECODER_EMBEDDER(OUTPUT_TOKENIZATION)\n",
"decoder_padding_mask = torch.tensor([[False] * MAX_LEN] * BATCH_LEN)\n",
"\n",
"decoder_output, _, _, _ = DECODER((decoder_tensor_input, encoder_output, encoder_output, None))\n",
"\n",
"logits: torch.Tensor = DETOKENER(decoder_output)\n",
"\n",
"print(logits.shape)\n",
"\n",
"# print(logits)\n",
"\n",
"most_probable_tokens = torch.argmax(logits, 2)\n",
"\n",
"print(most_probable_tokens.shape)\n",
"print(most_probable_tokens)\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,197 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "adbd9598",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\torch\\utils\\_device.py:103: UserWarning: Aten Op fallback from XPU to CPU happends. This may have performance implications. If need debug the fallback ops please set environment variable `PYTORCH_DEBUG_XPU_FALLBACK=1` (Triggered internally at C:\\actions-runner\\_work\\pytorch\\pytorch\\pytorch\\build\\xpu\\ATen\\RegisterXPU_0.cpp:54528.)\n",
" return func(*args, **kwargs)\n",
"252.87s - name 'tensor' is not defined\n",
"Traceback (most recent call last):\n",
" File \"c:\\Users\\Chris\\miniconda3\\envs\\deep_learning\\Lib\\site-packages\\debugpy\\_vendored\\pydevd\\_pydevd_bundle\\pydevd_vars.py\", line 636, in change_attr_expression\n",
" value = eval(expression, frame.f_globals, frame.f_locals)\n",
" File \"<string>\", line 1, in <module>\n",
"NameError: name 'tensor' is not defined\n"
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel."
]
},
{
"ename": "",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[1;31mCannot execute code, session has been disposed. Please try restarting the Kernel. \n",
"\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
]
}
],
"source": [
"import random\n",
"import torch\n",
"import pandas as pd\n",
"from pathlib import Path\n",
"import Project_Model.Libs.Embedder as Embedder\n",
"import Project_Model.Libs.BPE as BPE\n",
"import Project_Model.Libs.Transformer as Transformer\n",
"import Project_Model.Libs.TorchShims as torch_shims\n",
"\n",
"# set a fixed seed\n",
"torch.manual_seed(0)\n",
"random.seed(0)\n",
"DEVICE = torch_shims.get_default_device()\n",
"torch.set_default_device(DEVICE)\n",
"\n",
"# set a default device\n",
"\n",
"# BPE Init\n",
"VOCABULARY_PATH = Path(\"Assets/Model/toy_10/toy_dictionary.json\")\n",
"SPECIAL_VOC = BPE.default_special_tokens()\n",
"\n",
"VOCABULARY = BPE.load_nanos_vocabulary(VOCABULARY_PATH)\n",
"TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_VOC)\n",
"\n",
"\n",
"# Constants\n",
"TOKEN_SPACE_SIZE = TOKENANO.vocabulary_size + 1\n",
"EMBEDDED_SIZE = 256\n",
"FEED_FORWARD_MULTIPLIER = 4\n",
"ATTENTION_HEADS = 4\n",
"SENTENCE_LENGTH = 256\n",
"NUMBER_OF_BLOCKS = 2\n",
"MAX_EPOCHS = int(1e3)\n",
"\n",
"\n",
"PAD_TOKEN = TOKENANO.encode(\"<PAD>\")[0]\n",
"END_TOKEN = TOKENANO.encode(\"<END>\")[0]\n",
"\n",
"\n",
"# Load CSV\n",
"TOY_DATASET_PATH = Path(\"Assets/Dataset/1-hop/toy/rdf_text.csv\")\n",
"\n",
"TOY_DATASET = pd.read_csv(TOY_DATASET_PATH)\n",
"\n",
"TOY_BATCH_INPUT_LIST: list[list[int]] = []\n",
"TOY_BATCH_PADDING_LIST: list[list[bool]] = []\n",
"TOY_BATCH_TARGET_LIST: list[list[int]] = []\n",
"TOY_BATCH_DECODER_DEFAULT: list[list[int]]= []\n",
"\n",
"\n",
"for index, row in TOY_DATASET.iterrows():\n",
"\n",
" RDFs: str = row[\"RDFs\"]\n",
" Abstract: str = row[\"Abstract\"]\n",
"\n",
" input_tokens = TOKENANO.encode(RDFs)\n",
" output_tokens = TOKENANO.encode(Abstract)[1:]\n",
" decoder_default_tokens = TOKENANO.encode(\"<SOS>\")\n",
"\n",
" input_tokens, padding = Transformer.normalize_sequence(\n",
" input_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" output_tokens, _ = Transformer.normalize_sequence(\n",
" output_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
" decoder_default_tokens, _ = Transformer.normalize_sequence(\n",
" decoder_default_tokens, SENTENCE_LENGTH, PAD_TOKEN, END_TOKEN\n",
" )\n",
"\n",
" TOY_BATCH_INPUT_LIST.append(input_tokens)\n",
" TOY_BATCH_PADDING_LIST.append(padding)\n",
" TOY_BATCH_TARGET_LIST.append(output_tokens)\n",
" TOY_BATCH_DECODER_DEFAULT.append(decoder_default_tokens)\n",
"\n",
"# Training loop\n",
"LOSS_HISTORY = []\n",
"NANOSOCRATES = Transformer.TrainingModel(\n",
" TOKEN_SPACE_SIZE,\n",
" EMBEDDED_SIZE,\n",
" FEED_FORWARD_MULTIPLIER,\n",
" ATTENTION_HEADS,\n",
" NUMBER_OF_BLOCKS\n",
")\n",
"cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=PAD_TOKEN)\n",
"optimizer = torch.optim.AdamW(NANOSOCRATES.parameters())\n",
"scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 4)\n",
"last_loss = 0\n",
"current_epoch = 0\n",
"\n",
"while current_epoch < MAX_EPOCHS:\n",
"\n",
" optimizer.zero_grad()\n",
"\n",
" encoder_list = torch.tensor([TOY_BATCH_INPUT_LIST[0]])\n",
" decoder_list = torch.tensor([TOY_BATCH_DECODER_DEFAULT[0]])\n",
" padding_list = torch.tensor([TOY_BATCH_PADDING_LIST[0]], dtype=torch.bool)\n",
"\n",
" # Transform target into logits\n",
" target_logits = torch.tensor([TOY_BATCH_TARGET_LIST[0]])\n",
"\n",
" last_loss = 0\n",
"\n",
" for i in range(0, SENTENCE_LENGTH):\n",
"\n",
" optimizer.zero_grad()\n",
"\n",
" logits: torch.Tensor = NANOSOCRATES((encoder_list, padding_list, decoder_list))\n",
"\n",
" most_probable_tokens = torch.argmax(logits, 2)\n",
"\n",
" logits = logits[:,i,:]\n",
"\n",
" loss = cross_entropy(logits, target_logits[:,i])\n",
" last_loss = loss\n",
" optimizer.step()\n",
" scheduler.step()\n",
"\n",
" if i < SENTENCE_LENGTH - 1:\n",
" decoder_list[:,i+1] = most_probable_tokens[:,i]\n",
"\n",
"\n",
" current_epoch += 1\n",
"\n",
" if current_epoch % 1 == 0:\n",
" print(f\"EPOCH {current_epoch}\\n\\tLoss: {last_loss}\")\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

112
Playgrounds/prova.ipynb Normal file
View File

@ -0,0 +1,112 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "4ae47336",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"B, T, D = 4, 7, 32\n",
"x = torch.randn(B, T, D)\n",
"attn_mask = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1) # [T,T]\n",
"pad_mask = torch.zeros(B, T, dtype=torch.bool) # no pads\n",
"mha = torch.nn.MultiheadAttention(D, num_heads=4, batch_first=True)\n",
"y, _ = mha(x, x, x, attn_mask=attn_mask, key_padding_mask=pad_mask) # should work\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e38e3fb5",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"tensor([[[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]],\n",
"\n",
" [[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 1, 0, 0, 0, 0]]])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.nn.functional.one_hot(torch.tensor([\n",
" [4, 1, 9],\n",
" [2,4,5]\n",
"]))"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "7119ad53",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"device(type='cpu')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"torch.get_default_device()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8c95691a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"xpu\n"
]
}
],
"source": [
"from Project_Model.Libs.TorchShims import get_default_device\n",
"\n",
"print(get_default_device())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,60 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dd23cc94",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Current detected architecture is: xpu\n"
]
}
],
"source": [
"import torch\n",
"from Project_Model.Libs.TorchShims import get_default_device\n",
"\n",
"DEVICE = get_default_device()\n",
"\n",
"print(f\"Current detected architecture is: {DEVICE.type}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6584882e",
"metadata": {},
"outputs": [],
"source": [
"import Project_Model.Libs.Transformer as Transformer\n",
"DECODER = Transformer.Decoder(256, 1024, 4)\n",
"print()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

View File

@ -0,0 +1,4 @@
from abc import ABC
class Encoder(ABC):
pass

View File

@ -0,0 +1,164 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTraineRam:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
def trainBPE(
self,
path: Path,
bpe: NanoSocratesBPE | None = None,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
current_iteration = 0
data = self.__gather_data_from_file(path)
while not exit:
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
last_memory = None
_, data, last_memory = self.__round_train(BPE, data)
NEW_VOC_SIZE = BPE.vocabulary_size
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size}\n",
f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None)
f"\tvocabulary:\n{BPE.vocabulary}",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
DATA_LEN = len(data)
NEW_DATA = []
counter = 0
memory = NanoSocratesBatchMemoryBPE({}, 0)
while len(data) > 0:
counter += 1
last_batch = len(data) == 1
piece = data.pop()
bpe, memory, output = bpe.fit(piece, memory, last_batch)
if counter % int(1E6) == 0:
print(f"Fitted: {counter}/{DATA_LEN}")
if len(output) < 2:
continue
NEW_DATA.append(output)
return (bpe, NEW_DATA, memory)
def __gather_data_from_file(self, path: Path) -> list[list[int]]:
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
DATA: list[list[int]] = []
FILE = open(path, "r", encoding="utf-8")
file_string = FILE.read()
FILE.close()
for piece, type in SPLITTER.split_text(file_string):
if type != TokenType.BPE:
continue
int_list = self.__make_list_ids(piece)
DATA.append(int_list)
return DATA
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str):
return list(corpus.encode("utf-8"))

View File

@ -0,0 +1,248 @@
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
class NanoSocraTrainer:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
chunk_size: int,
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__chunk_size = chunk_size
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
def trainBPE(
self,
path: Path,
cache_dir: Path,
bpe: NanoSocratesBPE | None = None,
resume_from_iter: int = 0,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if not cache_dir.is_dir():
raise NotADirectoryError()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size > self.__max_vocabulary:
return BPE
exit = False
cached = False
current_iteration = 0
input_path = path
NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0
PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)
if resume_from_iter != 0:
cached = True
current_iteration = resume_from_iter
input_path = next(PATH_GEN)
# UGLY: fixes a bug immediately, unfortunately
_, _ = next(MEMORY_PATH_GEN)
_, voc_cache_path = next(MEMORY_PATH_GEN)
vocabulary = load_nanos_vocabulary(voc_cache_path)
BPE = NanoSocratesBPE(vocabulary)
while not exit:
out_path = next(PATH_GEN)
internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
FILE = open(out_path, "w")
last_memory = None
for _, memory, output in self.__round_train(input_path, BPE, cached):
last_memory = memory
FILE.write(output)
FILE.close()
internal_cache = {
"finished_iter": current_iteration,
"read_from": f"{input_path}",
"wrote_to": f"{out_path}",
"at": datetime.datetime.now(datetime.timezone.utc).strftime(
"%Y-%m-%d %H:%M:%S.%f"
)[:-3],
}
VOCABULARY = BPE.vocabulary
save_json(internal_cache, internal_cache_path)
save_nanos_vocabulary(VOCABULARY, vocabulary_cache)
cached = True
input_path = out_path
NEW_VOC_SIZE = BPE.vocabulary_size
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size}\n",
f"\tFrequencies:\n{last_memory.frequencies}\n", # type: ignore (pretty sure it's not None)
f"\tvocabulary:\n{BPE.vocabulary}",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):
CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
BPE = bpe
memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)
CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))
for chunk, last_chunk in CHUNKER_GENERATOR:
PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))
for piece, last_piece in PIECE_GENERATOR:
LAST_BATCH = last_chunk and last_piece
PIECE, TOKEN_TYPE = piece
if TOKEN_TYPE != TokenType.BPE:
_, _, out = BPE.fit([], memory, LAST_BATCH)
yield (BPE, memory, PIECE)
continue
PIECE_DATA = self.__make_list_ids(PIECE, cached)
_, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)
OUT_STRING = f"{out}"
yield (BPE, memory, OUT_STRING)
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str, cached: bool):
if not cached:
return list(corpus.encode("utf-8"))
REDUCED_CORPUS_LEN = len(corpus) - 1
# Skip these cars "[" "]"
INTS = corpus[1:REDUCED_CORPUS_LEN]
INT_LIST = list(map(int, INTS.split(",")))
return INT_LIST
def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):
CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"
switch = True
if initial_iteration % 2 == 1:
switch = False
del initial_iteration
while True:
if switch:
yield CORPUS_TMP_1
else:
yield CORPUS_TMP_2
switch = not switch
def __switch_memory(self, cache_path: Path, initial_iteration: int):
INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"
VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
VOCAB_TMP_2 = cache_path / "voc-tmp2.json"
switch = False
if initial_iteration % 2 == 1:
switch = True
del initial_iteration
while True:
if switch:
yield (INTERNAL_TMP_1, VOCAB_TMP_1)
else:
yield (INTERNAL_TMP_2, VOCAB_TMP_2)
switch = not switch

View File

@ -0,0 +1,280 @@
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
NanoSocratesBPE,
NanoSocratesChunker,
NanoSocratesSplitter,
NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
special_regex_maker,
iterator_with_checks,
save_nanos_vocabulary,
load_nanos_vocabulary,
save_json,
load_json,
)
def split(a, n):
k, m = divmod(len(a), n)
return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))
def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):
bpe, data = object
NEW_DATA: list[list[int]] = []
memory = NanoSocratesBatchMemoryBPE({}, 0)
while len(data) > 0:
piece = data.pop()
bpe, memory, output = bpe.fit(piece, memory, False)
if len(output) < 2:
continue
# We are sure of its type
NEW_DATA.append(piece) # type: ignore
return (bpe, NEW_DATA, memory)
def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):
bpe, data = object
NEW_DATA: list[list[int]] = []
for index, piece in zip(range(0, len(data)), data):
output = bpe.encode_intermediate(piece)
if len(output) < 2:
continue
# We are sure of its type
NEW_DATA.append(data[index]) # type: ignore
return NEW_DATA
class NanoSocraTrainerPool:
def __init__(
self,
max_vocabulary: int,
special_vocabulary: list[str],
merge_treshold: int = 0,
max_iterations: int = 0,
print_after_iterations: int = 1,
) -> None:
# Bytes
BYTE_RESERVED_TOKENS = 256
SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS
self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
self.__max_iterations = max_iterations
self.__merge_treshold = merge_treshold
self.__special_token_regex = special_regex_maker(special_vocabulary)
self.__print_after_iterations = print_after_iterations
# TODO: add a resume function
def trainBPE(
self,
path: Path,
cache_file: Path,
bpe: NanoSocratesBPE | None = None,
) -> NanoSocratesBPE:
if not path.is_file():
raise FileNotFoundError()
if not cache_file.is_file():
file = cache_file.open("w")
file.close()
if bpe is None:
bpe = NanoSocratesBPE()
BPE = bpe
if BPE.vocabulary_size >= self.__max_vocabulary:
return BPE
exit = False
current_iteration = 0
data = self.__gather_data_from_file(path)
data = self.__encode_from_cache(BPE, data)
while not exit:
current_iteration = self.__increment_counter(current_iteration)
LAST_VOC_SIZE = BPE.vocabulary_size
last_memory = None
start = time.time_ns()
_, data, last_memory = self.__round_train(BPE, data)
end = time.time_ns()
NEW_VOC_SIZE = BPE.vocabulary_size
VOCABULARY = BPE.vocabulary
save_nanos_vocabulary(VOCABULARY, cache_file)
if current_iteration % self.__print_after_iterations == 0:
DELIMITER = "==============="
DEBUG = "\n".join(
[
DELIMITER,
f"ITERATION: {current_iteration}",
DELIMITER,
f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
f"\tTime elapsed: {(end - start)/1E9}s",
DELIMITER,
"",
]
)
print(DEBUG)
if LAST_VOC_SIZE == NEW_VOC_SIZE:
exit = True
continue
if current_iteration == self.__max_iterations:
exit = True
continue
if BPE.vocabulary_size == self.__max_vocabulary:
exit = True
continue
return BPE
def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):
NEW_DATA: list[list[int]] = []
MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)
fit_funct = split_fit
CPU_COUNT = os.process_cpu_count()
if CPU_COUNT is None:
raise Exception()
VOCABULARY = bpe.vocabulary
data_chunks = split(data, CPU_COUNT)
JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]
JOB_RESULTS: list[
tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
]
with Pool() as pool:
JOB_RESULTS = pool.map(fit_funct, JOBS)
for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
_, job_output, job_memory = res
NEW_DATA.extend(job_output)
for key, value in job_memory.frequencies.items():
frequency = MEMORY.frequencies.get(key)
if frequency is None:
frequency = 0
MEMORY.frequencies[key] = 0
frequency += value
MEMORY.frequencies[key] = frequency
del job_output
del job_memory
print(f"Joined {i + 1} out of {CPU_COUNT}")
# Get new token
bpe.fit([], MEMORY, True)
print(f"Sentences from {len(data)} to {len(NEW_DATA)}")
return (bpe, NEW_DATA, MEMORY)
def __gather_data_from_file(self, path: Path) -> list[list[int]]:
SPLITTER = NanoSocratesSplitter(self.__special_token_regex)
DATA: list[list[int]] = []
FILE = open(path, "r", encoding="utf-8")
file_string = FILE.read()
FILE.close()
for piece, type in SPLITTER.split_text(file_string):
if type != TokenType.BPE:
continue
int_list = self.__make_list_ids(piece)
DATA.append(int_list)
return DATA
def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):
NEW_DATA : list[list[int]]= []
CPU_COUNT = os.process_cpu_count()
if CPU_COUNT is None:
raise Exception()
VOCABULARY = bpe.vocabulary
data_chunks = split(data, CPU_COUNT)
JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]
JOB_RESULTS: list[list[list[int]]]
with Pool() as pool:
JOB_RESULTS = pool.map(split_encode, JOBS)
for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
job_output = res
NEW_DATA.extend(job_output)
del job_output
print(f"Joined {i + 1} out of {CPU_COUNT}")
print(f"Sentences from {len(data)} to {len(NEW_DATA)}")
return NEW_DATA
def __increment_counter(self, counter: int):
# What if overflows???
try:
counter += 1
except:
print("Integer overflow")
counter = 1
return counter
def __make_list_ids(self, corpus: str):
return list(corpus.encode("utf-8"))

View File

@ -0,0 +1,219 @@
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException
# ABOUT THE DICTIONARY:
# the string is converted into utf-char bytes, that is: each char is rappresented with a set of bytes from 1 to 4.
# each bytes get casted into an integer; such that, if an integer has its value lower then 256,
# then it is rappresenting an utf-char-byte, otherwise it is a token-ID.
class NanoSocratesBatchMemoryBPE:
"""Memory to batch training. Keeps token couple frequencies, and merge_treshold"""
def __init__(
self, frequencies: dict[tuple[int, int], int], merge_treshold: int
) -> None:
self.frequencies = frequencies
self.merge_treshold = merge_treshold
class NanoSocratesBPE(Encoder):
def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
super().__init__()
self.__vocabulary: dict[tuple[int, int], int] = {}
self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}
if vocabulary is None:
return
for key, value in vocabulary.items():
if value < 256:
raise OutOfDictionaryException()
# values under 256 are used for unpaired char
# TODO: check if they are in order
self.__vocabulary[key] = value
self.__reverse_vocabulary[value] = key
@property
def vocabulary_size(self):
return len(self.__vocabulary) + 256
@property
def vocabulary(self):
return self.__vocabulary
@property
def __next_id(self) -> int:
"""
Gets the next it
Returns:
int:
"""
return self.vocabulary_size
# TODO: implement fit
def fit(
self,
chunk_data: list[int],
memory: NanoSocratesBatchMemoryBPE,
last_batch: bool,
):
ENCODED_CHUNK = self.encode_intermediate(chunk_data)
DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1
# update frequency of each couple of element
for i in range(0, DATA_LEN_BEFORE_LAST):
CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])
frequency = memory.frequencies.get(CANDIDATE_COUPLE)
# Initialize frequency
if frequency is None:
frequency = 0
memory.frequencies[CANDIDATE_COUPLE] = 0
frequency += 1
memory.frequencies[CANDIDATE_COUPLE] = frequency
if not last_batch:
return (self, memory, ENCODED_CHUNK)
if len(memory.frequencies) < 1:
return (self, memory, ENCODED_CHUNK)
FREQUENCIES = memory.frequencies
MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
FREQUENCY = FREQUENCIES[MAX_COUPLE]
if FREQUENCY < memory.merge_treshold:
return (self, memory, ENCODED_CHUNK)
self.__learn_word(MAX_COUPLE)
return (self, memory, ENCODED_CHUNK)
def encode(self, piece: str) -> list[int]:
"""Encode a String into token IDs, it firt convert it into utf-8, then pass the list of integer to encode_intermediate()
Args:
piece (str):
Returns:
list[int]:
"""
converted_piece = list(piece.encode("utf-8"))
return self.encode_intermediate(converted_piece)
def encode_intermediate(self, piece: list[int]) -> list[int]:
"""Encode a piece (as list of integer) till its maximum
Args:
piece (list[int]): piece to encode
Returns:
list[int]: piece encoded
"""
current_piece = piece
new_piece = self.__round_encode(current_piece)
# until current_piece is bigger then new_piece, keep encoding
while len(current_piece) != len(new_piece):
current_piece = new_piece
new_piece = self.__round_encode(current_piece)
return current_piece
def __round_encode(self, piece: list[int]):
"""A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n
1) "ABAB" -> "XX"
2) "XX" -> "Y"
Args:
piece (list[int]): the object to encode as a list of integer
Returns:
(list[int]): the one time encoded object
"""
if len(piece) == 1:
return piece
PIECE_LENGTH = len(piece) - 1
NEW_PIECE: list[int] = []
index = 0
while index < PIECE_LENGTH:
CANDIDATE_WORD = (
piece[index],
piece[index + 1],
) # take a tuple of consecutive element [int]
CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)
# if no token to substitute the tuple, append the first element
if CANDIDATE_TOKEN is None:
NEW_PIECE.append(piece[index])
index += 1
# if the latter element of the tuple is the last element of the piece, append it
if index == PIECE_LENGTH:
NEW_PIECE.append(piece[index])
continue
# in this case there was a candidate token to substitute the couple of element
NEW_PIECE.append(CANDIDATE_TOKEN)
index += 2
if index == PIECE_LENGTH:
NEW_PIECE.append(piece[index])
return NEW_PIECE
# TODO: Remake decode to take a list of token IDs
def decode(self, token_ids: list[int]) -> str:
# deque: double ended queue
token_stack: deque[int] = deque(token_ids)
UTF_8_STRING_ARR: bytearray = bytearray()
while len(token_stack) > 0:
TOKEN_ID = token_stack.popleft()
if TOKEN_ID < 256:
UTF_8_STRING_ARR.append(TOKEN_ID)
continue
left_token, right_token = self.__token_decode(TOKEN_ID)
token_stack.appendleft(right_token)
token_stack.appendleft(left_token)
return UTF_8_STRING_ARR.decode("utf-8")
def __token_decode(self, token_id: int) -> tuple[int, int]:
CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)
if CANDIDATE_DECODED is None:
raise OutOfDictionaryException()
return CANDIDATE_DECODED
def __learn_word(self, words: tuple[int, int]):
"""learn a new couple of object in the vocabulary
Args:
words (tuple[int, int]): the Pair of element to substitute with a new tokenID
Raises:
DuplicateWordException: it launch if there is a duplicate of the new tokenID in the dictionary
"""
ID = self.__next_id
DUPLICATE = self.__vocabulary.get(words)
if DUPLICATE is not None:
raise DuplicateWordException()
self.__vocabulary[words] = ID
self.__reverse_vocabulary[ID] = words

View File

@ -0,0 +1,70 @@
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException
class NanoSocratesChunker:
def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
self.__max_size: int = max_size
self.__special_token_regex: re.Pattern = special_token_regex
self.__residual: str = ""
# max theorethical size of chars
# between special tokens:
# - min: size - len(longest_token)
# - MAX: size - len(shortest_token)
def chunk(self, file_path: Path):
# read_file
FILE = open(file_path, "r", encoding="utf-8")
exit = False
while not exit:
REMAINING_SIZE = self.__max_size - len(self.__residual)
READ_SIZE = min(self.__max_size, REMAINING_SIZE)
FILE_CHUNK = FILE.read(READ_SIZE)
if len(FILE_CHUNK) == 0:
exit = True
continue
CHUNK = self.__append_residuals(FILE_CHUNK)
boundaries = self.__identify_boudaries(CHUNK)
if boundaries is None:
# boundaries not found in 2 chunks,
if len(CHUNK) > self.__max_size - 1:
raise DelimiterNotFoundException()
if exit:
yield CHUNK
self.__set_residual(0, CHUNK)
continue
start, end = boundaries
self.__set_residual(end, CHUNK)
yield CHUNK[start:end]
def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:
end = 0
for match in self.__special_token_regex.finditer(corpus):
# print(match)
end = match.end()
if end == 0:
return None
return (0, end)
def __append_residuals(self, corpus: str) -> str:
RESIDUAL = self.__residual
self.__residual = ""
return RESIDUAL + corpus
def __set_residual(self, index: int, corpus: str):
self.__residual = corpus[index:]

View File

@ -0,0 +1,68 @@
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException
class NanoSocratesSpecial(Encoder):
def __init__(
self, bpe_vocabulary_size: int, special_tokens: list[str] = []
) -> None:
super().__init__()
self.__bpe_offset = bpe_vocabulary_size
self.__vocabulary: dict[str, int] = {}
self.__reverse_vocabulary: dict[int, str] = {}
if len(special_tokens) == 0:
return
for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):
CANDIDATE_ID = self.__bpe_offset + index + 1
self.__vocabulary[TOKEN] = CANDIDATE_ID
self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN
@property
def __next_id(self):
BPE_OFFSET = self.__bpe_offset
VOC_LENGTH = len(self.__vocabulary)
return BPE_OFFSET + VOC_LENGTH + 1
@property
def vocabulary_size(self) -> int:
return len(self.vocabulary)
@property
def vocabulary(self) -> dict[str, int]:
return self.__vocabulary
@property
def reverse_vocabulary(self) -> dict[int, str]:
return self.__reverse_vocabulary
def add_special_word_to_vocabulary(self, word: str):
CANDIDATE_INDEX = self.__next_id
self.__vocabulary[word] = CANDIDATE_INDEX
self.__reverse_vocabulary[CANDIDATE_INDEX] = word
def encode(self, word: str) -> list[int]:
ID = self.__vocabulary.get(word)
if ID is None:
raise OutOfDictionaryException()
return [ID]
def decode(self, token_id: list[int]) -> str:
if len(token_id) != 1:
raise OutOfDictionaryException()
ID = token_id[0]
WORD = self.__reverse_vocabulary.get(ID)
if WORD is None:
raise OutOfDictionaryException()
return WORD

View File

@ -0,0 +1,98 @@
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType
class NanoSocratesSplitter:
def __init__(
self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
) -> None:
# attention the regex got already compiled
self.__special_token_regex = special_token_regex
self.__max_bpe_token_id: int = max_bpe_token_id # used for decoding
def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
"""Split a text using a regex given
Args:
corpus (str): all the corpus string to split
Yields:
Generator[tuple[str, TokenType]]: each time returns a piece of the splitted text: string and its TokenType. \n
TokenType describe if the string is for the BPE or a special Token [BPE, SPECIAL]
"""
bpe_start = 0
bpe_end = len(corpus) # this can be deleted!
for special_token_start, special_token_end in self.__find_boundaries(corpus):
# FIND BPE
bpe_end = special_token_start
BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
if BPE_TOKEN_TEXT != "":
for WORD in self.__split_words(BPE_TOKEN_TEXT):
yield (WORD, TokenType.BPE)
# FIND SPECIAL TOKEN
SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
if SPECIAL_TOKEN_TEXT != "":
yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)
# now save the new bpe start point
# it will used in the next interaction
bpe_start = special_token_end
def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
"""
Find each time the start and end (not included) of the special token
Args:
corpus (str): the string where the special token will be searched
Yields:
Generator[tuple[int, int]]: Note the end is not included
"""
for match in self.__special_token_regex.finditer(corpus):
start = match.start()
end = match.end()
yield (start, end)
# make the last boundary be the end of corpus
# eof = len(corpus)
# yield(eof,eof)
def __split_words(self, bpe_piece: str) -> Generator[str]:
END_OF_STRING = len(bpe_piece)
bound_start = 0
bound_end = END_OF_STRING + 1
for i in range(0, END_OF_STRING):
CANDIDATE_CHAR = bpe_piece[i]
if CANDIDATE_CHAR != " ":
continue
bound_end = i
yield bpe_piece[bound_start:bound_end]
bound_start = bound_end
bound_end = END_OF_STRING + 1
yield bpe_piece[bound_start:bound_end]
def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
not_special_token_list: list[int] = []
for token in corpus:
if token > self.__max_bpe_token_id:
if len(not_special_token_list) > 0:
yield (not_special_token_list, TokenType.BPE)
not_special_token_list = []
yield ([token], TokenType.SPECIAL)
continue
not_special_token_list.append(token)

View File

@ -0,0 +1,8 @@
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
class TokeNano:
def __init__(self):
pass

View File

@ -0,0 +1,84 @@
from pathlib import Path
from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType
from ..Enums import SpecialToken
class TokeNanoCore:
def __init__(
self,
bpe_vocabulary: dict[tuple[int, int], int],
special_token_list: list[str],
# special_vocabulary: dict[str, int]
):
self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
SPECIAL_REGEX = special_regex_maker(special_token_list)
BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
self.__special_encoder = NanoSocratesSpecial(
BPE_VOCABULARY_SIZE, special_token_list
)
@property
def vocabulary_size(self):
BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
def encode(self, corpus: str) -> list[int]:
output: list[int] = []
for piece, token_type in self.__splitter.split_text(corpus):
if token_type == TokenType.SPECIAL:
output.extend(self.__special_encoder.encode(piece))
# slow but clear
if token_type == TokenType.BPE:
output.extend(self.__bpe_encoder.encode(piece))
return output
def encode_incomplete_string(self, corpus: str) -> list[int]:
"""
Encode string which don't end with a special token
"""
corpus = corpus + SpecialToken.CORPUS_END.value
output: list[int] = []
for piece, token_type in self.__splitter.split_text(corpus):
if token_type == TokenType.SPECIAL:
output.extend(self.__special_encoder.encode(piece))
# slow but clear
if token_type == TokenType.BPE:
output.extend(self.__bpe_encoder.encode(piece))
return output[:-1]
def decode(self, corpus: list[int]) -> str:
output_str = ""
for token, token_type in self.__splitter.split_tokens(corpus):
# token is an integer if special, a list of integer otherwise
if token_type == TokenType.SPECIAL:
output_str += self.__special_encoder.decode(
token
) # it accept an integer
# slow but clear
if token_type == TokenType.BPE:
output_str += self.__bpe_encoder.decode(
token
) # it accept a list of integer
return output_str

View File

@ -0,0 +1,20 @@
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
from .TokeNanoCore import TokeNanoCore
from .TokeNano import TokeNano
__all__ = [
"NanoSocratesChunker",
"NanoSocratesSplitter",
"NanoSocratesBPE",
"NanoSocraTrainer",
"NanoSocraTraineRam",
"NanoSocraTrainerPool",
"TokeNanoCore",
"TokeNano"
]

View File

@ -0,0 +1,27 @@
from enum import Enum
class SpecialToken(Enum):
# (Enum, str) -> throws an error
START_TRIPLE_LIST = "<SOTL>"
START_TRIPLE = "<SOT>"
END_TRIPLE = "<EOT>"
SUBJECT = "<SUBJ>"
RELATIONSHIP = "<PRED>"
OBJECT = "<OBJ>"
ABSTRACT = "<ABS>"
## Tasks' Token
RDF_TO_TEXT = "<RDF2TXT>"
TEXT_TO_RDF = "<TEXT2RDF>"
CONTINUE_RDF = "<CONTINUERDF>"
MASK = "<MASK>"
# BPE Training:
# NanoSocrates
START = "<START>"
CORPUS_END = "<END>"
START_OF_SEQUENCE = "<SOS>"
END_OF_SEQUENCE = "<EOS>"
PAD = "<PAD>"

View File

@ -0,0 +1,6 @@
from enum import Enum, auto
class TokenType(Enum):
SPECIAL = auto()
BPE = auto()

View File

@ -0,0 +1,6 @@
from .TokenType import TokenType
from .SpecialToken import SpecialToken
__all__ = [
"SpecialToken"
]

View File

@ -0,0 +1,4 @@
class DelimiterNotFoundException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class DuplicateWordException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class OutOfDictionaryException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,4 @@
class SentenceTooLongException(Exception):
def __init__(self, *args: object) -> None:
super().__init__(*args)

View File

@ -0,0 +1,11 @@
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException
__all__ = [
"DelimiterNotFoundException",
"OutOfDictionaryException",
"DuplicateWordException",
"SentenceTooLongException"
]

View File

@ -0,0 +1,15 @@
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
from .special_regex_maker import special_regex_maker
from .default_special_tokens import default_special_tokens
__all__ = [
"special_regex_maker",
"iterator_with_checks",
"save_nanos_vocabulary",
"load_nanos_vocabulary",
"save_json", "load_json",
"default_special_tokens"
]

View File

@ -0,0 +1,4 @@
from ..Enums import SpecialToken
def default_special_tokens() -> list[str]:
return [token.value for token in SpecialToken]

View File

@ -0,0 +1,18 @@
import json
from pathlib import Path
def save_json(dictionary: dict, path: Path):
json_string = json.dumps(dictionary)
FILE = open(path, "w")
FILE.write(json_string)
FILE.close()
def load_json(path: Path) -> dict:
FILE = open(path, "r")
json_string = FILE.read()
FILE.close()
return json.loads(json_string)

View File

@ -0,0 +1,27 @@
from collections import deque
from typing import Generator, TypeVar
T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")
def iterator_with_checks(
generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:
# Here we can ignore to catch stop iteration
# we will propagate it
last_element = next(generator)
while True:
RETURN_ELEMENT = last_element
try:
element = next(generator)
last_element = element
yield (RETURN_ELEMENT, False)
except StopIteration:
yield (RETURN_ELEMENT, True)
break

View File

@ -0,0 +1,15 @@
import re
def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
"""compile a regex for the special token
Args:
special_tokens (list[str]): the list of special token
Returns:
re.Pattern:
"""
REGEX_STR = "|".join(special_tokens)
return re.compile(REGEX_STR)

View File

@ -0,0 +1,49 @@
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException
def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:
JSON: dict[str, int] = {}
for key, item in vocabulary.items():
TUPLE_STR = f"{key}"
JSON[TUPLE_STR] = item
return json.dumps(JSON)
def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:
JSON: dict[str, int] = json.loads(json_string)
VOCABULARY: dict[tuple[int, int], int] = {}
for key, item in JSON.items():
REDUCED_KEY = len(key) - 1
KEY_STR = key[1:REDUCED_KEY]
VOC_KEY = tuple(map(int, KEY_STR.split(",")))
if len(VOC_KEY) != 2:
raise OutOfDictionaryException()
# Checked for weird things above
VOCABULARY[VOC_KEY] = item # type: ignore
return VOCABULARY
def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):
json_string = nanos_vocabulary2json_str(vocabulary)
FILE = open(path, "w")
FILE.write(json_string)
FILE.close()
def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
FILE = open(path, "r")
json_string = FILE.read()
FILE.close()
return nanos_json_str2vocabulary(json_string)

View File

@ -0,0 +1,9 @@
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *
from . import Classes
from . import Enums
from . import Errors
from . import Utils

View File

@ -0,0 +1,11 @@
from ....Libs.Embedder.Classes.NanoSocratesEmbedder import NanoSocratesEmbedder
import torch
class BatchEmbedder(torch.nn.Module):
def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
super().__init__()
self.__embedder = NanoSocratesEmbedder(vocabulary_size,embedding_size)
def forward(self, )

View File

@ -0,0 +1,104 @@
import random
from typing import Generator
import pandas as pd
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Project_Model.Libs.Transformer.Classes.SpannedMasker import SpannedMasker
from TokenCompletation import TokenCompletationTransformer
from Project_Model.Libs.BPE.Enums.SpecialToken import SpecialToken
class Batcher:
def __init__(self, dataset_path: str, batch_size:int, tokenizer: BPE.TokeNanoCore, masker: SpannedMasker) -> None:
# ABSTRACT, TRIPLE
# tasks:
# rdf2text: X: TRIPLE, Y: ABSTRACT
# text2rdf: X: ABSTRACT, X:TRIPLE
# masking ( call masker): X: incomplete_triple Y: complete_triple (as exam)
# completation: X: TRIPLE SUBSET, Y: related TRIPLE SUBSET
self._dataset_path = dataset_path
self._batch_size = batch_size
self._tokenizer = tokenizer
self._masker = masker
sotl = self._tokenizer.encode(SpecialToken.START_TRIPLE_LIST.value)
eos = self._tokenizer.encode(SpecialToken.END_OF_SEQUENCE.value)
self._token_completation = TokenCompletationTransformer(sotl,eos)
def get_batch(self)-> Generator[pd.DataFrame]:
for batch in pd.read_csv(self._dataset_path, chunksize= int(self._batch_size/4)): #now we support 3 task
tokenized_batch = pd.DataFrame()
tokenized_batch[["Abstract","RDFs"]] = (
batch[["Abstract","RDFs"]]
.map(lambda t: self._tokenizer.encode(t))
)
rdf2txt_batch = self.__rdf2txt_transformation(tokenized_batch)
txt2rdf_batch = self.__txt2rdf_transformation(tokenized_batch)
mask_batch = self.__masking_trasformation(tokenized_batch)
completation_batch = self.__token_completation_task(tokenized_batch)
output = pd.concat([rdf2txt_batch,txt2rdf_batch,mask_batch,completation_batch],ignore_index=True)
output = output.sample(frac=1).reset_index(drop=True)
yield output
def __random_subset_rdfs(self, batch: pd.DataFrame, seed = 0):
# WIP
rng = random.Random(seed)
def to_list(x):
return x.split(SpecialToken.START_TRIPLE.value)[1:]
batch["RDFs"] = batch["RDFs"].map(
to_list
)
def __rdf2txt_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={"RDFs": "X", "Abstract": "Y"})
return batch[["X", "Y"]]
def __txt2rdf_transformation(self, batch: pd.DataFrame):
batch = batch.rename(columns={ "Abstract": "X","RDFs": "Y"})
return batch[["X", "Y"]]
def __masking_trasformation(self, batch: pd.DataFrame):
# mask_sequence: List[int] -> Tuple[List[int], List[int]]
xy_tuples = batch["RDFs"].apply(self._masker.mask_sequence) # Series of (X, Y)
output = batch.copy()
# Expand into two columns preserving the original index
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
return output[["X", "Y"]]
def __token_completation_task(self, batch: pd.DataFrame):
xy_tuples = batch["RDFs"].apply(self._token_completation.get_completation_tuple)
output = batch.copy()
output[["X", "Y"]] = pd.DataFrame(xy_tuples.tolist(), index=batch.index)
return output[["X", "Y"]]
"""
DATASET_PATH = "Assets/Dataset/Tmp/rdf_text.csv"
VOCABULARY_path = "Assets/Dataset/Tmp/trimmed.json"
from pathlib import Path
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
SPECIAL_LIST = BPE.default_special_tokens()
TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)
SPECIAL_TOKENS: set[int] = set(TOKENANO.encode("".join(SPECIAL_LIST)))
MASKER = SpannedMasker(TOKENANO.vocabulary_size,SPECIAL_TOKENS)
prova = "<ABS>Cactus Flower is a 1969 American screwball comedy film directed by Gene Saks, and starring Walter Matthau, Ingrid Bergman and Goldie Hawn, who won an Academy Award for her performance.The screenplay was adapted by I. A. L. Diamond from the 1965 Broadway play of the same title written by Abe Burrows, which, in turn, is based on the French play Fleur de cactus by Pierre Barillet and Jean-Pierre Gredy. Cactus Flower was the ninth highest-grossing film of 1969."
print(TOKENANO.encode(prova))
batcher = Batcher(DATASET_PATH,8,TOKENANO,MASKER)
for batch in batcher.get_batch():
print(batch)
"""

View File

@ -0,0 +1,33 @@
class TokenCompletationTransformer:
def __init__(self,SOTL_token,EOS_token, input_percent:float = 0.5) -> None:
self.__SOTL_token = SOTL_token
self.__EOS_token = EOS_token
self.__input_percent = input_percent
pass
def get_completation_tuple(
self,
token_sequence: list[int],
)-> tuple[list[int], list[int]]:
# split the sequence by encoded(<SOTL>), dont take the first, firts pertenge in as X the other as Y
sotl_count =int( token_sequence.count(self.__SOTL_token) * self.__input_percent)
sotl_index = 0
percent_index = 0
while sotl_index < sotl_count:
token = token_sequence[percent_index]
if token == self.__SOTL_token:
sotl_index += 1
percent_index+=1
percent_index = percent_index -1
x_list = token_sequence[:percent_index]
x_list.append(self.__EOS_token)
y_list = token_sequence[percent_index:]
return (x_list,y_list)

View File

@ -0,0 +1,8 @@
from enum import Enum, auto
class TaskType(Enum):
RDF2TXT = auto()
TEXT2RDF = auto()
MASKING = auto()
COMPLETATION = auto()

View File

@ -0,0 +1,23 @@
import torch
from ..Utils import fixed_positional_encoding
# WIP FOR BATCHING
class NanoSocratesEmbedder(torch.nn.Module):
def __init__(self, vocabulary_size: int, embedding_size: int) -> None:
super().__init__()
self.__embedder = torch.nn.Embedding(vocabulary_size, embedding_size)
def forward(self, tokenized_sentence: torch.Tensor) -> torch.Tensor:
computed_embeddings: torch.Tensor = self.__embedder(tokenized_sentence)
_, SENTENCE_LENGHT, EMBEDDING_SIZE = computed_embeddings.shape # for batching
POSITIONAL_ENCODINGS = fixed_positional_encoding(
SENTENCE_LENGHT, EMBEDDING_SIZE
)
computed_embeddings = computed_embeddings + POSITIONAL_ENCODINGS # for batching
return computed_embeddings

View File

@ -0,0 +1,5 @@
from .NanoSocratesEmbedder import NanoSocratesEmbedder
__all__ = [
"NanoSocratesEmbedder"
]

View File

@ -0,0 +1,5 @@
from .fixed_positional_encoding import fixed_positional_encoding
__all__ = [
"fixed_positional_encoding"
]

View File

@ -0,0 +1,28 @@
import torch
def fixed_positional_encoding(
sentence_dimension: int,
embedding_dimension: int,
) -> torch.Tensor:
BIG_CONST = int(1e4)
INITIAL_ENCODING = torch.tensor([i for i in range(0, sentence_dimension)])
ENCODINGS: list[torch.Tensor] = []
for i in range(0, embedding_dimension):
EMBEDDING_POSITION = i
# Note: The original paper did not specify
# to compute: pos mod 2!!
DIVISOR = BIG_CONST ** ((2 * (EMBEDDING_POSITION // 2)) / embedding_dimension)
INTERMEDIATE_ENCODING = INITIAL_ENCODING / DIVISOR
if EMBEDDING_POSITION % 2 == 0:
ENCODINGS.append(torch.sin(INTERMEDIATE_ENCODING))
continue
ENCODINGS.append(torch.cos(INTERMEDIATE_ENCODING))
return torch.stack(ENCODINGS).transpose(0, 1)

View File

@ -0,0 +1,7 @@
from .Utils import *
from .Classes import *
from . import Utils
from . import Classes

View File

@ -0,0 +1,5 @@
from .get_default_device import get_default_device
__all__ = [
"get_default_device"
]

View File

@ -0,0 +1,17 @@
import torch
def get_default_device() -> torch.device:
# Cuda or ROCm
if torch.cuda.is_available():
return torch.device("cuda")
# Intel GPUs
if torch.xpu.is_available():
return torch.device("xpu")
# Apple GPUs
if torch.backends.mps.is_available():
return torch.device("mps")
return torch.device("cpu")

View File

@ -0,0 +1,7 @@
from .Utils import *
from .Utils import get_default_device
__all__ = [
"get_default_device"
]

View File

@ -0,0 +1,41 @@
import numpy as np
# custom LR from attention is all you need
class Custom_lr():
def __init__(self, d_model: int, warmup_step:int) -> None:
self.__d_model = d_model
self.__warmup_step = warmup_step
self.__epoch = 0
def step(self) -> int:
self.__epoch += 1
return (self.__d_model ** -0.5) * min(self.__epoch ** -0.5,
self.__epoch * (self.__warmup_step ** -1.5))
# OTHER LR
# Learning rate schedules (matching visualization parameters)
def step_lr(epoch, lr):
# StepLR: step_size=20, gamma=0.5 (from visualization)
return lr * 0.5 if epoch % 20 == 0 and epoch > 0 else lr
def exp_lr(epoch, lr):
# ExponentialLR: gamma=0.95 (from visualization)
return lr * 0.95
def cosine_lr(epoch, lr):
# CosineAnnealingLR: lr_min=0.001, lr_max=0.1, max_epochs=100 (from visualization)
lr_min, lr_max = 0.001, 0.1
max_epochs = 100
return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch * np.pi / max_epochs))
def cyclical_lr(epoch, lr):
# CyclicalLR: base_lr=0.001, max_lr=0.1, step_size=20 (from visualization)
base_lr = 0.001
max_lr = 0.1
step_size = 20
cycle = np.floor(1 + epoch / (2 * step_size))
x = np.abs(epoch / step_size - 2 * cycle + 1)
return base_lr + (max_lr - base_lr) * max(0, (1 - x))

View File

@ -0,0 +1,42 @@
import torch
class LogitsCollector:
def __init__(self, pad_token: int, end_token: int, tokenizer) -> None:
self.__pad_token = pad_token # used to skip PAD
self.__end_token = end_token # used to stop at END
self.__tokenizer = tokenizer # exposes .decode(list[int]) -> str
self.__steps: list[torch.Tensor] = [] # list of per-step logits [B,V]
def reset(self) -> None:
self.__steps.clear() # clear history
def add(self, logits_step: torch.Tensor) -> None:
if logits_step.dim() == 3: # handle [B,1,V]
logits_step = logits_step[:, -1, :] # -> [B,V]
self.__steps.append(logits_step.detach()) # store raw logits (detached)
def tokens(self) -> list[list[int]]:
if not self.__steps:
return []
stack = torch.stack(self.__steps, dim=0) # [T,B,V]
probs = torch.softmax(stack, dim=-1) # softmax over vocab -> [T,B,V]
ids = probs.argmax(dim=-1).transpose(0, 1) # greedy ids -> [B,T]
out: list[list[int]] = []
for row in ids.tolist():
seq: list[int] = []
for tok in row:
if tok == self.__end_token: # stop on END
break
if tok == self.__pad_token: # skip PAD
continue
seq.append(tok)
out.append(seq)
return out
def print_decoded(self) -> None:
for i, seq in enumerate(self.tokens()):
try:
text = self.__tokenizer.decode(seq) # decode tokens to string
except Exception:
text = str(seq) # fallback to ids
print(f"[{i}] {text}") # simple print

View File

View File

@ -0,0 +1,19 @@
import torch
class DeToken(torch.nn.Module):
def __init__(self, embedding_size: int, vocabulary_size: int) -> None:
super().__init__()
self.__linear = torch.nn.Linear(embedding_size, vocabulary_size)
def forward(self, x: torch.Tensor) -> torch.Tensor:
# 1) Go from latent space to vocabularu space
x = self.__linear(x)
# 2) Go to logits
# x = torch.softmax(x, 2)
return x

View File

@ -0,0 +1,103 @@
import torch
import torch.nn as nn
from .FeedForwardNetwork import FeedForwardNetwork
from .TorchMultiHeadAttention import TorchMultiHeadAttention as MultiHeadAttention
from ..Utils.attention_mask import get_causal_attention_mask
# B, L(T), E_D
class Decoder(nn.Module):
def __init__(
self,
embedding_dimension: int,
feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int,
) -> None:
super().__init__()
self.__masked_attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_1 = nn.LayerNorm(embedding_dimension)
self.__cross_attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_2 = nn.LayerNorm(embedding_dimension)
self.__dropout = nn.Dropout(0.1)
self.__feed_forward_network = FeedForwardNetwork(
embedding_dimension, feed_forward_hidden_layer_dimension
)
self.__layer_norm_3 = nn.LayerNorm(embedding_dimension)
def forward(
self,
args: tuple[
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor,
torch.Tensor
]
): # -> list[torch.Tensor]: # k_x = v_x . While x_q = x
# WARNING: args is needed to have sequential
x, k_x, v_x, padding_mask,encoder_padding_mask = args
# build of attention mask
attention_mask = get_causal_attention_mask(x.size(1))
# 1) Masked Attention
MASKED_ATTENTION = self.__masked_attention(
x, x, x, key_padding_mask=padding_mask, attention_mask=attention_mask
)
# 2) Dropout
# DROPPED_MASKED_ATTENTION = self.__dropout(MASKED_ATTENTION)
# del MASKED_ATTENTION
# 3) Residual Connection
x = x + MASKED_ATTENTION
del MASKED_ATTENTION
# 4) Layer Normalization
x = self.__layer_norm_1(x)
# 5) Encoderdecoder (cross) attention
CROSS_ATTENTION = self.__cross_attention(
x, k_x, v_x, key_padding_mask=encoder_padding_mask
)
# 6) Dropout
# DROPPED_CROSS_ATTENTION = self.__dropout(CROSS_ATTENTION)
# del CROSS_ATTENTION
# 7) Residual Connection
x = x + CROSS_ATTENTION
del CROSS_ATTENTION
# 8) Layer Normalization
x = self.__layer_norm_2(x)
# 9) Position-wise feed-forward
FEED_FORWARD = self.__feed_forward_network(x)
# 10) Dropout
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
# del FEED_FORWARD
# 11) Residual Connection
x = x + FEED_FORWARD
del FEED_FORWARD
# 12) Layer Normalization
x = self.__layer_norm_3(x)
return (x, k_x, v_x, padding_mask, encoder_padding_mask)
# use eval to disable dropout ecc

View File

@ -0,0 +1,73 @@
import torch
import torch.nn as nn
from Project_Model.Libs.Transformer.Classes.FeedForwardNetwork import FeedForwardNetwork
from Project_Model.Libs.Transformer.Classes.TorchMultiHeadAttention import (
TorchMultiHeadAttention as MultiHeadAttention,
)
class Encoder(
nn.Module
): # in this way we expose the primitive of nn.Module for training purpose
def __init__(
self,
embedding_dimension: int,
feed_forward_hidden_layer_dimension: int,
number_of_attention_heads: int,
) -> None:
super().__init__()
self.__attention = MultiHeadAttention(
embedding_dimension, number_of_attention_heads, dropout=0.1
)
self.__layer_norm_1 = nn.LayerNorm(
embedding_dimension
) # norm of first "Add and Normalize"
self.__feed_forward = FeedForwardNetwork(
embedding_dimension, feed_forward_hidden_layer_dimension
)
self.__layer_norm_2 = nn.LayerNorm(
embedding_dimension
) # norm of second "Add and Normalize"
self.__dropout = nn.Dropout(0.1) # ...
def forward(self, args: tuple[torch.Tensor, torch.Tensor]):
# WARNING: args is needed to have sequential
x, padding_mask = args
# -> ATTENTION -> dropout -> add and normalize -> FF -> dropout -> add and normalize ->
# Attention with Residual Connection [ input + self-attention]
# 1) Multi Head Attention
ATTENTION = self.__attention(x, x, x, key_padding_mask=padding_mask)
# 2) Dropout
# DROPPED_ATTENTION = self.__dropout(ATTENTION)
# del ATTENTION
# 3) Residual Connection
x = x + ATTENTION
del ATTENTION
# 4) Layer Normalization
x = self.__layer_norm_1(x)
# 5) Feed Forward
FEED_FORWARD = self.__feed_forward(x)
# 6) Dropout
# DROPPED_FEED_FORWARD = self.__dropout(FEED_FORWARD)
# del FEED_FORWARD
# 7) Residual Connection
x = x + FEED_FORWARD
del FEED_FORWARD
# 8) Layer Normalization
x = self.__layer_norm_2(x)
return (x, padding_mask)
# use eval to disable dropout ecc

View File

@ -0,0 +1,43 @@
# it is position wise!
# https://stackoverflow.com/questions/74979359/how-is-position-wise-feed-forward-neural-network-implemented-for-transformers
# Why do we need a fixed size
# https://ai.stackexchange.com/questions/37624/why-do-transformers-have-a-fixed-input-length
import torch.nn as nn
class FeedForwardNetwork(nn.Module):
def __init__(self, embedding_size: int, feed_forward_hidden_layer_dimension: int):
super().__init__()
self.__fully_connected_1 = nn.Linear(
embedding_size, feed_forward_hidden_layer_dimension
) # expand in higher dimension
self.__relu = nn.ReLU()
self.__dropout = nn.Dropout(
0.1
) # during training we drop something, with eval it got deactivated
self.__fully_connected_2 = nn.Linear(
feed_forward_hidden_layer_dimension, embedding_size
) # return into the model dimension
def forward(self, x):
# -> NN1 -> RELU -> (Droput during training) -> NN2 ->
# 1) Linear Layer
x = self.__fully_connected_1(x)
# 2) ReLU
x = self.__relu(x)
# 3) Dropout
x = self.__dropout(x)
# 4) Linear Layer
x = self.__fully_connected_2(x)
return x

View File

@ -0,0 +1,23 @@
import torch
from NanoSocratesCore import NanoSocratesCore
class NanoSocrates(torch.nn.Module):
def __init__(self,
embedded_size: int,
feed_forward_dim: int,
encoder_layers: int,
decoder_layers:int,
attention_heads: int,
vocab_size: int) -> None:
super().__init__()
self._model = NanoSocratesCore(
embedded_size,
feed_forward_dim,
encoder_layers,
decoder_layers,
attention_heads,
vocab_size)

View File

@ -0,0 +1,109 @@
from ..Utils.task_type import TaskType
from .Decoder import Decoder
from .Encoder import Encoder
from ....Libs.Embedder import NanoSocratesEmbedder
import torch
class NanoSocratesCore(torch.nn.Module):
def __init__(
self,
sentence_length: int,
vocab_size: int,
embedding_size: int = 256,
feed_forward_multiplier: int = 4,
num_encoder_layers: int = 2,
num_decoder_layers: int = 2,
num_attention_heads: int = 4,
pad_token: int = 0,
) -> None:
super().__init__()
self.__pad_token = pad_token
feed_forward_dim = embedding_size * feed_forward_multiplier
self.__sentence_length = sentence_length
self.__encoder_sequence = torch.nn.Sequential(
*[
Encoder(embedding_size, feed_forward_dim, num_attention_heads)
for _ in range(num_encoder_layers)
]
)
# * unpack the list so that each encoder has its own weights
self.__decoder_sequence = torch.nn.Sequential(
*[
Decoder(embedding_size, feed_forward_dim, num_attention_heads)
for _ in range(num_decoder_layers)
]
)
self.__linear = torch.nn.Linear(embedding_size, vocab_size)
self.__input_embeder = NanoSocratesEmbedder(vocab_size, embedding_size)
self.__output_embedder = NanoSocratesEmbedder(vocab_size, embedding_size)
@torch.no_grad() # inference only
def forward(
self,
encoder_input: list[list[int]],
decoder_input: list[list[int]], # must start with <SOS> and PAD elsewhere
encoder_padding_mask: list[list[bool]], # True where encoder is PAD
):
# 1) Embed User-Input for encoders
ENCODER_INPUT = self.__input_embeder(encoder_input) # [B,S,E]
# 2) Encode User-Input
ENCODER_OUTPUT, encoder_padding_mask = self.__encoder_sequence(
(ENCODER_INPUT, encoder_padding_mask) # as tuple
) # [B,S,E], [B,S]
del ENCODER_INPUT
# 3) Autoregressive Output (greedy)
LOGITS_HISTORY: list[torch.Tensor] = [] # keep per-step distributions
decoder_token_list = [row[:] for row in decoder_input] # copy tokens
decoder_phase = 0
exit_loop = False
while not exit_loop:
decoder_phase += 1 # move to next position
# 3.1) Build decoder key padding mask from current tokens (True where PAD)
DECODER_KEY_PADDING_MASK: list[list[bool]] = [
[tok == self.__pad_token for tok in row] for row in decoder_token_list
] # [B,T]
# 3.2) Embed Decoder Input (full sequence; decoder builds causal mask inside)
DECODER_INPUT = self.__output_embedder(decoder_token_list) # [B,T,E]
# 3.3) Decode (self-attn uses causal mask internally; we provide PAD masks)
DECODER_OUTPUT, _, _, _ = self.__decoder_sequence(
(DECODER_INPUT, ENCODER_OUTPUT, ENCODER_OUTPUT,
DECODER_KEY_PADDING_MASK, encoder_padding_mask)
) # [B,T,E]
del DECODER_INPUT
# 3.4) Project to token space
LOGITS = self.__linear(DECODER_OUTPUT) # [B,T,V]
del DECODER_OUTPUT
# 3.5) Probabilities and greedy pick at current step
TOKEN_PROBABILITIES = torch.softmax(LOGITS, dim=-1) # [B,T,V]
LOGITS_HISTORY.append(TOKEN_PROBABILITIES) # store for this step
step_idx = decoder_phase - 1 # 0-based
TOKEN_IDS = TOKEN_PROBABILITIES[:, step_idx, :].argmax(dim=-1).tolist() # [B] -> list[int]
# 3.6) Write prediction into next slot (the slot is PAD)
if step_idx + 1 < self.__sentence_length:
for b, tok in enumerate(TOKEN_IDS):
decoder_token_list[b][step_idx + 1] = tok # feed next position
# 3.7) Stop when we filled the sequence
if decoder_phase == self.__sentence_length - 1:
exit_loop = True
return LOGITS_HISTORY # list of [B,T,V] (per step)

View File

@ -0,0 +1,213 @@
import math
import random
import sys
class SpannedMasker:
def __init__(
self,
max_vocabulary: int,
forbidden_tokens: set[int],
change_token_probability: float = 0.15,
average_span: int = 1,
seed: int = random.randint(0, sys.maxsize),
) -> None:
if change_token_probability < 0 or change_token_probability > 1:
raise ValueError("received a value that is not between 0 or 1")
self.__change_token_probability = change_token_probability
self.__average_span = average_span
self.__rng = random.Random(seed)
self.__max_vocabulary = max_vocabulary
self.__forbidden_tokens = forbidden_tokens
def mask_sequence(
self,
token_sequence: list[int],
) -> tuple[list[int], list[int]]:
MASK = self.__create_mask(token_sequence, self.__forbidden_tokens)
MASKED = self.__create_masked_input(token_sequence, MASK, self.__max_vocabulary)
TARGET = self.__create_target(token_sequence, MASK, self.__max_vocabulary)
return (MASKED, TARGET)
def __number_of_spans(self, legal_token_number: int):
EXPECTED_NUM_OF_CORRUPTED_TOKENS = self.__number_of_corrupted_tokens(legal_token_number)
return math.ceil(EXPECTED_NUM_OF_CORRUPTED_TOKENS / self.__average_span)
def __number_of_corrupted_tokens(self, legal_token_number: int):
EXPECTED_NUM_OF_CORRUPTED_TOKENS = math.ceil(
legal_token_number * self.__change_token_probability
)
return EXPECTED_NUM_OF_CORRUPTED_TOKENS
def __create_mask(self, sequence: list[int], forbidden_tokens: set[int]) -> list[bool]:
SEQ_LEN = len(sequence)
LEGAL_TOKENS = self.__count_legal_tokens(sequence, forbidden_tokens)
NUM_OF_CORRUPTIONS = self.__number_of_corrupted_tokens(LEGAL_TOKENS)
NUM_OF_SPANS = self.__number_of_spans(LEGAL_TOKENS)
MASK = [False] * SEQ_LEN
mask_index = 0
number_of_spans = 0
exit_loop = False
while not exit_loop:
TOKEN = sequence[mask_index]
MASKED = MASK[mask_index]
SHOULD_MASK = self.__random_mask()
skip = False
if self.__is_illegal_token(TOKEN, forbidden_tokens):
skip = True
if MASKED:
skip = True
if not SHOULD_MASK:
skip = True
if skip:
mask_index = (mask_index + 1) % SEQ_LEN
continue
CANDIDATE_SPAN = self.__random_span(
self.__average_span
)
REMAINING_MASK = SEQ_LEN - (mask_index + 1)
SPAN_LENGTH = min(CANDIDATE_SPAN, REMAINING_MASK)
for _ in range(0, SPAN_LENGTH):
INNER_TOKEN = sequence[mask_index]
if self.__is_illegal_token(INNER_TOKEN, forbidden_tokens):
continue
MASK[mask_index] = True
mask_index += 1
number_of_spans += 1
mask_index += 1
if number_of_spans == NUM_OF_SPANS:
exit_loop = True
continue
if mask_index >= SEQ_LEN - 1:
exit_loop = True
continue
return MASK
def __create_masked_input(self, sequence: list[int], mask: list[bool], max_voc: int) -> list[int]:
OUT: list[int] = []
mask_token_id = max_voc + 1
index = 0
while index < len(sequence):
TOKEN = sequence[index]
MASKED = mask[index]
if not MASKED:
OUT.append(
TOKEN
)
index += 1
continue
MASK_TOKEN = mask_token_id
OUT.append(
MASK_TOKEN
)
while mask[index]:
index += 1
mask_token_id += 1
return OUT
def __create_target(self, sequence: list[int], mask: list[bool], max_voc: int) -> list[int]:
OUT: list[int] = []
mask_token_id = max_voc + 1
index = 0
while index < len(sequence):
TOKEN = sequence[index]
MASKED = mask[index]
if MASKED:
OUT.append(
TOKEN
)
index += 1
continue
MASK_TOKEN = mask_token_id
OUT.append(
MASK_TOKEN
)
while index < len(mask) and not mask[index]:
index += 1
mask_token_id += 1
return OUT
def __is_illegal_token(self, token: int, illegal_voc: set[int]) -> bool:
if token in illegal_voc:
return True
return False
def __count_legal_tokens(self, sequence: list[int], illegal_voc: set[int]) -> int:
legal_count = 0
for token in sequence:
if self.__is_illegal_token(token, illegal_voc):
continue
legal_count += 1
return legal_count
def __random_mask(self) -> bool:
if self.__random_probability() > self.__change_token_probability:
return False
return True
def __random_probability(self) -> float:
return self.__rng.random()
def __random_token(self, max_vocabulary: int) -> int:
return self.__rng.randint(0, max_vocabulary)
def __random_int_range(self, min: int, max: int) -> int:
return self.__rng.randint(min, max)
def __random_span(self, average: int) -> int:
candidate_span = self.__rng.gauss(mu=average)
candidate_span = max(1, candidate_span)
candidate_span = round(candidate_span)
return candidate_span

View File

@ -0,0 +1,77 @@
import random
import sys
class TokenMasker:
def __init__(
self,
change_token_probability: float = 0.15,
mask_token_probability: float = 0.8,
random_token_prob: float = 0.1,
seed: int = random.randint(0, sys.maxsize),
) -> None:
if change_token_probability < 0 or change_token_probability > 1:
raise ValueError("received a value that is not between 0 or 1")
if mask_token_probability < 0 or mask_token_probability > 1:
raise ValueError("received a value that is not between 0 or 1")
if random_token_prob < 0 or random_token_prob > 1:
raise ValueError("received a value that is not between 0 or 1")
if mask_token_probability + random_token_prob > 1:
raise ValueError("The sum of probabilities is over 1")
self.__change_token_probability = change_token_probability
self.__mask_token_probability = mask_token_probability
self.__random_token_prob = random_token_prob
self.__rng = random.Random(seed)
def mask_sequence(
self, token_sequence: list[int], max_vocabulary: int, mask_id: int
) -> list[int]:
if mask_id <= max_vocabulary:
raise ValueError("mask_id is a value of vocabulary")
MASKED_SEQUENCE: list[int] = []
for token in token_sequence:
if token > max_vocabulary:
MASKED_SEQUENCE.append(token)
continue
MASKED_TOKEN = self.__mask(token, max_vocabulary, mask_id)
MASKED_SEQUENCE.append(MASKED_TOKEN)
return MASKED_SEQUENCE
def __mask(self, token: int, max_vocabulary: int, mask_id: int) -> int:
if self.__random_probability() > self.__change_token_probability:
return token
MASK_TOKEN_TRESH = self.__mask_token_probability
RANDOM_TOKEN_TRESH = MASK_TOKEN_TRESH + self.__random_token_prob
CHANCE_PROBABILITY = self.__random_probability()
# It's over both probabilities, return same token
if CHANCE_PROBABILITY > RANDOM_TOKEN_TRESH:
return token
# It's over masking treshold, but lower than random
# return random token
if CHANCE_PROBABILITY > MASK_TOKEN_TRESH:
return self.__random_token(max_vocabulary)
# It's below masking treshold, mask token
return mask_id
def __random_probability(self) -> float:
return self.__rng.random()
def __random_token(self, max_vocabulary: int) -> int:
return self.__rng.randint(0, max_vocabulary)

View File

@ -0,0 +1,47 @@
import torch
import torch.nn as nn
from typing import Optional
class TorchMultiHeadAttention(nn.Module):
def __init__(
self,
embedding_dimension: int,
number_of_attention_heads: int,
dropout: float = 0.0
):
super().__init__()
self.attention = torch.nn.MultiheadAttention(
embedding_dimension,
num_heads=number_of_attention_heads,
dropout=dropout,
batch_first=True,
)
def forward(
self,
x_q: torch.Tensor,
x_k: torch.Tensor,
x_v: torch.Tensor,
key_padding_mask=None,
attention_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
# x * Wq -> Q
# x * Wk -> K
# x * Wv -> V
# REMEMBER: tochAttention uses Batch internally to build the 3 dimension attention mask given the 2 dimension
y, _ = self.attention(
x_q, x_k, x_v, attn_mask=attention_mask, key_padding_mask=key_padding_mask,
need_weights=False
)
return y
# batch_first=False (default storico)
# Formato: (L, N, E)
# L = lunghezza della sequenza (time/posizioni)
# N = batch size
# E = dimensione d_model (embed)
# batch_first=True
# Formato: (N, L, E) (più naturale per molti modelli)

View File

@ -0,0 +1,16 @@
from .Decoder import Decoder
from .Encoder import Encoder
from .FeedForwardNetwork import FeedForwardNetwork
# from .MultiHeadAttention import MultiheadAttention
from .TorchMultiHeadAttention import TorchMultiHeadAttention
from .SpannedMasker import SpannedMasker
from .DeToken import DeToken
__all__ = [
"Decoder",
"Encoder",
"FeedForwardNetwork",
"TorchMultiHeadAttention",
"SpannedMasker",
"DeToken"
]

View File

@ -0,0 +1,72 @@
import torch
import Project_Model.Libs.Embedder as Embedder
from ..Classes import Encoder, Decoder, DeToken
class TrainingModel(torch.nn.Module):
def __init__(
self,
vocabulary_size: int,
latent_space: int = 256,
feed_forward_multiplier: int = 4,
attention_heads: int = 4,
layer_number: int = 2,
) -> None:
super().__init__()
feed_forward_latent_space = latent_space * feed_forward_multiplier
self.__encoder_embedder = Embedder.NanoSocratesEmbedder(
vocabulary_size, latent_space
)
self.__decoder_embedder = Embedder.NanoSocratesEmbedder(
vocabulary_size, latent_space
)
# do NOT share layer weights
enc_layers = [
Encoder(latent_space, feed_forward_latent_space, attention_heads)
for _ in range(layer_number)
]
dec_layers = [
Decoder(latent_space, feed_forward_latent_space, attention_heads)
for _ in range(layer_number)
]
self.__encoder = torch.nn.Sequential(*enc_layers)
self.__decoder = torch.nn.Sequential(*dec_layers)
self.__detokener = DeToken(latent_space, vocabulary_size)
def forward(
self,
args: tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]
):
# returns logits for the LAST decoder position only -> [B, V]
(
encoder_embedder_input, # [B,S] encoder tokens
encoder_padding_mask, # [B,S] True where encoder is PAD
decoder_embedder_prefix, # [B,Tp] decoder prefix (e.g., <SOS> + tokens so far)
decoder_padding_mask, # [B,Tp] True where decoder prefix has PAD
) = args
# 1) embeddings
encoder_tensor = self.__encoder_embedder(encoder_embedder_input) # [B,S,E]
decoder_tensor = self.__decoder_embedder(decoder_embedder_prefix) # [B,Tp,E]
# 2) encode
encoder_output, _ = self.__encoder((encoder_tensor, encoder_padding_mask)) # [B,S,E], [B,S]
# 3) decode (causal mask is built inside the decoder)
decoder_output, _, _, _, _ = self.__decoder(
(decoder_tensor, encoder_output, encoder_output,
decoder_padding_mask, encoder_padding_mask)
) # [B,Tp,E], ...
# 4) project only the last time step
last_hidden = decoder_output[:, -1:, :] # [B,1,E]
step_logits = self.__detokener(last_hidden) # [B,1,V]
step_logits = step_logits[:, -1, :] # [B,V]
return step_logits # logits for one token

View File

@ -0,0 +1,5 @@
from .TrainingModel import TrainingModel
__all__ = [
"TrainingModel"
]

View File

@ -0,0 +1,17 @@
from .attention_mask import get_causal_attention_mask,get_causal_attention_mask_batched
from .task_type import TaskType
from .post_tokenization import truncate_sequence, pad_sequence, normalize_sequence, create_padding_mask
from .inference_masking import inference_masking
from .truncate_rdf_list import truncate_rdf_list
__all__ = [
"TaskType",
"get_causal_attention_mask",
"get_causal_attention_mask_batched",
"truncate_sequence",
"pad_sequence",
"create_padding_mask",
"normalize_sequence",
"inference_masking",
"truncate_rdf_list"
]

View File

@ -0,0 +1,11 @@
import torch
def get_causal_attention_mask(seq_len: int) -> torch.Tensor:
return torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
# there is no need for this since MultiHeadAttention of torch does this internally
def get_causal_attention_mask_batched(seq_len: int, batch_size: int ) -> torch.Tensor:
base_mask = get_causal_attention_mask(seq_len)
return base_mask.unsqueeze(0).expand(batch_size, -1, -1) # add another dimension at the beginning, big as batch_size
# the result is that z,x,y where x,y are repeated along z

View File

@ -0,0 +1,13 @@
def inference_masking(sequence: list[int], mask_token: int, max_vocabulary: int) -> list[int]:
current_mask_token = max_vocabulary + 1
for i in range(0, len(sequence)):
if sequence[i] != mask_token:
continue
sequence[i] = current_mask_token
current_mask_token += 1
return sequence

View File

@ -0,0 +1,56 @@
def truncate_sequence(
sequence: list[int], truncate_at: int, end_token: int
) -> list[int]:
if len(sequence) < truncate_at - 1:
sequence.append(end_token)
return sequence
if len(sequence) < truncate_at:
sequence[-1] = end_token
return sequence
TRUNCATED_SEQUENCE = sequence[:truncate_at]
TRUNCATED_SEQUENCE[-1] = end_token
return TRUNCATED_SEQUENCE
def pad_sequence(sequence: list[int], pad_until: int, pad_token: int) -> list[int]:
if not (len(sequence) < pad_until):
return sequence
NUM_OF_PADDINGS = pad_until - len(sequence)
PADDINGS = [pad_token] * NUM_OF_PADDINGS
PADDED_SEQUENCE = sequence[:]
PADDED_SEQUENCE.extend(PADDINGS)
return PADDED_SEQUENCE
def create_padding_mask(sequence: list[int], pad_token: int) -> list[bool]:
PADDING_MASK = [False] * len(sequence)
for i in range(0, len(sequence)):
if sequence[i] != pad_token:
continue
PADDING_MASK[i] = True
return PADDING_MASK
def normalize_sequence(
sequence: list[int],
max_length: int,
pad_token: int,
end_token: int,
) -> tuple[list[int], list[bool]]:
new_sequence = truncate_sequence(sequence, max_length, end_token)
new_sequence = pad_sequence(new_sequence, max_length, pad_token)
PADDING_MASK = create_padding_mask(new_sequence, pad_token)
return (new_sequence, PADDING_MASK)

View File

@ -0,0 +1,6 @@
from enum import Enum, auto
class TaskType(Enum):
RDF2TEXT = auto()
MASK = auto()
COMPLETATION = auto()

View File

@ -0,0 +1,65 @@
from collections import deque
import random
import sys
def truncate_rdf_list(
sequence: list[int],
truncation_probability: float,
continue_triple_token: int,
end_of_triple_token: int,
seed: int = random.randint(0, sys.maxsize),
) -> tuple[list[int], list[int]]:
if truncation_probability < 0 or truncation_probability > 1:
raise ValueError("A probability must be between 0 and 1")
RNG = random.Random(seed)
END_OF_TRIPLES: deque[int] = deque()
for i in range(0, len(sequence)):
TOKEN = sequence[i]
if TOKEN != end_of_triple_token:
continue
END_OF_TRIPLES.append(i + 1)
TRIPLES_TOKENS: list[int] = []
TARGET_TRIPLES: list[int] = []
start_of_triple = 0
exit_loop = False
while not exit_loop:
EOT = END_OF_TRIPLES.popleft()
TRIPLE = sequence[start_of_triple:EOT]
TRIPLES_TOKENS.extend(TRIPLE)
start_of_triple = EOT
if RNG.random() < truncation_probability:
exit_loop = True
if len(END_OF_TRIPLES) == 1:
exit_loop = True
TRIPLES_TOKENS.append(
continue_triple_token
)
while len(END_OF_TRIPLES) > 0:
EOT = END_OF_TRIPLES.popleft()
TRIPLE = sequence[start_of_triple:EOT]
TARGET_TRIPLES.extend(TRIPLE)
start_of_triple = EOT
return (TRIPLES_TOKENS, TARGET_TRIPLES)

View File

@ -0,0 +1,7 @@
from .Classes import *
from .Utils import *
from .Models import *
from . import Classes
from . import Utils
from . import Models

View File

@ -0,0 +1,4 @@
from . import BPE
from . import Embedder
from . import Transformer
from . import TorchShims

View File

@ -0,0 +1,74 @@
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
class TestBPE:
def test_bpe_encoding_simple(self):
TEXT = "abababab"
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = [258]
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
ENCODED = BPE_ENCODER.encode(TEXT)
assert len(ENCODED) == len(EXPECTED)
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
def test_bpe_decoding_simple(self):
INPUT = [258]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = "abababab"
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
DECODED = BPE_ENCODER.decode(INPUT)
assert len(DECODED) == len(EXPECTED)
for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected
def test_bpe_decoding_edge_1(self):
INPUT = [258, ord("c")]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
EXPECTED = "ababababc"
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
DECODED = BPE_ENCODER.decode(INPUT)
assert len(DECODED) == len(EXPECTED)
for encoded, expected in zip(DECODED, EXPECTED):
assert encoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
# TestBPE().test_bpe_decoding_simple()
TestBPE().test_bpe_encoding_simple()

View File

@ -0,0 +1,77 @@
from pathlib import Path
from Project_Model.Libs.BPE.Enums import TokenType
import Project_Model.Libs.BPE as BPE
import re
CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")
class TestTrainBPE:
def test_bpe_train_encoding_simple(self):
TRAINER = BPE.NanoSocraTrainerPool(
int(32E3),
["<SOT>", "<EOT>"]
)
TEXT = "abababab"
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt")
EXPECTED = [258]
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
BPE_ENCODER = TRAINER.trainBPE(
TEXT_PATH,
CACHE_DIR_PATH
)
ENCODED = BPE_ENCODER.encode(TEXT)
assert len(ENCODED) == len(EXPECTED)
for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected
def test_bpe_train_encoding_and_decoding(self):
SPECIAL_LIST = ["<ABS>", "<SOTL>"]
TRAINER = BPE.NanoSocraTrainerPool(
int(32E3),
SPECIAL_LIST
)
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
FILE = open(TEXT_PATH)
TEXT = FILE.read()
FILE.close()
EXPECTED = TEXT
# ab = 256
# 256, 256 = 257
# 257, 257 = 258
BPE_ENCODER = TRAINER.trainBPE(
TEXT_PATH,
CACHE_DIR_PATH
)
VOCABULARY = BPE_ENCODER.vocabulary
TOKENANO = BPE.TokeNanoCore(VOCABULARY,SPECIAL_LIST)
ENCODED = TOKENANO.encode(TEXT)
DECODED = TOKENANO.decode(ENCODED)
assert len(DECODED) == len(EXPECTED)
for decoded, expected in zip(DECODED, EXPECTED):
assert decoded == expected
# Useful to debug weird cases
if __name__ == "__main__":
# TestTrainBPE().test_bpe_train_encoding_simple()
TestTrainBPE().test_bpe_train_encoding_and_decoding()

Some files were not shown because too many files have changed in this diff Show More