NanoSocrates/Scripts/Training/dictionary_adjuster.py

# Trim the large trained vocabulary (mad_cache.json) down to a shorter one.
from itertools import islice
from pathlib import Path

from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary

# Input vocabulary to trim and destination for the shortened copy.
DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"

# Number of leading vocabulary entries kept in the trimmed output.
MAX_ENTRIES = 31744


# Load the full vocabulary, keep only its first MAX_ENTRIES items (in the
# mapping's iteration order), and write the result to OUTPUT_PATH.
# NOTE(review): presumably iteration order matches the order in which the
# entries were learned during training — confirm against the loader.
big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))

# islice takes the prefix lazily, avoiding a throwaway list of every item.
big_dict = dict(islice(big_dict.items(), MAX_ENTRIES))

save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
Little snippet to trim big dictionaries (2025-10-07 16:05:32 +02:00)
			`# to cut the mad trained dict into a short one`
			`from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary`
			`from pathlib import Path`

			`DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"`
			`OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"`


			`big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))`
			`big_dict = dict(list(big_dict.items())[:31744])`

			`save_nanos_vocabulary(big_dict,Path(OUTPUT_PATH))`