NanoSocrates/Scripts/Training/dictionary_adjuster.py

# Trims the large trained ("mad") vocabulary cache down to a shorter one.
from itertools import islice
from pathlib import Path

from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary

# Input: the full trained vocabulary cache. Output: the trimmed copy.
DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"

# Number of entries kept in the trimmed vocabulary (31_744 == 31 * 1024).
# NOTE(review): presumably sized to leave room for other tokens in the final
# vocabulary -- confirm against the tokenizer configuration.
MAX_VOCABULARY_ENTRIES = 31_744


big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))

# Keep only the first MAX_VOCABULARY_ENTRIES items, in the iteration order of
# the loaded mapping. islice stops after the requested prefix, so the full
# item list is never materialized just to be sliced. If the mapping holds
# fewer entries than the limit, all of them are kept.
# NOTE(review): assumes the loader returns entries in a meaningful order
# (e.g. the order they were learned) -- verify against load_nanos_vocabulary.
trimmed_dict = dict(islice(big_dict.items(), MAX_VOCABULARY_ENTRIES))

save_nanos_vocabulary(trimmed_dict, Path(OUTPUT_PATH))