"""Trim a large trained vocabulary down to a fixed number of entries.

Loads the vocabulary stored at ``DICTIONARY_PATH``, keeps only its first
``MAX_ENTRIES`` items (in the iteration order of the loaded mapping) and
writes the shortened vocabulary to ``OUTPUT_PATH``.

Script location in the repository: Scripts/Training/dictionary_adjuster.py
"""
from itertools import islice
from pathlib import Path

from Project_Model.Libs.BPE.Utils.vocabulary import (
    load_nanos_vocabulary,
    save_nanos_vocabulary,
)

DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"

# Number of leading vocabulary entries kept in the trimmed output.
MAX_ENTRIES = 31_744

# NOTE(review): assumes load_nanos_vocabulary returns a dict-like mapping
# (it must expose .items()) -- confirm against the helper's definition.
big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))

# islice stops after MAX_ENTRIES items, so the full item list is never
# materialized just to be sliced; a shorter vocabulary is kept whole.
big_dict = dict(islice(big_dict.items(), MAX_ENTRIES))

save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))