"""Trim a large trained BPE vocabulary down to its first entries.

The script loads the vocabulary stored at ``DICTIONARY_PATH``, keeps only the
first ``MAX_ENTRIES`` items (in the order the loaded mapping yields them) and
writes the shortened vocabulary to ``OUTPUT_PATH``.  Both paths are relative,
so they are resolved against the current working directory.

Provenance: added as ``Scripts/Training/dictionary_adjuster.py`` by commit
1d23b9cc8bbc40c7d93c7294848a45193856a1a8 ("little snippet to trim big
dictionaries", GassiGiuseppe, 2025-10-07).
"""
from itertools import islice
from pathlib import Path
from typing import Any, Dict, Mapping

DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"

# Number of leading vocabulary entries to keep (31744 == 31 * 1024).
# NOTE(review): presumably the target vocabulary size minus reserved/special
# tokens -- confirm against the tokenizer configuration.
MAX_ENTRIES = 31744


def trim_vocabulary(
    vocabulary: Mapping[Any, Any], max_entries: int = MAX_ENTRIES
) -> Dict[Any, Any]:
    """Return a new dict holding the first ``max_entries`` items of a mapping.

    Args:
        vocabulary: Mapping to shorten.  Its iteration order decides which
            entries survive; the mapping itself is not modified.
        max_entries: Maximum number of entries to keep.  Must be >= 0.

    Returns:
        A fresh ``dict`` with at most ``max_entries`` items, in the original
        iteration order.  If the mapping is already short enough, the result
        is a plain copy.

    Raises:
        ValueError: If ``max_entries`` is negative.
    """
    if max_entries < 0:
        raise ValueError(f"max_entries must be >= 0, got {max_entries}")
    # islice takes the prefix lazily instead of copying every item into a
    # temporary list just to slice it.
    return dict(islice(vocabulary.items(), max_entries))


def main() -> None:
    """Load the big vocabulary, trim it and save the shortened version."""
    # Imported here so that ``trim_vocabulary`` stays importable (and
    # testable) even when the project package is not on ``sys.path``.
    from Project_Model.Libs.BPE.Utils.vocabulary import (
        load_nanos_vocabulary,
        save_nanos_vocabulary,
    )

    big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
    trimmed = trim_vocabulary(big_dict, MAX_ENTRIES)
    save_nanos_vocabulary(trimmed, Path(OUTPUT_PATH))


if __name__ == "__main__":
    main()