A new experimental way to train the BPE — just a wild experiment that could be useful later
This commit is contained in:
parent
845c63dbef
commit
502016f843
48
Scripts/Training/mad_traininng.py
Normal file
48
Scripts/Training/mad_traininng.py
Normal file
@ -0,0 +1,48 @@
|
||||
# generate each time a corpus bigger than the last, without the old data
|
||||
# then using the same vocabulary let the bpe train
|
||||
|
||||
from Scripts.DataCleaning.pipeline import Pipeline
|
||||
from Scripts.Training.bpe_trainer_pool import train,get_args
|
||||
from pathlib import Path
|
||||
import os, shutil
|
||||
|
||||
# All experiment artifacts live under Assets/Dataset/Tmp.
CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"  # corpus the pipeline (re)generates and the trainer reads
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"  # vocabulary written by each training round
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"  # cache; overwritten with the fresh vocabulary after each round
|
||||
|
||||
|
||||
def mad_corpus_generator(corpus_size: int, corpus_offset: int):
    """Build a fresh BPE corpus from one slice of the movie list.

    The slice covers ``[corpus_offset, corpus_offset + corpus_size)``;
    the pipeline writes the resulting corpus to ``CORPUS_PATH``.
    """
    print("New Corpus")
    pipeline = Pipeline(bpe_corpus_path=CORPUS_PATH)
    print("Pipeline Created")
    # Restrict the pipeline to this round's window of movies.
    pipeline.reduce_movie_list(corpus_offset, corpus_offset + corpus_size)
    print("Starting building corpus")
    pipeline.execute_task_bpe_corpus()
    print("Corpus created")
|
||||
|
||||
def mad_bpe_trainer():
    """Run one BPE training pass over ``CORPUS_PATH``.

    Always points the trainer at the same vocabulary and cache files, so
    successive rounds share (and keep extending) one vocabulary.
    """
    cli_args = [
        "--input-file",
        CORPUS_PATH,
        "--output-file",
        VOCABULARY_PATH,
        "--cache-file",
        CACHE_PATH,
    ]
    train(get_args(cli_args))
|
||||
|
||||
def mad_hatter():
    """Drive the mad-training loop: one BPE training round per planned corpus size.

    Corpus regeneration is currently commented out, so each round retrains on
    whatever corpus already sits at ``CORPUS_PATH``; after every round the
    fresh vocabulary is copied over the cache so the next round resumes from it.
    """
    # Planned sizes: 10,100,500,1000,1500,2000,3000,4000,5000,10000
    planned_sizes = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in planned_sizes:
        # Disabled for now: regenerate a fresh corpus slice for this round.
        # mad_corpus_generator(corpus_size, starting_offset)
        # starting_offset = starting_offset + corpus_size

        mad_bpe_trainer()
        # Feed the freshly trained vocabulary back in as the cache for the next round.
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
|
||||
|
||||
|
||||
|
||||
# Guard the entry point so importing this module does not immediately
# launch the (long-running, file-writing) training experiment.
if __name__ == "__main__":
    mad_hatter()
|
||||
Loading…
x
Reference in New Issue
Block a user