diff --git a/Scripts/Training/mad_traininng.py b/Scripts/Training/mad_traininng.py
new file mode 100644
index 0000000..fe1bb4a
--- /dev/null
+++ b/Scripts/Training/mad_traininng.py
@@ -0,0 +1,55 @@
+# Each round, build a corpus from the next (larger) slice of films, without
+# re-including the old data, then let the BPE trainer continue from the same
+# vocabulary by seeding its cache with the previous round's output.
+
+from pathlib import Path
+import shutil
+
+from Scripts.DataCleaning.pipeline import Pipeline
+from Scripts.Training.bpe_trainer_pool import train, get_args
+
+CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
+VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
+CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+
+
+def mad_corpus_generator(corpus_size: int, corpus_offset: int) -> None:
+    """Build a BPE corpus from films [corpus_offset, corpus_offset + corpus_size)."""
+    print("New corpus")
+    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
+    print("Pipeline created")
+    corpus_ending_offset = corpus_offset + corpus_size
+    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
+    print("Building corpus")
+    pipe.execute_task_bpe_corpus()
+    print("Corpus created")
+
+
+def mad_bpe_trainer() -> None:
+    """Run the pooled BPE trainer on the current corpus."""
+    argv = [
+        "--input-file", CORPUS_PATH,
+        "--output-file", VOCABULARY_PATH,
+        "--cache-file", CACHE_PATH,
+    ]
+    args = get_args(argv)
+    train(args)
+
+
+def mad_hatter() -> None:
+    # Corpus sizes (in films) per round; the slices are disjoint, so every
+    # round trains on fresh data while the cumulative corpus keeps growing.
+    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
+    starting_offset = 0
+    for corpus_size in film_list:
+        mad_corpus_generator(corpus_size, starting_offset)
+        starting_offset += corpus_size
+
+        mad_bpe_trainer()
+        # Seed the cache with the vocabulary just produced, so the next
+        # round's training resumes from it.
+        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
+
+
+if __name__ == "__main__":
+    mad_hatter()
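
The offset bookkeeping in mad_hatter is what keeps the rounds disjoint: each round consumes the next corpus_size films, so no round re-reads old data while the cumulative corpus keeps growing. A minimal, self-contained sketch of that scheme (illustrative only; sizes and offset are placeholder names, not project identifiers):

    sizes = [10, 100, 500, 1000]
    offset = 0
    for size in sizes:
        start, end = offset, offset + size
        print(f"round reads films[{start}:{end}]")  # films[0:10], films[10:110], films[110:610], ...
        offset = end

Since mad_bpe_trainer only assembles an argv list for get_args, bpe_trainer_pool presumably exposes an argparse-style CLI; if the module also has a main entry point, a single round could likely be run standalone with the same three flags (--input-file, --output-file, --cache-file).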