"""Incremental BPE-vocabulary training driver ("mad hatter" loop).

Intended workflow: each round generates a corpus bigger than the last
(without re-using the old data), then retrains the BPE tokenizer on it
while resuming from the same vocabulary, which is seeded through the
cache file copied at the end of the previous round.

NOTE(review): corpus regeneration is currently commented out inside
``mad_hatter`` — as written, each round retrains on whatever corpus
already sits at ``CORPUS_PATH``.
"""

import os  # NOTE(review): unused here; kept in case other tooling relies on it
import shutil
from pathlib import Path

from Scripts.DataCleaning.pipeline import Pipeline
from Scripts.Training.bpe_trainer_pool import get_args, train

# Working files shared by corpus generation, training, and cache seeding.
CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"


def mad_corpus_generator(corpus_size: int, corpus_offset: int) -> None:
    """Build a fresh BPE corpus at ``CORPUS_PATH``.

    Selects the movie slice ``[corpus_offset, corpus_offset + corpus_size)``
    via the cleaning pipeline, then writes the corpus file.

    Args:
        corpus_size: Number of movies to include in this corpus.
        corpus_offset: Index of the first movie to include.
    """
    print("New Corpus")
    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
    print("Pipeline Created")
    corpus_ending_offset = corpus_size + corpus_offset
    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
    print("Starting building corpus")
    pipe.execute_task_bpe_corpus()
    print("Corpus created")


def mad_bpe_trainer() -> None:
    """Run one BPE training pass on ``CORPUS_PATH``.

    Resumes from ``CACHE_PATH`` (the previous round's vocabulary) and
    writes the updated vocabulary to ``VOCABULARY_PATH``.
    """
    argv = [
        "--input-file", CORPUS_PATH,
        "--output-file", VOCABULARY_PATH,
        "--cache-file", CACHE_PATH,
    ]
    args = get_args(argv)
    train(args)


def mad_hatter() -> None:
    """Drive the incremental training loop over growing corpus sizes.

    For each size in ``film_list`` the trainer is run once, and the
    resulting vocabulary is copied into the cache so the next round
    continues from it.
    """
    # Corpus sizes for successive rounds: 10, 100, 500, ..., 10000 movies.
    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in film_list:
        # Corpus regeneration is intentionally disabled for now, so every
        # round retrains on the existing corpus file. Re-enable the two
        # lines below to restore the "fresh, bigger corpus each round" flow.
        # mad_corpus_generator(corpus_size, starting_offset)
        # starting_offset = starting_offset + corpus_size
        mad_bpe_trainer()
        # Seed the next round: the vocabulary just produced becomes the cache.
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))


if __name__ == "__main__":
    # Guarded entry point: importing this module no longer kicks off training.
    mad_hatter()