A new experimental way to train the BPE — just a wild experiment that could be useful later
This commit is contained in:
parent
845c63dbef
commit
502016f843
48
Scripts/Training/mad_traininng.py
Normal file
48
Scripts/Training/mad_traininng.py
Normal file
@ -0,0 +1,48 @@
|
||||
# generate each time a corpus bigger than the last, without the old data
|
||||
# then using the same vocabulary let the bpe train
|
||||
|
||||
from Scripts.DataCleaning.pipeline import Pipeline
|
||||
from Scripts.Training.bpe_trainer_pool import train,get_args
|
||||
from pathlib import Path
|
||||
import os, shutil
|
||||
|
||||
# All experiment artifacts live under Assets/Dataset/Tmp.
CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"  # corpus the pipeline (re)generates and the trainer reads
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"  # vocabulary written by each training round
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"  # cache; overwritten with the fresh vocabulary after each round
|
||||
|
||||
|
||||
def mad_corpus_generator(corpus_size: int, corpus_offset: int):
    """Build a fresh BPE corpus from one slice of the movie list.

    The slice covers ``[corpus_offset, corpus_offset + corpus_size)``;
    the pipeline writes the resulting corpus to ``CORPUS_PATH``.
    """
    print("New Corpus")
    pipeline = Pipeline(bpe_corpus_path=CORPUS_PATH)
    print("Pipeline Created")
    # Restrict the pipeline to this round's window of movies.
    pipeline.reduce_movie_list(corpus_offset, corpus_offset + corpus_size)
    print("Starting building corpus")
    pipeline.execute_task_bpe_corpus()
    print("Corpus created")
|
||||
|
||||
def mad_bpe_trainer():
    """Run one BPE training pass over ``CORPUS_PATH``.

    Always points the trainer at the same vocabulary and cache files, so
    successive rounds share (and keep extending) one vocabulary.
    """
    cli_args = [
        "--input-file",
        CORPUS_PATH,
        "--output-file",
        VOCABULARY_PATH,
        "--cache-file",
        CACHE_PATH,
    ]
    train(get_args(cli_args))
|
||||
|
||||
def mad_hatter():
    """Drive the mad-training loop: one BPE training round per planned corpus size.

    Corpus regeneration is currently commented out, so each round retrains on
    whatever corpus already sits at ``CORPUS_PATH``; after every round the
    fresh vocabulary is copied over the cache so the next round resumes from it.
    """
    # Planned sizes: 10,100,500,1000,1500,2000,3000,4000,5000,10000
    planned_sizes = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in planned_sizes:
        # Disabled for now: regenerate a fresh corpus slice for this round.
        # mad_corpus_generator(corpus_size, starting_offset)
        # starting_offset = starting_offset + corpus_size

        mad_bpe_trainer()
        # Feed the freshly trained vocabulary back in as the cache for the next round.
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
|
||||
|
||||
|
||||
|
||||
# Guard the entry point so importing this module does not immediately
# launch the (long-running, file-writing) training experiment.
if __name__ == "__main__":
    mad_hatter()
|
||||
Loading…
x
Reference in New Issue
Block a user