Compare commits: dev.modelt...dev.bpe (4 commits)
| Author | SHA1 | Date |
|---|---|---|
| | 1d23b9cc8b | |
| | 165290162c | |
| | 502016f843 | |
| | 845c63dbef | |
```diff
@@ -31,11 +31,15 @@ class TokeNanoCore:
         for piece, token_type in self.__splitter.split_text(corpus):

             if token_type == TokenType.SPECIAL:
-                output.extend(self.__special_encoder.encode(piece))
+                ENCODED_PIECE = self.__special_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
+                continue

             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self.__bpe_encoder.encode(piece))
+                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
+                continue

         return output

```
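Two things change in this hunk: the encoder output is bound to `ENCODED_PIECE` before being appended, which makes the intermediate token list easy to inspect in a debugger, and each branch now ends with `continue`, so a piece matched as SPECIAL is no longer also tested against the BPE branch. A minimal, self-contained sketch of the resulting control flow; the encoder callables here are hypothetical stand-ins, not the package's real API:

```python
from enum import Enum, auto

class TokenType(Enum):
    SPECIAL = auto()
    BPE = auto()

def encode_pieces(pieces, special_encode, bpe_encode) -> list[int]:
    # Mirrors the encode loop above, with encoders passed in as callables.
    output: list[int] = []
    for piece, token_type in pieces:
        if token_type == TokenType.SPECIAL:
            encoded_piece = special_encode(piece)  # inspectable intermediate
            output.extend(encoded_piece)
            continue  # skip the BPE test for this piece
        if token_type == TokenType.BPE:
            encoded_piece = bpe_encode(piece)
            output.extend(encoded_piece)
            continue
    return output

# Toy usage: one special token id, raw byte values for everything else.
pieces = [("<BOS>", TokenType.SPECIAL), ("hi", TokenType.BPE)]
print(encode_pieces(pieces, lambda p: [0], lambda p: list(p.encode())))
```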
```diff
@@ -46,11 +50,13 @@ class TokeNanoCore:
             if token_type == TokenType.SPECIAL:
                 output_str += self.__special_encoder.decode(
                     token
-                ) # it accept an integer
+                )
+                continue

             # slow but clear
             if token_type == TokenType.BPE:
                 output_str += self.__bpe_encoder.decode(
                     token
-                ) # it accept a list of integer
+                )
+                continue
         return output_str
```
```diff
@@ -5,6 +5,7 @@ from .NanoSocraTrainer import NanoSocraTrainer
 from .NanoSocraTraineRam import NanoSocraTraineRam
 from .NanoSocraTrainerPool import NanoSocraTrainerPool
 from .NanoSocratesSpecial import NanoSocratesSpecial
+from .TokeNanoCore import TokeNanoCore

 __all__ = [
     "NanoSocratesChunker",
@@ -12,5 +13,6 @@ __all__ = [
     "NanoSocratesBPE",
    "NanoSocraTrainer",
     "NanoSocraTraineRam",
-    "NanoSocraTrainerPool"
+    "NanoSocraTrainerPool",
+    "TokeNanoCore"
 ]
```
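With `TokeNanoCore` re-exported here, callers can import it from the package root instead of reaching into the module. The package path below is an assumption inferred from the new scripts in this compare; the `__init__.py` path itself is not shown in this view:

```python
# Assumed package root, inferred from dictionary_adjuster.py below.
from Project_Model.Libs.BPE import TokeNanoCore
```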
Scripts/Training/dictionary_adjuster.py (new file, 12 lines)

```diff
@@ -0,0 +1,12 @@
+# cut the mad trained dictionary down to a short one
+from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
+from pathlib import Path
+
+DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"
+
+
+big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
+big_dict = dict(list(big_dict.items())[:31744])
+
+save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
```
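`dict(list(big_dict.items())[:31744])` keeps the first 31744 entries in insertion order, which Python guarantees for dicts since 3.7. Note that 31744 = 32768 - 1024, suggesting a 2^15 vocabulary with 1024 slots held back (for special tokens, presumably; the script does not say). A toy demonstration of the trimming idiom:

```python
# Relies on insertion-ordered dicts (guaranteed since Python 3.7).
vocab = {f"tok{i}": i for i in range(10)}
trimmed = dict(list(vocab.items())[:4])
assert list(trimmed) == ["tok0", "tok1", "tok2", "tok3"]
```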
Scripts/Training/mad_traininng.py (new file, 48 lines)

```diff
@@ -0,0 +1,48 @@
+# each round generates a corpus bigger than the last, without the old data
+# then lets the BPE train using the same vocabulary
+
+from Scripts.DataCleaning.pipeline import Pipeline
+from Scripts.Training.bpe_trainer_pool import train, get_args
+from pathlib import Path
+import os, shutil
+
+CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
+VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
+CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+
+
+def mad_corpus_generator(corpus_size: int, corpus_offset: int):
+    print("New Corpus")
+    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
+    print("Pipeline Created")
+    corpus_ending_offset = corpus_size + corpus_offset
+    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
+    print("Starting building corpus")
+    pipe.execute_task_bpe_corpus()
+    print("Corpus created")
+
+def mad_bpe_trainer():
+    argv = [
+        "--input-file", CORPUS_PATH,
+        "--output-file", VOCABULARY_PATH,
+        "--cache-file", CACHE_PATH,
+    ]
+    args = get_args(argv)
+    train(args)
+
+def mad_hatter():
+    # 10,100,500,1000,1500,2000,3000,4000,5000,10000
+    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
+    starting_offset = 0
+    for corpus_size in film_list:
+
+        # mad_corpus_generator(corpus_size, starting_offset)
+        # starting_offset = starting_offset + corpus_size
+
+        mad_bpe_trainer()
+        # put dict into cache
+        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
+
+
+
+mad_hatter()
```
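Each pass trains on the current corpus and then copies the freshly written vocabulary over the cache file, presumably so the next pass resumes from it rather than starting cold; with the corpus-generation calls commented out, the loop currently retrains on the same `toy_corpus.txt` each time. A self-contained sketch of the seeding scheme, with a toy character counter standing in for the real BPE trainer:

```python
import json, shutil
from pathlib import Path

VOCAB = Path("vocab.json")
CACHE = Path("cache.json")

def toy_train(corpus: str, seed: dict) -> dict:
    # Stand-in for the real trainer: extend the seeded vocabulary.
    vocab = dict(seed)
    for ch in corpus:
        vocab[ch] = vocab.get(ch, 0) + 1
    return vocab

CACHE.write_text("{}")
for corpus_slice in ["aab", "bcc", "ccd"]:  # growing corpora in the real script
    vocab = toy_train(corpus_slice, json.loads(CACHE.read_text()))
    VOCAB.write_text(json.dumps(vocab))
    shutil.copyfile(VOCAB, CACHE)  # seed the next pass
```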