4 Commits
dev ... dev.bpe

Author         SHA1        Message                                                                                      Date
GassiGiuseppe  1d23b9cc8b  little snippet to trim big dictionaries                                                      2025-10-07 16:05:32 +02:00
GassiGiuseppe  165290162c  added tokenano to the init                                                                   2025-10-04 19:03:56 +02:00
GassiGiuseppe  502016f843  a new exasperated way to train the bpe, just a wild experiment that could be useful later    2025-10-04 19:03:07 +02:00
GassiGiuseppe  845c63dbef  updated tokenano to be easier to read                                                        2025-10-04 19:01:21 +02:00
4 changed files with 73 additions and 5 deletions


@@ -31,11 +31,15 @@ class TokeNanoCore:
         for piece, token_type in self.__splitter.split_text(corpus):
             if token_type == TokenType.SPECIAL:
-                output.extend(self.__special_encoder.encode(piece))
+                ENCODED_PIECE = self.__special_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
                 continue
             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self.__bpe_encoder.encode(piece))
+                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
                 continue
         return output
@@ -46,11 +50,13 @@ class TokeNanoCore:
             if token_type == TokenType.SPECIAL:
                 output_str += self.__special_encoder.decode(
                     token
-                )  # it accept an integer
+                )
                 continue
             # slow but clear
             if token_type == TokenType.BPE:
                 output_str += self.__bpe_encoder.decode(
                     token
-                )  # it accept a list of integer
+                )
                 continue
         return output_str
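
The change swaps two one-liners for an intermediate ENCODED_PIECE variable, matching the "slow but clear" comment already in the method. For readers outside the repo, here is a self-contained sketch of the dispatch pattern this hunk touches; every name in it is a toy stand-in, not the project's actual API:

# Minimal stand-in for the encode dispatch above; TokenType and the
# encoders are toy versions, not the real NanoSocrates classes.
from enum import Enum, auto

class TokenType(Enum):
    SPECIAL = auto()
    BPE = auto()

def split_text(corpus: str):
    # toy splitter: angle-bracketed words count as special tokens
    for piece in corpus.split():
        token_type = TokenType.SPECIAL if piece.startswith("<") else TokenType.BPE
        yield piece, token_type

def encode(corpus: str) -> list[int]:
    output: list[int] = []
    for piece, token_type in split_text(corpus):
        if token_type == TokenType.SPECIAL:
            encoded_piece = [0]  # stand-in for special_encoder.encode(piece)
            output.extend(encoded_piece)
            continue
        # slow but clear
        if token_type == TokenType.BPE:
            encoded_piece = [ord(c) for c in piece]  # stand-in for bpe_encoder.encode(piece)
            output.extend(encoded_piece)
            continue
    return output

print(encode("hi <eos>"))  # [104, 105, 0]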


@@ -5,6 +5,7 @@ from .NanoSocraTrainer import NanoSocraTrainer
 from .NanoSocraTraineRam import NanoSocraTraineRam
 from .NanoSocraTrainerPool import NanoSocraTrainerPool
 from .NanoSocratesSpecial import NanoSocratesSpecial
+from .TokeNanoCore import TokeNanoCore

 __all__ = [
     "NanoSocratesChunker",
@@ -12,5 +13,6 @@ __all__ = [
     "NanoSocratesBPE",
     "NanoSocraTrainer",
     "NanoSocraTraineRam",
-    "NanoSocraTrainerPool"
+    "NanoSocraTrainerPool",
+    "TokeNanoCore"
 ]
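
With the re-export in place, callers can import the tokenizer from the package root. A one-line sketch; the Project_Model.Libs.BPE package path is inferred from the trim script below, not stated in this diff:

from Project_Model.Libs.BPE import TokeNanoCore  # rather than reaching into the submodule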


@@ -0,0 +1,12 @@
+# trim the huge "mad"-trained dictionary down to a short one
+from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
+from pathlib import Path
+
+DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"
+
+# keep only the first 31744 (31 * 1024) entries
+big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
+big_dict = dict(list(big_dict.items())[:31744])
+save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
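
The slice works because Python dicts preserve insertion order (guaranteed since 3.7), so the script keeps the 31744 earliest vocabulary entries, i.e. the first merges learned. A standalone illustration of the idiom:

# toy vocabulary; the trimming line mirrors the snippet above
vocab = {f"tok{i}": i for i in range(10)}
trimmed = dict(list(vocab.items())[:4])
assert list(trimmed) == ["tok0", "tok1", "tok2", "tok3"]

Note that 31744 = 31 * 1024, presumably leaving headroom below a 32768-token budget.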


@@ -0,0 +1,48 @@
+# each round, generate a corpus bigger than the last, without the old data,
+# then keep training the BPE against the same vocabulary
+from Scripts.DataCleaning.pipeline import Pipeline
+from Scripts.Training.bpe_trainer_pool import train, get_args
+from pathlib import Path
+import shutil
+
+CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
+VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
+CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+
+
+def mad_corpus_generator(corpus_size: int, corpus_offset: int):
+    print("New Corpus")
+    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
+    print("Pipeline Created")
+    corpus_ending_offset = corpus_size + corpus_offset
+    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
+    print("Starting building corpus")
+    pipe.execute_task_bpe_corpus()
+    print("Corpus created")
+
+
+def mad_bpe_trainer():
+    argv = [
+        "--input-file", CORPUS_PATH,
+        "--output-file", VOCABULARY_PATH,
+        "--cache-file", CACHE_PATH,
+    ]
+    args = get_args(argv)
+    train(args)
+
+
+def mad_hatter():
+    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
+    starting_offset = 0
+    for corpus_size in film_list:
+        # mad_corpus_generator(corpus_size, starting_offset)
+        # starting_offset = starting_offset + corpus_size
+        mad_bpe_trainer()
+        # reuse the freshly trained vocabulary as the cache for the next round
+        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
+
+
+mad_hatter()
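
As committed, the corpus-generation calls are commented out, so each of the ten rounds retrains on whatever toy_corpus.txt already contains and then copies the fresh vocabulary over the cache. With the generator re-enabled, the schedule would presumably read as below; this is a sketch of the apparent intent, not part of the commit, and whether --cache-file makes training resume from earlier merges is an assumption:

def mad_hatter_enabled():
    # hypothetical fully-enabled loop: a fresh, disjoint slice of films per
    # round, with the vocabulary carried forward through CACHE_PATH
    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in film_list:
        mad_corpus_generator(corpus_size, starting_offset)  # films [offset, offset + size)
        starting_offset += corpus_size
        mad_bpe_trainer()  # assumed to resume from CACHE_PATH, writes VOCABULARY_PATH
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))  # seed the next round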