4 Commits

Author  SHA1  Message  Date
GassiGiuseppe  1d23b9cc8b  little snippet to trim big dictionaries  2025-10-07 16:05:32 +02:00
GassiGiuseppe  165290162c  added tokenano to the init  2025-10-04 19:03:56 +02:00
GassiGiuseppe  502016f843  a new exasperated way to train the bpe, just a wild experiment that could be useful later  2025-10-04 19:03:07 +02:00
GassiGiuseppe  845c63dbef  updated tokenano to be easier to read  2025-10-04 19:01:21 +02:00
4 changed files with 73 additions and 5 deletions

View File

@@ -31,11 +31,15 @@ class TokeNanoCore:
         for piece, token_type in self.__splitter.split_text(corpus):
             if token_type == TokenType.SPECIAL:
-                output.extend(self.__special_encoder.encode(piece))
+                ENCODED_PIECE = self.__special_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
+                continue
             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self.__bpe_encoder.encode(piece))
+                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
+                continue
         return output
@@ -46,11 +50,13 @@ class TokeNanoCore:
             if token_type == TokenType.SPECIAL:
                 output_str += self.__special_encoder.decode(
                     token
-                )  # it accept an integer
+                )
+                continue
             # slow but clear
             if token_type == TokenType.BPE:
                 output_str += self.__bpe_encoder.decode(
                     token
-                )  # it accept a list of integer
+                )
+                continue
         return output_str
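For context, this change only names the intermediate encode result (ENCODED_PIECE) and ends each branch with an explicit continue; the encode and decode behaviour is unchanged. Below is a minimal standalone sketch of the same dispatch pattern, assuming the splitter yields (piece, TokenType) pairs and that both encoders return lists of token ids; apart from the TokenType values, the names are illustrative and not the repository's API.

from enum import Enum, auto

class TokenType(Enum):
    SPECIAL = auto()
    BPE = auto()

def encode_pieces(pieces, special_encoder, bpe_encoder) -> list[int]:
    # pieces: iterable of (piece, TokenType); encoders expose encode(str) -> list[int]
    output: list[int] = []
    for piece, token_type in pieces:
        if token_type == TokenType.SPECIAL:
            encoded_piece = special_encoder.encode(piece)
            output.extend(encoded_piece)
            continue
        # slow but clear
        if token_type == TokenType.BPE:
            encoded_piece = bpe_encoder.encode(piece)
            output.extend(encoded_piece)
            continue
    return output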

View File

@@ -5,6 +5,7 @@ from .NanoSocraTrainer import NanoSocraTrainer
 from .NanoSocraTraineRam import NanoSocraTraineRam
 from .NanoSocraTrainerPool import NanoSocraTrainerPool
 from .NanoSocratesSpecial import NanoSocratesSpecial
+from .TokeNanoCore import TokeNanoCore

 __all__ = [
     "NanoSocratesChunker",
@@ -12,5 +13,6 @@ __all__ = [
     "NanoSocratesBPE",
     "NanoSocraTrainer",
     "NanoSocraTraineRam",
-    "NanoSocraTrainerPool"
+    "NanoSocraTrainerPool",
+    "TokeNanoCore"
 ]
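With TokeNanoCore exported here, the class can be imported from the package root instead of its module file. A hedged usage line, assuming the package lives under Project_Model.Libs.BPE as the other imports in this change set suggest:

# Assumed package path; the diff does not show where this __init__.py lives.
from Project_Model.Libs.BPE import TokeNanoCore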

View File

@@ -0,0 +1,12 @@
+# Trim the mad-trained dictionary down to a shorter one
+from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
+from pathlib import Path
+
+DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"
+
+big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
+big_dict = dict(list(big_dict.items())[:31744])
+save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
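The snippet keeps only the first 31744 entries (32768 − 1024, which would leave room for 1024 extra ids such as special tokens, though the commit does not say so) and relies on Python dicts preserving insertion order. A parameterised variant of the same idea follows; trim_vocabulary and its arguments are illustrative, while the two vocabulary helpers come from the repository as used above.

from pathlib import Path
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary

def trim_vocabulary(src: Path, dst: Path, max_entries: int = 31744) -> None:
    # Keep only the first max_entries items, relying on dict insertion order (Python 3.7+).
    vocab = load_nanos_vocabulary(src)
    save_nanos_vocabulary(dict(list(vocab.items())[:max_entries]), dst)

trim_vocabulary(Path("Assets/Dataset/Tmp/mad_cache.json"), Path("Assets/Dataset/Tmp/trimmed.json"))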

View File

@@ -0,0 +1,48 @@
+# Generate a corpus bigger than the last on each round, without the old data,
+# then keep training the BPE against the same vocabulary.
+from Scripts.DataCleaning.pipeline import Pipeline
+from Scripts.Training.bpe_trainer_pool import train, get_args
+from pathlib import Path
+import shutil
+
+CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
+VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
+CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+
+
+def mad_corpus_generator(corpus_size: int, corpus_offset: int):
+    print("New Corpus")
+    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
+    print("Pipeline Created")
+    corpus_ending_offset = corpus_size + corpus_offset
+    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
+    print("Starting building corpus")
+    pipe.execute_task_bpe_corpus()
+    print("Corpus created")
+
+
+def mad_bpe_trainer():
+    argv = [
+        "--input-file", CORPUS_PATH,
+        "--output-file", VOCABULARY_PATH,
+        "--cache-file", CACHE_PATH,
+    ]
+    args = get_args(argv)
+    train(args)
+
+
+def mad_hatter():
+    # 10,100,500,1000,1500,2000,3000,4000,5000,10000
+    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
+    starting_offset = 0
+    for corpus_size in film_list:
+        # mad_corpus_generator(corpus_size, starting_offset)
+        # starting_offset = starting_offset + corpus_size
+        mad_bpe_trainer()
+        # put the trained dict back into the cache for the next round
+        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
+
+
+mad_hatter()
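The shutil.copyfile at the end of each round writes the freshly trained vocabulary back over the cache file, so the next round resumes from it rather than starting from scratch. A small sketch of that resume step with a guard for the very first round, when no vocabulary has been written yet; resume_round and train_round are illustrative names, not part of the repository.

import shutil
from pathlib import Path

def resume_round(train_round, vocabulary_path: Path, cache_path: Path) -> None:
    # train_round() is expected to read cache_path if it exists and write vocabulary_path.
    train_round()
    if vocabulary_path.exists():
        # Feed the new vocabulary back in as the starting point for the next round.
        shutil.copyfile(vocabulary_path, cache_path)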