4 Commits

Author  SHA1  Message  Date
GassiGiuseppe  1d23b9cc8b  little snippet to trim big dictionaries  2025-10-07 16:05:32 +02:00
GassiGiuseppe  165290162c  added tokenano to the init  2025-10-04 19:03:56 +02:00
GassiGiuseppe  502016f843  a new exasperated way to train the bpe, just a wild experiment that could be useful later  2025-10-04 19:03:07 +02:00
GassiGiuseppe  845c63dbef  updated tokenano to be easier to read  2025-10-04 19:01:21 +02:00
4 changed files with 73 additions and 5 deletions

View File

@@ -31,11 +31,15 @@ class TokeNanoCore:
         for piece, token_type in self.__splitter.split_text(corpus):
             if token_type == TokenType.SPECIAL:
-                output.extend(self.__special_encoder.encode(piece))
+                ENCODED_PIECE = self.__special_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
+                continue
             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self.__bpe_encoder.encode(piece))
+                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
+                output.extend(ENCODED_PIECE)
+                continue
         return output
@@ -46,11 +50,13 @@ class TokeNanoCore:
             if token_type == TokenType.SPECIAL:
                 output_str += self.__special_encoder.decode(
                     token
-                )  # it accept an integer
+                )
+                continue
             # slow but clear
             if token_type == TokenType.BPE:
                 output_str += self.__bpe_encoder.decode(
                     token
-                )  # it accept a list of integer
+                )
+                continue
         return output_str
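For context, this change only names the intermediate encode result (ENCODED_PIECE) and ends each branch with an explicit continue; the encode and decode behaviour is unchanged. Below is a minimal standalone sketch of the same dispatch pattern, assuming the splitter yields (piece, TokenType) pairs and that both encoders return lists of token ids; apart from the TokenType values, the names are illustrative and not the repository's API.

from enum import Enum, auto

class TokenType(Enum):
    SPECIAL = auto()
    BPE = auto()

def encode_pieces(pieces, special_encoder, bpe_encoder) -> list[int]:
    # pieces: iterable of (piece, TokenType); encoders expose encode(str) -> list[int]
    output: list[int] = []
    for piece, token_type in pieces:
        if token_type == TokenType.SPECIAL:
            encoded_piece = special_encoder.encode(piece)
            output.extend(encoded_piece)
            continue
        # slow but clear
        if token_type == TokenType.BPE:
            encoded_piece = bpe_encoder.encode(piece)
            output.extend(encoded_piece)
            continue
    return output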

View File

@@ -5,6 +5,7 @@ from .NanoSocraTrainer import NanoSocraTrainer
 from .NanoSocraTraineRam import NanoSocraTraineRam
 from .NanoSocraTrainerPool import NanoSocraTrainerPool
 from .NanoSocratesSpecial import NanoSocratesSpecial
+from .TokeNanoCore import TokeNanoCore

 __all__ = [
     "NanoSocratesChunker",
@@ -12,5 +13,6 @@ __all__ = [
     "NanoSocratesBPE",
     "NanoSocraTrainer",
     "NanoSocraTraineRam",
-    "NanoSocraTrainerPool"
+    "NanoSocraTrainerPool",
+    "TokeNanoCore"
 ]
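With TokeNanoCore exported here, the class can be imported from the package root instead of its module file. A hedged usage line, assuming the package lives under Project_Model.Libs.BPE as the other imports in this change set suggest:

# Assumed package path; the diff does not show where this __init__.py lives.
from Project_Model.Libs.BPE import TokeNanoCore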

View File

@@ -0,0 +1,12 @@
+# Trim the mad-trained dictionary down to a shorter one
+from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
+from pathlib import Path
+
+DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"
+
+big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
+big_dict = dict(list(big_dict.items())[:31744])
+save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
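The snippet keeps only the first 31744 entries (32768 − 1024, which would leave room for 1024 extra ids such as special tokens, though the commit does not say so) and relies on Python dicts preserving insertion order. A parameterised variant of the same idea follows; trim_vocabulary and its arguments are illustrative, while the two vocabulary helpers come from the repository as used above.

from pathlib import Path
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary

def trim_vocabulary(src: Path, dst: Path, max_entries: int = 31744) -> None:
    # Keep only the first max_entries items, relying on dict insertion order (Python 3.7+).
    vocab = load_nanos_vocabulary(src)
    save_nanos_vocabulary(dict(list(vocab.items())[:max_entries]), dst)

trim_vocabulary(Path("Assets/Dataset/Tmp/mad_cache.json"), Path("Assets/Dataset/Tmp/trimmed.json"))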

View File

@@ -0,0 +1,48 @@
+# Generate a corpus bigger than the last on each round, without the old data,
+# then keep training the BPE against the same vocabulary.
+from Scripts.DataCleaning.pipeline import Pipeline
+from Scripts.Training.bpe_trainer_pool import train, get_args
+from pathlib import Path
+import shutil
+
+CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
+VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
+CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"
+
+
+def mad_corpus_generator(corpus_size: int, corpus_offset: int):
+    print("New Corpus")
+    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
+    print("Pipeline Created")
+    corpus_ending_offset = corpus_size + corpus_offset
+    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
+    print("Starting building corpus")
+    pipe.execute_task_bpe_corpus()
+    print("Corpus created")
+
+
+def mad_bpe_trainer():
+    argv = [
+        "--input-file", CORPUS_PATH,
+        "--output-file", VOCABULARY_PATH,
+        "--cache-file", CACHE_PATH,
+    ]
+    args = get_args(argv)
+    train(args)
+
+
+def mad_hatter():
+    # 10,100,500,1000,1500,2000,3000,4000,5000,10000
+    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
+    starting_offset = 0
+    for corpus_size in film_list:
+        # mad_corpus_generator(corpus_size, starting_offset)
+        # starting_offset = starting_offset + corpus_size
+        mad_bpe_trainer()
+        # put the trained dict back into the cache for the next round
+        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))
+
+
+mad_hatter()
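The shutil.copyfile at the end of each round writes the freshly trained vocabulary back over the cache file, so the next round resumes from it rather than starting from scratch. A small sketch of that resume step with a guard for the very first round, when no vocabulary has been written yet; resume_round and train_round are illustrative names, not part of the repository.

import shutil
from pathlib import Path

def resume_round(train_round, vocabulary_path: Path, cache_path: Path) -> None:
    # train_round() is expected to read cache_path if it exists and write vocabulary_path.
    train_round()
    if vocabulary_path.exists():
        # Feed the new vocabulary back in as the starting point for the next round.
        shutil.copyfile(vocabulary_path, cache_path)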