
from pathlib import Path
from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial
from ..Utils import special_regex_maker
from ..Enums import TokenType
from ..Enums import SpecialToken


class TokeNanoCore:
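    """
    Tokenizer front end that combines a byte-pair (BPE) encoder with a
    special-token encoder, using a splitter to route each piece of text
    to the matching encoder.
    """
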
def __init__(
self,
bpe_vocabulary: dict[tuple[int, int], int],
special_token_list: list[str],
# special_vocabulary: dict[str, int]
):
self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
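
        # Build a regex that matches the special tokens; the splitter uses it
        # to separate special-token pieces from ordinary text.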
SPECIAL_REGEX = special_regex_maker(special_token_list)
BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
self.__special_encoder = NanoSocratesSpecial(
BPE_VOCABULARY_SIZE, special_token_list
)
@property
    def vocabulary_size(self) -> int:
BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
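
        # One extra id is reserved on top of the two vocabularies, hence the +1.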
return BPE_VOC_SIZE + SPECIAL_VOC_SIZE + 1
def encode(self, corpus: str) -> list[int]:
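        """Encode a corpus into a flat list of token ids."""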
output: list[int] = []
for piece, token_type in self.__splitter.split_text(corpus):
if token_type == TokenType.SPECIAL:
output.extend(self.__special_encoder.encode(piece))
# slow but clear
if token_type == TokenType.BPE:
output.extend(self.__bpe_encoder.encode(piece))
return output
def encode_incomplete_string(self, corpus: str) -> list[int]:
"""
Encode string which don't end with a special token
"""
corpus = corpus + SpecialToken.CORPUS_END.value
output: list[int] = []
for piece, token_type in self.__splitter.split_text(corpus):
if token_type == TokenType.SPECIAL:
output.extend(self.__special_encoder.encode(piece))
# slow but clear
if token_type == TokenType.BPE:
output.extend(self.__bpe_encoder.encode(piece))
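
        # Drop the trailing id produced by the appended CORPUS_END sentinel.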
return output[:-1]
def decode(self, corpus: list[int]) -> str:
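        """Decode a list of token ids back into text."""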
output_str = ""
for token, token_type in self.__splitter.split_tokens(corpus):
            # token is a single integer if special, a list of integers otherwise
if token_type == TokenType.SPECIAL:
output_str += self.__special_encoder.decode(
token
                )  # it accepts a single integer
# slow but clear
if token_type == TokenType.BPE:
output_str += self.__bpe_encoder.decode(
token
                )  # it accepts a list of integers
return output_str
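

# Usage sketch (illustrative only): assumes a trained BPE merge table in the
# dict[tuple[int, int], int] format expected by NanoSocratesBPE and the
# project's special tokens. The merge values below are made-up placeholders,
# and because of the relative imports this module must be run inside its
# package (e.g. `python -m <package>.<module>`).
if __name__ == "__main__":
    demo_merges = {(104, 101): 256, (256, 108): 257}  # hypothetical byte-pair merges
    demo_specials = [SpecialToken.CORPUS_END.value]

    tokenizer = TokeNanoCore(demo_merges, demo_specials)

    sample = "hello" + SpecialToken.CORPUS_END.value
    ids = tokenizer.encode(sample)
    print(ids)
    print(tokenizer.decode(ids))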