From da0bdf703b377f17693df0d33e3d9e3162537e7f Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sat, 4 Oct 2025 19:42:29 +0200
Subject: [PATCH] Added a way to see vocabulary size

---
Review notes (free-form text in this region is not part of the commit message):

* What the patch does: adds a read-only `vocabulary_size` property to
  NanoSocratesSpecial (returns `len(self.vocabulary)`) and to TokeNanoCore
  (returns the sum of `self.__bpe_encoder.vocabulary_size` and
  `self.__special_encoder.vocabulary_size`).
* NOTE(review): this copy of the patch had lost all of its line breaks. The
  line structure below was restored from the hunk headers and the diffstat;
  leading indentation inside the hunks is reconstructed assuming the usual
  4-space Python layout - verify against the repository before applying
  (or apply with `git apply --ignore-whitespace`).
* NOTE(review): the removed/added `)` pair in TokeNanoCore.py presumably
  differed only in whitespace, which could not be recovered - confirm.
* NOTE(review): TokeNanoCore.vocabulary_size has no `-> int` return
  annotation, unlike the NanoSocratesSpecial property added in the same
  patch; consider aligning them in a follow-up commit.

 Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py | 4 ++++
 Project_Model/Libs/BPE/Classes/TokeNanoCore.py | 8 +++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
index 61d4741..010ff79 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
         VOC_LENGTH = len(self.__vocabulary)
         return BPE_OFFSET + VOC_LENGTH + 1
 
+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.vocabulary)
+
     @property
     def vocabulary(self) -> dict[str, int]:
         return self.__vocabulary
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index f726a95..86bca19 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -24,7 +24,13 @@ class TokeNanoCore:
         self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
         self.__special_encoder = NanoSocratesSpecial(
             BPE_VOCABULARY_SIZE, special_token_list
-        )
+        )
+
+    @property
+    def vocabulary_size(self):
+        BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
+        SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
+        return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
 
     def encode(self, corpus: str) -> list[int]:
         output: list[int] = []