diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
index 61d4741..010ff79 100644
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
         VOC_LENGTH = len(self.__vocabulary)
         return BPE_OFFSET + VOC_LENGTH + 1
 
+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.vocabulary)
+
     @property
     def vocabulary(self) -> dict[str, int]:
         return self.__vocabulary
diff --git a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
index f726a95..86bca19 100644
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -24,7 +24,13 @@ class TokeNanoCore:
         self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
         self.__special_encoder = NanoSocratesSpecial(
             BPE_VOCABULARY_SIZE, special_token_list
-        )
+        )
+
+    @property
+    def vocabulary_size(self):
+        BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
+        SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
+        return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
 
     def encode(self, corpus: str) -> list[int]:
         output: list[int] = []