Added a way to see vocabulary size

This commit is contained in:
Christian Risi 2025-10-04 19:42:29 +02:00
parent 03cdca1f00
commit da0bdf703b
2 changed files with 11 additions and 1 deletions

View File

@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
VOC_LENGTH = len(self.__vocabulary) VOC_LENGTH = len(self.__vocabulary)
return BPE_OFFSET + VOC_LENGTH + 1 return BPE_OFFSET + VOC_LENGTH + 1
@property
def vocabulary_size(self) -> int:
    """Return the number of entries in this encoder's vocabulary.

    Only the entries of the mapping exposed by ``self.vocabulary`` are
    counted; no offset is added to the result.
    """
    VOC_LENGTH = len(self.vocabulary)
    return VOC_LENGTH
@property @property
def vocabulary(self) -> dict[str, int]: def vocabulary(self) -> dict[str, int]:
return self.__vocabulary return self.__vocabulary

View File

@ -24,7 +24,13 @@ class TokeNanoCore:
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE) self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
self.__special_encoder = NanoSocratesSpecial( self.__special_encoder = NanoSocratesSpecial(
BPE_VOCABULARY_SIZE, special_token_list BPE_VOCABULARY_SIZE, special_token_list
) )
@property
def vocabulary_size(self) -> int:
    """Return the combined vocabulary size of the tokenizer.

    Returns:
        The sum of the ``vocabulary_size`` reported by the BPE encoder
        and the ``vocabulary_size`` reported by the special-token encoder.
    """
    BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
    SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
    return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
def encode(self, corpus: str) -> list[int]: def encode(self, corpus: str) -> list[int]:
output: list[int] = [] output: list[int] = []