Added a way to see vocabulary size
This commit is contained in:
parent
03cdca1f00
commit
da0bdf703b
@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
|
||||
VOC_LENGTH = len(self.__vocabulary)
|
||||
return BPE_OFFSET + VOC_LENGTH + 1
|
||||
|
||||
@property
def vocabulary_size(self) -> int:
    """Return the number of entries in this encoder's vocabulary.

    The count is taken from the mapping exposed by the ``vocabulary``
    property, so it always reflects that mapping's current length.
    """
    return len(self.vocabulary)
|
||||
|
||||
@property
def vocabulary(self) -> dict[str, int]:
    """Return the encoder's vocabulary mapping.

    The internal dictionary itself is returned, not a copy, so any
    mutation made by the caller is visible to the encoder.
    """
    return self.__vocabulary
|
||||
|
||||
@ -26,6 +26,12 @@ class TokeNanoCore:
|
||||
BPE_VOCABULARY_SIZE, special_token_list
|
||||
)
|
||||
|
||||
@property
def vocabulary_size(self) -> int:
    """Return the combined vocabulary size of both wrapped encoders.

    The result is the sum of the ``vocabulary_size`` reported by the BPE
    encoder and the one reported by the special-token encoder.
    """
    bpe_size = self.__bpe_encoder.vocabulary_size
    special_size = self.__special_encoder.vocabulary_size
    return bpe_size + special_size
|
||||
|
||||
def encode(self, corpus: str) -> list[int]:
|
||||
output: list[int] = []
|
||||
for piece, token_type in self.__splitter.split_text(corpus):
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user