Added a way to see vocabulary size
This commit is contained in:
parent
03cdca1f00
commit
da0bdf703b
@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
|
|||||||
VOC_LENGTH = len(self.__vocabulary)
|
VOC_LENGTH = len(self.__vocabulary)
|
||||||
return BPE_OFFSET + VOC_LENGTH + 1
|
return BPE_OFFSET + VOC_LENGTH + 1
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocabulary_size(self) -> int:
|
||||||
|
return len(self.vocabulary)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def vocabulary(self) -> dict[str, int]:
|
def vocabulary(self) -> dict[str, int]:
|
||||||
return self.__vocabulary
|
return self.__vocabulary
|
||||||
|
|||||||
@ -24,7 +24,13 @@ class TokeNanoCore:
|
|||||||
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
|
self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
|
||||||
self.__special_encoder = NanoSocratesSpecial(
|
self.__special_encoder = NanoSocratesSpecial(
|
||||||
BPE_VOCABULARY_SIZE, special_token_list
|
BPE_VOCABULARY_SIZE, special_token_list
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocabulary_size(self):
|
||||||
|
BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
|
||||||
|
SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
|
||||||
|
return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
|
||||||
|
|
||||||
def encode(self, corpus: str) -> list[int]:
|
def encode(self, corpus: str) -> list[int]:
|
||||||
output: list[int] = []
|
output: list[int] = []
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user