Added `vocabulary_size` properties to NanoSocratesSpecial and TokeNanoCore for reading the vocabulary size

2025-10-04 19:42:29 +02:00
parent 03cdca1f00
commit da0bdf703b
2 changed files with 11 additions and 1 deletions
--- a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
+++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py
@@ -29,6 +29,10 @@ class NanoSocratesSpecial(Encoder):
        VOC_LENGTH = len(self.__vocabulary)
        return BPE_OFFSET + VOC_LENGTH + 1

+    @property
+    def vocabulary_size(self) -> int:
+        return len(self.vocabulary)
+
    @property
    def vocabulary(self) -> dict[str, int]:
        return self.__vocabulary
--- a/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
+++ b/Project_Model/Libs/BPE/Classes/TokeNanoCore.py
@@ -26,6 +26,12 @@ class TokeNanoCore:
            BPE_VOCABULARY_SIZE, special_token_list
        )

+    @property
+    def vocabulary_size(self):
+        BPE_VOC_SIZE = self.__bpe_encoder.vocabulary_size
+        SPECIAL_VOC_SIZE = self.__special_encoder.vocabulary_size
+        return BPE_VOC_SIZE + SPECIAL_VOC_SIZE
+
    def encode(self, corpus: str) -> list[int]:
        output: list[int] = []
        for piece, token_type in self.__splitter.split_text(corpus):