diff --git a/Project_Model/Libs/BPE/Classes/Encoder.py b/Project_Model/Libs/BPE/Classes/Encoder.py new file mode 100644 index 0000000..800772b --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/Encoder.py @@ -0,0 +1,4 @@ +from abc import ABC + +class Encoder(ABC): + pass \ No newline at end of file diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py new file mode 100644 index 0000000..e551d6c --- /dev/null +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py @@ -0,0 +1,54 @@ +from .Encoder import Encoder +from ..Errors import OutOfDictionaryException + +class NanoSocratesSpecial(Encoder): + + def __init__( + self, + initial_vocabulary: list[str] | None = None + ) -> None: + super().__init__() + + self.__vocabulary: dict[str, int] = {} + self.__reverse_vocabulary: dict[int, str] = {} + self.__current_index = 0 + + if initial_vocabulary is None: + return + + for word in initial_vocabulary: + + CURRENT_INDEX = self.__current_index + self.__vocabulary[word] = CURRENT_INDEX + self.__reverse_vocabulary[CURRENT_INDEX] = word + + self.__current_index += 1 + + @property + def vocabulary_size(self): + return self.__current_index + + def add_special_word(self, word:str): + CURRENT_INDEX = self.__current_index + self.__vocabulary[word] = CURRENT_INDEX + self.__reverse_vocabulary[CURRENT_INDEX] = word + self.__current_index += 1 + + def encode(self, word: str) -> list[int]: + ID = self.__vocabulary.get(word) + + if ID is None: + raise OutOfDictionaryException() + + return [ID] + + def decode(self, token_id: int) -> str: + + ID = token_id + WORD = self.__reverse_vocabulary.get(ID) + + if WORD is None: + raise OutOfDictionaryException() + + return WORD + diff --git a/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py b/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py new file mode 100644 index 0000000..2c4c440 --- /dev/null +++ b/Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py @@ -0,0 +1,4 @@ +class OutOfDictionaryException(Exception): + + def __init__(self, *args: object) -> None: + super().__init__(*args) \ No newline at end of file