diff --git a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py index 132217e..baa5efd 100644 --- a/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py +++ b/Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py @@ -2,7 +2,10 @@ from collections import deque from .Encoder import Encoder from ..Errors import OutOfDictionaryException, DuplicateWordException - +# ABOUT THE DICTIONARY: +# the string is converted into utf-char bytes, that is: each char is represented by a set of bytes from 1 to 4. +# each byte gets cast to an integer; such that, if an integer has a value lower than 256, +# then it is representing a utf-char-byte, otherwise it is a token-ID. class NanoSocratesBatchMemoryBPE: """ Memory to batch training. Keeps token couple frequencies, and merge_treshold """ @@ -31,6 +34,7 @@ class NanoSocratesBPE(Encoder): for key, value in vocabulary.items(): if value < 256: raise OutOfDictionaryException() + # values under 256 are used for unpaired chars # TODO: check if they are in order self.__vocabulary[key] = value self.__reverse_vocabulary[value] = key @@ -38,7 +42,7 @@ class NanoSocratesBPE(Encoder): @property def vocabulary_size(self): - return len(self.__vocabulary) + 255 + return len(self.__vocabulary) + 256 @property def vocabulary(self): @@ -51,7 +55,7 @@ class NanoSocratesBPE(Encoder): Returns: int: """ - return self.vocabulary_size + 1 + return self.vocabulary_size # TODO: implement fit def fit( @@ -64,6 +68,7 @@ class NanoSocratesBPE(Encoder): ENCODED_CHUNK = self.encode_intermediate(chunk_data) DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1 + # update the frequency of each couple of elements for i in range(0, DATA_LEN_BEFORE_LAST): CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1]) @@ -77,6 +82,7 @@ class NanoSocratesBPE(Encoder): frequency += 1 memory.frequencies[CANDIDATE_COUPLE] = frequency + if not last_batch: return (self, memory, ENCODED_CHUNK) @@ -126,13 +132,14 @@ class 
NanoSocratesBPE(Encoder): def __round_encode(self, piece: list[int]): - """_summary_ - + """ A single round of encoding that traverses the whole object. Multiple rounds are needed for a full encode: \n + 1) "ABAB" -> "XX" + 2) "XX" -> "Y" Args: - piece (list[int]): _description_ + piece (list[int]): the object to encode, as a list of integers Returns: - _type_: _description_ + (list[int]): the object after one round of encoding """ if len(piece) == 1: @@ -144,26 +151,31 @@ class NanoSocratesBPE(Encoder): index = 0 while index < PIECE_LENGTH: - CANDIDATE_WORD = (piece[index], piece[index + 1]) + CANDIDATE_WORD = (piece[index], piece[index + 1]) # take a tuple of consecutive elements [int] CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD) + # if there is no token to substitute the tuple, append the first element if CANDIDATE_TOKEN is None: - NEW_PIECE.append(piece[index]) + NEW_PIECE.append(piece[index]) index += 1 + # if the latter element of the tuple is the last element of the piece, append it if index == PIECE_LENGTH: - NEW_PIECE.append(piece[index]) + NEW_PIECE.append(piece[index]) continue + # in this case there was a candidate token to substitute the couple of elements NEW_PIECE.append(CANDIDATE_TOKEN) index += 2 return NEW_PIECE + # TODO: Remake decode to take a list of token IDs def decode(self, token_ids: list[int]) -> str: + # deque: double-ended queue token_stack: deque[int] = deque(token_ids) @@ -199,7 +211,13 @@ class NanoSocratesBPE(Encoder): return CANDIDATE_DECODED def __learn_word(self, words: tuple[int, int]): + """ Learn a new couple of elements into the vocabulary + Args: + words (tuple[int, int]): the pair of elements to substitute with a new tokenID + Raises: + DuplicateWordException: raised if there is a duplicate of the new tokenID in the dictionary + """ ID = self.__next_id DUPLICATE = self.__vocabulary.get(words)