Fix of bugs and semantics
This commit is contained in:
parent
6b9cb7cd35
commit
c5c0c61f79
@@ -2,20 +2,18 @@ from collections import deque
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException, DuplicateWordException


 # ABOUT THE DICTIONARY:
 # the string is converted into utf-8 char bytes, that is: each char is represented by a set of 1 to 4 bytes.
 # each byte gets cast to an integer, such that if an integer has a value lower than 256,
 # then it represents a utf-char-byte, otherwise it is a token-ID.
 class NanoSocratesBatchMemoryBPE:
-    """ Memory to batch training. Keeps token couple frequencies, and merge_treshold
-    """
+    """Memory for batch training. Keeps token-couple frequencies and merge_treshold."""
+
     def __init__(
-        self,
-        frequencies: dict[tuple[int, int], int],
-        merge_treshold: int
+        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
     ) -> None:

         self.frequencies = frequencies
         self.merge_treshold = merge_treshold
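Reviewer note: the comment block above fixes the ID-space convention everything below relies on. A minimal standalone sketch of that convention (variable names here are illustrative, not part of the commit):

```python
# IDs below 256 are raw UTF-8 bytes; IDs from 256 upward are learned merge tokens.
text = "è"                              # one char, two bytes in UTF-8
byte_ids = list(text.encode("utf-8"))
print(byte_ids)                         # [195, 168] -> both < 256: raw bytes

vocabulary = {(195, 168): 256}          # a learned merge takes the next free ID
encoded = [vocabulary[(195, 168)]]      # the byte pair collapses into one token-ID
print(encoded)                          # [256]
```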
@@ -39,7 +37,6 @@ class NanoSocratesBPE(Encoder):
         self.__vocabulary[key] = value
         self.__reverse_vocabulary[value] = key

-
     @property
     def vocabulary_size(self):
         return len(self.__vocabulary) + 256
@@ -62,7 +59,7 @@ class NanoSocratesBPE(Encoder):
         self,
         chunk_data: list[int],
         memory: NanoSocratesBatchMemoryBPE,
-        last_batch: bool
+        last_batch: bool,
     ):

         ENCODED_CHUNK = self.encode_intermediate(chunk_data)
@@ -70,7 +67,7 @@ class NanoSocratesBPE(Encoder):

         # update the frequency of each couple of elements
         for i in range(0, DATA_LEN_BEFORE_LAST):
-            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i+1])
+            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])

             frequency = memory.frequencies.get(CANDIDATE_COUPLE)

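The loop in this hunk accumulates pair frequencies across batch chunks. The same bookkeeping in isolation, as a sketch (a plain dict stands in for `memory.frequencies`):

```python
encoded_chunk = [104, 101, 108, 108, 111, 104, 101]  # "hellohe" as byte IDs

frequencies: dict[tuple[int, int], int] = {}
for i in range(len(encoded_chunk) - 1):
    couple = (encoded_chunk[i], encoded_chunk[i + 1])     # adjacent pair
    frequencies[couple] = frequencies.get(couple, 0) + 1  # count it

print(frequencies[(104, 101)])  # 2 -> the most frequent couple is the merge candidate
```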
@@ -82,7 +79,6 @@ class NanoSocratesBPE(Encoder):
             frequency += 1
             memory.frequencies[CANDIDATE_COUPLE] = frequency

-
         if not last_batch:
             return (self, memory, ENCODED_CHUNK)

@@ -100,9 +96,6 @@ class NanoSocratesBPE(Encoder):

         return (self, memory, ENCODED_CHUNK)

-
-
-
     def encode(self, piece: str) -> list[int]:
         """Encode a string into token IDs: first converts it to utf-8 bytes, then passes the list of integers to encode_intermediate()
         Args:
@@ -114,12 +107,12 @@ class NanoSocratesBPE(Encoder):
         return self.encode_intermediate(converted_piece)

     def encode_intermediate(self, piece: list[int]) -> list[int]:
-        """ Encode a piece (as list of integer) till its maximum
+        """Encode a piece (as a list of integers) up to its maximum
         Args:
             piece (list[int]): piece to encode
         Returns:
             list[int]: piece encoded
         """
         current_piece = piece
         new_piece = self.__round_encode(current_piece)

@@ -130,9 +123,8 @@ class NanoSocratesBPE(Encoder):

         return current_piece

-
     def __round_encode(self, piece: list[int]):
-        """ A single round of encode that traverse all the object. Multiple round are needed for a full encode: \n
+        """A single round of encoding that traverses the whole piece. Multiple rounds are needed for a full encode: \n
         1) "ABAB" -> "XX"
         2) "XX" -> "Y"
         Args:
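The docstring's two-step example ("ABAB" -> "XX" -> "Y") means `__round_encode` must be iterated until the piece stops changing, which is what `encode_intermediate` does. A self-contained sketch of that fixpoint loop over a toy vocabulary (names and IDs hypothetical):

```python
vocabulary = {(65, 66): 300, (300, 300): 301}   # "AB" -> 300, (300, 300) -> 301

def round_encode(piece: list[int]) -> list[int]:
    out: list[int] = []
    i = 0
    while i < len(piece):
        # merge the pair if the vocabulary knows it, otherwise keep one element
        if i + 1 < len(piece) and (piece[i], piece[i + 1]) in vocabulary:
            out.append(vocabulary[(piece[i], piece[i + 1])])
            i += 2
        else:
            out.append(piece[i])
            i += 1
    return out

piece = [65, 66, 65, 66]                 # "ABAB"
while (merged := round_encode(piece)) != piece:
    piece = merged                       # round 1: [300, 300]; round 2: [301]
print(piece)                             # [301]
```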
@@ -146,22 +138,25 @@ class NanoSocratesBPE(Encoder):
             return piece

         PIECE_LENGTH = len(piece) - 1
-        NEW_PIECE : list[int]= []
+        NEW_PIECE: list[int] = []

         index = 0
         while index < PIECE_LENGTH:

-            CANDIDATE_WORD = (piece[index], piece[index + 1]) # take a tuple of consecutive element [int]
+            CANDIDATE_WORD = (
+                piece[index],
+                piece[index + 1],
+            )  # take a tuple of consecutive elements [int]
             CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)

             # if there is no token to substitute the tuple, append the first element
             if CANDIDATE_TOKEN is None:
                 NEW_PIECE.append(piece[index])
                 index += 1

                 # if the latter element of the tuple is the last element of the piece, append it
                 if index == PIECE_LENGTH:
                     NEW_PIECE.append(piece[index])

                 continue

@@ -169,13 +164,10 @@ class NanoSocratesBPE(Encoder):
             NEW_PIECE.append(CANDIDATE_TOKEN)
             index += 2

-
         return NEW_PIECE

-
-
     # TODO: Remake decode to take a list of token IDs
     def decode(self, token_ids: list[int]) -> str:

         # deque: double-ended queue
         token_stack: deque[int] = deque(token_ids)
@@ -185,19 +177,13 @@ class NanoSocratesBPE(Encoder):
             TOKEN_ID = token_stack.popleft()

             if TOKEN_ID < 256:
-                UTF_8_STRING_ARR.append(
-                    TOKEN_ID
-                )
+                UTF_8_STRING_ARR.append(TOKEN_ID)
                 continue

             left_token, right_token = self.__token_decode(TOKEN_ID)

-            token_stack.appendleft(
-                right_token
-            )
-            token_stack.appendleft(
-                left_token
-            )
+            token_stack.appendleft(right_token)
+            token_stack.appendleft(left_token)

         return UTF_8_STRING_ARR.decode("utf-8")

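Decoding inverts the merges: each token-ID of 256 or more is popped and replaced at the front of the deque by its two children, until only raw bytes remain. A sketch of the same flow with the toy vocabulary from above, reversed:

```python
from collections import deque

reverse_vocabulary = {300: (65, 66), 301: (300, 300)}

def decode(token_ids: list[int]) -> str:
    token_stack: deque[int] = deque(token_ids)
    utf_8_bytes = bytearray()
    while token_stack:
        token_id = token_stack.popleft()
        if token_id < 256:
            utf_8_bytes.append(token_id)     # raw UTF-8 byte: emit it
            continue
        left, right = reverse_vocabulary[token_id]
        token_stack.appendleft(right)        # push right first so that
        token_stack.appendleft(left)         # left is re-examined next
    return utf_8_bytes.decode("utf-8")

print(decode([301]))                         # "ABAB"
```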
@@ -211,7 +197,7 @@ class NanoSocratesBPE(Encoder):
         return CANDIDATE_DECODED

     def __learn_word(self, words: tuple[int, int]):
-        """ learn a new couple of object in the vocabulary
+        """Learn a new couple of objects in the vocabulary
         Args:
             words (tuple[int, int]): the pair of elements to substitute with a new token-ID

@@ -1,47 +1,46 @@
 from .Encoder import Encoder
 from ..Errors import OutOfDictionaryException


 class NanoSocratesSpecial(Encoder):

     def __init__(
-        self,
-        vocabulary_index: int ,
-        vocabulary: dict[str, int] | None = None
+        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
     ) -> None:

         super().__init__()

-        if vocabulary is None:
-            self.__vocabulary: dict[str, int] = {}
-        else:
-            self.__vocabulary: dict[str, int] = vocabulary
+        self.__bpe_offset = bpe_vocabulary_size
+        self.__vocabulary: dict[str, int] = {}

         self.__reverse_vocabulary: dict[int, str] = {}

-        if vocabulary_index is None:
-            self.__vocabulary_index = 0
-        else:
-            self.__vocabulary_index = vocabulary_index
-
-        # self.__build_reverse_vocabulary()
-
-    def build_reverse_vocabulary(self):
-        self.__reverse_vocabulary = {v: k for k, v in self.__vocabulary.items()}
-
-    # @property
-    # def vocabulary_size(self):
-    #     return self.__current_index
-
-    def set_vocabulary_index(self, vocabulary_index: int):
-        self.__vocabulary_index = vocabulary_index
-
-    def add_special_word_to_vocabulary(self, word:str):
-        self.__vocabulary_index = self.__vocabulary_index + 1
-        CURRENT_INDEX = self.__vocabulary_index
-        self.__vocabulary[word] = CURRENT_INDEX
-        self.__reverse_vocabulary[CURRENT_INDEX] = word
-
+        if len(special_tokens) == 0:
+            return
+
+        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):
+            CANDIDATE_ID = self.__bpe_offset + index + 1
+            self.__vocabulary[TOKEN] = CANDIDATE_ID
+            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN
+
+    @property
+    def __next_id(self):
+        BPE_OFFSET = self.__bpe_offset
+        VOC_LENGTH = len(self.__vocabulary)
+        return BPE_OFFSET + VOC_LENGTH + 1
+
+    @property
+    def vocabulary(self) -> dict[str, int]:
+        return self.__vocabulary
+
+    @property
+    def reverse_vocabulary(self) -> dict[int, str]:
+        return self.__reverse_vocabulary
+
+    def add_special_word_to_vocabulary(self, word: str):
+        CANDIDATE_INDEX = self.__next_id
+        self.__vocabulary[word] = CANDIDATE_INDEX
+        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

     def encode(self, word: str) -> list[int]:
         ID = self.__vocabulary.get(word)
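With the new constructor, special-token IDs are allocated directly after the BPE range: the i-th special token receives `bpe_vocabulary_size + i + 1`. A quick illustration of the arithmetic (sizes hypothetical, chosen to match the example IDs further down the page):

```python
bpe_vocabulary_size = 1473                   # hypothetical end of the BPE range
special_tokens = ["<SOTL>", "<SOT>", "<EOT>", "<SUBJ>"]

vocabulary = {
    token: bpe_vocabulary_size + index + 1   # IDs continue past the BPE range
    for index, token in enumerate(special_tokens)
}
print(vocabulary)  # {'<SOTL>': 1474, '<SOT>': 1475, '<EOT>': 1476, '<SUBJ>': 1477}
```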
@@ -51,15 +50,15 @@ class NanoSocratesSpecial(Encoder):

         return [ID]

-    def decode(self, token_id: int) -> str:
+    def decode(self, token_id: list[int]) -> str:

-        ID = token_id
+        if len(token_id) != 1:
+            raise OutOfDictionaryException()
+
+        ID = token_id[0]
         WORD = self.__reverse_vocabulary.get(ID)

         if WORD is None:
             raise OutOfDictionaryException()

         return WORD
-
-    def get_reverse_vocabulary(self)-> dict[int, str]:
-        return self.__reverse_vocabulary
@@ -31,7 +31,8 @@ class NanoSocratesSplitter:
             bpe_end = special_token_start
             BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
             if BPE_TOKEN_TEXT != "":
-                yield (BPE_TOKEN_TEXT, TokenType.BPE)
+                for WORD in self.__split_words(BPE_TOKEN_TEXT):
+                    yield (WORD, TokenType.BPE)

             # FIND SPECIAL TOKEN
             SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
@@ -60,6 +61,27 @@ class NanoSocratesSplitter:
         # eof = len(corpus)
         # yield(eof,eof)

+    def __split_words(self, bpe_piece: str) -> Generator[str]:
+
+        END_OF_STRING = len(bpe_piece)
+        bound_start = 0
+        bound_end = END_OF_STRING + 1
+        for i in range(0, END_OF_STRING):
+
+            CANDIDATE_CHAR = bpe_piece[i]
+
+            if CANDIDATE_CHAR != " ":
+                continue
+
+            bound_end = i
+
+            yield bpe_piece[bound_start:bound_end]
+
+            bound_start = bound_end
+            bound_end = END_OF_STRING + 1
+
+        yield bpe_piece[bound_start:bound_end]
+
     def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

         not_special_token_list: list[int] = []
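The new `__split_words` generator added above pre-splits a BPE piece before every space, keeping the space attached to the word that follows, so the splitter now yields word-sized pieces instead of one long chunk. Its behaviour as a standalone sketch:

```python
from typing import Generator

def split_words(bpe_piece: str) -> Generator[str, None, None]:
    end_of_string = len(bpe_piece)
    bound_start, bound_end = 0, end_of_string + 1
    for i in range(end_of_string):
        if bpe_piece[i] != " ":
            continue
        bound_end = i
        yield bpe_piece[bound_start:bound_end]      # word before the space
        bound_start, bound_end = bound_end, end_of_string + 1
    yield bpe_piece[bound_start:bound_end]          # trailing remainder

print(list(split_words("How It Should")))           # ['How', ' It', ' Should']
```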
@@ -1,79 +1,56 @@
 from pathlib import Path

-from Project_Model.Libs.BPE.Classes.NanoSocratesSplitter import NanoSocratesSplitter
-from Project_Model.Libs.BPE.Classes.NanoSocratesBPE import NanoSocratesBPE
-from Project_Model.Libs.BPE.Classes.NanoSocratesSpecial import NanoSocratesSpecial
+from ..Classes import NanoSocratesSplitter
+from ..Classes import NanoSocratesBPE
+from ..Classes import NanoSocratesSpecial

-from Project_Model.Libs.BPE.Utils.special_regex_maker import special_regex_maker
-from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
-from Project_Model.Libs.BPE.Enums import TokenType
-from Project_Model.Libs.BPE.Utils.json_utils import load_json
+from ..Utils import special_regex_maker
+from ..Enums import TokenType


 class TokeNanoCore:
-    def __init__(self,
-        bpe_vocabulary: dict[tuple[int, int], int]
-        # special_vocabulary: dict[str, int]
-    ):
-        self._bpe = NanoSocratesBPE(bpe_vocabulary)
-
-        # special_vocabulary = [token.value for token in SpecialToken]
-        special_token_list = [token.value for token in SpecialToken]
-        self._splitter = NanoSocratesSplitter(special_regex_maker(special_token_list), self._bpe.vocabulary_size)
-
-        self._special_bpe = NanoSocratesSpecial(vocabulary_index=None)  # technically it's not a BPE but more something like an "autoencoder"
-        self.prepare_special_token_vocabulary()
+    def __init__(
+        self,
+        bpe_vocabulary: dict[tuple[int, int], int],
+        special_token_list: list[str],
+        # special_vocabulary: dict[str, int]
+    ):
+
+        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)
+
+        SPECIAL_REGEX = special_regex_maker(special_token_list)
+        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size
+
+        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
+        self.__special_encoder = NanoSocratesSpecial(
+            BPE_VOCABULARY_SIZE, special_token_list
+        )

-    def encode(self, corpus : str) -> list[int]:
-        output : list[int] = []
-        for piece, token_type in self._splitter.split_text(corpus):
+    def encode(self, corpus: str) -> list[int]:
+        output: list[int] = []
+        for piece, token_type in self.__splitter.split_text(corpus):

             if token_type == TokenType.SPECIAL:
-                output.extend(self._special_bpe.encode(piece))
+                output.extend(self.__special_encoder.encode(piece))

             # slow but clear
             if token_type == TokenType.BPE:
-                output.extend(self._bpe.encode(piece))
+                output.extend(self.__bpe_encoder.encode(piece))

         return output

-    def decode(self, corpus : list[int])-> str:
-        output_str = ''
-        for token, token_type in self._splitter.split_tokens(corpus):
+    def decode(self, corpus: list[int]) -> str:
+        output_str = ""
+        for token, token_type in self.__splitter.split_tokens(corpus):

             # token is an integer if special, a list of integers otherwise
             if token_type == TokenType.SPECIAL:
-                output_str += self._special_bpe.decode(token)  # it accepts an integer
+                output_str += self.__special_encoder.decode(
+                    token
+                )  # it accepts an integer

             # slow but clear
             if token_type == TokenType.BPE:
-                output_str += self._bpe.decode(token)  # it accepts a list of integers
+                output_str += self.__bpe_encoder.decode(
+                    token
+                )  # it accepts a list of integers

         return output_str
-
-
-    def prepare_special_token_vocabulary(self):
-        self._special_bpe.set_vocabulary_index(self._bpe.vocabulary_size)
-
-        for special_token in [token.value for token in SpecialToken]:
-            self._special_bpe.add_special_word_to_vocabulary(special_token)
-
-        self._special_bpe.build_reverse_vocabulary()
-
-
-if __name__ == "__main__":
-    dictionary_path = "Assets/Dataset/Tmp/toy_dictionary.json"
-    dictionary = load_json(Path(dictionary_path))
-
-    tokenano = TokeNanoCore(dictionary)
-
-    corpus = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
-    print(corpus)
-
-    encoded_list = tokenano.encode(corpus)
-    print(encoded_list)
-
-    decoded_string = tokenano.decode(encoded_list)
-    print(decoded_string)
-
-    # [1474, 1475, 1477, 100, 98, 112, 45, 100, 98, 114, 58, 72, 111, 119, 95, 73, 116, 95, 83, 104, 111, 117, 108, 100, 95, 72, 97, 118, 101, 95, 69, 110, 100, 101, 100, 1478]
21 Project_Model/Libs/BPE/Enums/SpecialToken.py (new file)
@@ -0,0 +1,21 @@
+from enum import Enum
+
+
+class SpecialToken(Enum):
+    # (Enum, str) -> throws an error
+    START_TRIPLE_LIST = "<SOTL>"
+    START_TRIPLE = "<SOT>"
+    END_TRIPLE = "<EOT>"
+    SUBJECT = "<SUBJ>"
+    RELATIONSHIP = "<PRED>"
+    OBJECT = "<OBJ>"
+    ABSTRACT = "<ABS>"
+    CORPUS_END = "<END>"
+
+    ## Tasks' Token
+    RDF_TO_TEXT = "<RDF2TXT>"
+    TEXT_TO_RDF = "<TEXT2RDF>"
+    CONTINUE_RDF = "<CONTINUERDF>"
+    MASK = "<MASK>"
+
+    # BPE Training: