Fixed a bug where a token (int) was yielded instead of a list of int
This commit is contained in:
parent 845d645348
commit e8894504c6
@@ -1,23 +1,20 @@
 import re
 from collections import deque
 from typing import Generator
 from ..Enums import TokenType



 class NanoSocratesSplitter:

     def __init__(
-        self,
-        special_token_regex: re.Pattern,
-        max_bpe_token_id: int = 255
+        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
     ) -> None:
         # attention the regex got already compiled
         self.__special_token_regex = special_token_regex
-        self.__max_bpe_token_id : int = max_bpe_token_id # used for decoding
+        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

     def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
-        """ Split a text using a regex given
+        """Split a text using a regex given
         Args:
             corpus (str): all the corpus string to split
         Yields:
@@ -26,7 +23,7 @@ class NanoSocratesSplitter:
         """

         bpe_start = 0
         bpe_end = len(corpus) # this can be deleted!

         for special_token_start, special_token_end in self.__find_boundaries(corpus):

@@ -45,7 +42,6 @@ class NanoSocratesSplitter:
             # it will used in the next interaction
             bpe_start = special_token_end

-
     def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
         """
         Find each time the start and end (not included) of the special token
@@ -53,21 +49,20 @@
         corpus (str): the string where the special token will be searched
         Yields:
             Generator[tuple[int, int]]: Note the end is not included
         """
         for match in self.__special_token_regex.finditer(corpus):
             start = match.start()
             end = match.end()

             yield (start, end)

         # make the last boundary be the end of corpus
         # eof = len(corpus)
         # yield(eof,eof)

-
-    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]] :
-
-        not_special_token_list : list[int]= []
+    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:
+
+        not_special_token_list: list[int] = []

         for token in corpus:
             if token > self.__max_bpe_token_id:
@@ -75,8 +70,7 @@ class NanoSocratesSplitter:
                     yield (not_special_token_list, TokenType.BPE)
                     not_special_token_list = []

-                yield (token, TokenType.SPECIAL)
+                yield ([token], TokenType.SPECIAL)
                 continue

             not_special_token_list.append(token)
-
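
With this change every item yielded by split_tokens has the same shape, (list[int], TokenType), so callers no longer need to special-case a bare int for special tokens. Below is a minimal standalone sketch of the behavior after the fix; the TokenType stand-in, the free-function form, and the final flush of a trailing BPE run are assumptions for illustration, while the yield shapes and the max_bpe_token_id threshold come from the diff.

from enum import Enum, auto
from typing import Generator


class TokenType(Enum):  # stand-in for ..Enums.TokenType
    BPE = auto()
    SPECIAL = auto()


def split_tokens(
    corpus: list[int], max_bpe_token_id: int = 255
) -> Generator[tuple[list[int], TokenType], None, None]:
    """Yield (token-id list, TokenType) pairs, flushing buffered BPE ids
    whenever an id above max_bpe_token_id (a special token) is seen."""
    bpe_run: list[int] = []
    for token in corpus:
        if token > max_bpe_token_id:
            if bpe_run:  # emit any pending BPE run first
                yield (bpe_run, TokenType.BPE)
                bpe_run = []
            yield ([token], TokenType.SPECIAL)  # the fix: wrap the int in a list
            continue
        bpe_run.append(token)
    if bpe_run:  # assumed: flush a trailing BPE run at the end
        yield (bpe_run, TokenType.BPE)


# Every yielded first element is now a list[int], never a bare int:
for ids, kind in split_tokens([65, 66, 300, 67]):
    print(ids, kind)
# [65, 66] TokenType.BPE
# [300] TokenType.SPECIAL
# [67] TokenType.BPE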