Compare commits
74 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 1d23b9cc8b | |
| | 165290162c | |
| | 502016f843 | |
| | 845c63dbef | |
| | bbadd4c521 | |
| | c2f9344c82 | |
| | 25f3a5d221 | |
| | 149deb407d | |
| | 8a21cb1b73 | |
| | d2a3dfe90f | |
| | 0f95aeb122 | |
| | 0ee6e48004 | |
| | 55e0d2ac23 | |
| | 9c5f42153f | |
| | c74689d01d | |
| | 51f491d033 | |
| | c5c0c61f79 | |
| | 6b9cb7cd35 | |
| | e8894504c6 | |
| | 845d645348 | |
| | 09f7b39512 | |
| | 070dc1b744 | |
| | 8121c75a09 | |
| | a5b8692a77 | |
| | 7c935d2700 | |
| | a1d143187d | |
| | 0eef2148a9 | |
| | 856bd8909c | |
| | 2e595a3a23 | |
| | 2194cc7b4f | |
| | 1eae8582b2 | |
| | eadba1fb82 | |
| | aa765b4555 | |
| | 17d82f0a4e | |
| | 0975c19e69 | |
| | 3fe4e45ceb | |
| | d19426fa62 | |
| | 63baf29805 | |
| | b80b4e4112 | |
| | 7cfaf601b4 | |
| | fbbe6226bb | |
| | b3d444979f | |
| | 66bcf6e55f | |
| | dbf1d99408 | |
| | 97bac464f3 | |
| | 9a8e726d74 | |
| | 7ab9b0358e | |
| | 30c2938d29 | |
| | 76f24d4eb0 | |
| | 89a0a1f4bb | |
| | ccacea18d8 | |
| | b09bd4acba | |
| | c9032cab09 | |
| | 7020c9e683 | |
| | 2fe1ce9e9a | |
| | 18fc2ba9d8 | |
| | 5acee1d1a5 | |
| | 2e36753da4 | |
| | 564b0d712e | |
| | e433941405 | |
| | b46df4f91a | |
| | d179e01971 | |
| | b071145f6e | |
| | ed0255e99b | |
| | 3e8b5c5579 | |
| | 8db35732f9 | |
| | 9552d61f8d | |
| | be8a87ce01 | |
| | 5801a819e9 | |
| | 3f48b5c428 | |
| | 9972ab8a51 | |
| | 90012285b5 | |
| | 1bbb4a0999 | |
| | ee0aa583d5 | |
.gitignore (vendored, 1 line added)

```diff
@@ -254,4 +254,5 @@ $RECYCLE.BIN/
 # ---> Custom
 **/Tmp/**
+**/cache/**
 !**/.gitkeep
```
.vscode/launch.json (vendored, new file, 16 lines)

```jsonc
{
    // Use IntelliSense to learn about possible attributes.
    // Hover to view descriptions of existing attributes.
    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Python Debugger: Current File with Arguments",
            "type": "debugpy",
            "request": "launch",
            "program": "${file}",
            "console": "integratedTerminal",
            "args": "${command:pickArgs}"
        }
    ]
}
```
.vscode/settings.json (vendored, 24 lines replaced by 55)

@@ -1,24 +1,55 @@

Content after the change (the previous settings are kept as a commented-out block at the end of the file):

```jsonc
{
    // Always treat the project root as the working dir for Jupyter
    "jupyter.notebookFileRoot": "${workspaceFolder}",

    // When you click "Run Python File in Terminal", DON'T cd into the file's folder
    "python.terminal.executeInFileDir": false,

    // Start new integrated terminals at the project root
    "terminal.integrated.cwd": "${workspaceFolder}",

    // Make pytest run from the root without needing a pytest.ini
    "python.testing.pytestEnabled": true,
    "python.testing.cwd": "${workspaceFolder}",
    "python.testing.pytestArgs": [
        "src/test"
    ],

    // Help Pylance resolve imports like `from src...` without red squiggles
    "python.analysis.extraPaths": [
        "${workspaceFolder}"
    ],

    // For linux
    "terminal.integrated.env.linux": {
        "PYTHONPATH": "${workspaceFolder}"
    },

    // For OSX
    "terminal.integrated.env.osx": {
        "PYTHONPATH": "${workspaceFolder}"
    },

    // For Windows
    "terminal.integrated.env.windows": {
        "PYTHONPATH": "${workspaceFolder}"
    },

    "python.analysis.typeCheckingMode": "standard"
}

// {
//     // Always treat the project root as the working dir for Jupyter
//     "jupyter.notebookFileRoot": "${workspaceFolder}",
//
//     // When you click "Run Python File in Terminal", DON'T cd into the file's folder
//     "python.terminal.executeInFileDir": false,
//
//     // Start new integrated terminals at the project root
//     "terminal.integrated.cwd": "${workspaceFolder}",
//
//     // Ensure Python can import from the project root no matter which file you run
//     // (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
//     "terminal.integrated.env.windows": {
//         "PYTHONPATH": "${workspaceFolder}"
//     },
//
//     // Make pytest run from the root without needing a pytest.ini
//     "python.testing.pytestEnabled": true,
//     "python.testing.cwd": "${workspaceFolder}",
//     "python.testing.pytestArgs": ["src/test"],
//
//     // Help Pylance resolve imports like `from src...` without red squiggles
//     "python.analysis.extraPaths": ["${workspaceFolder}"]
// }
```
Assets/Model/toy_10/README.md (BIN, LFS, new file): binary file not shown.

Assets/Model/toy_10/toy_dictionary.json (BIN, LFS, new file): binary file not shown.
Project_Model/Libs/BPE/Classes/Encoder.py (new file, 4 lines)

```python
from abc import ABC


class Encoder(ABC):
    pass
```
Project_Model/Libs/BPE/Classes/NanoSocraTraineRam.py (new file, 164 lines)

```python
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


class NanoSocraTraineRam:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        current_iteration = 0
        data = self.__gather_data_from_file(path)

        while not exit:

            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            _, data, last_memory = self.__round_train(BPE, data)

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",  # type: ignore (pretty sure it's not None)
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        DATA_LEN = len(data)
        NEW_DATA = []

        counter = 0
        memory = NanoSocratesBatchMemoryBPE({}, 0)
        while len(data) > 0:
            counter += 1
            last_batch = len(data) == 1

            piece = data.pop()

            bpe, memory, output = bpe.fit(piece, memory, last_batch)

            if counter % int(1E6) == 0:
                print(f"Fitted: {counter}/{DATA_LEN}")

            if len(output) < 2:
                continue

            NEW_DATA.append(output)

        return (bpe, NEW_DATA, memory)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:

        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        DATA: list[list[int]] = []

        FILE = open(path, "r", encoding="utf-8")
        file_string = FILE.read()
        FILE.close()

        for piece, type in SPLITTER.split_text(file_string):

            if type != TokenType.BPE:
                continue

            int_list = self.__make_list_ids(piece)
            DATA.append(int_list)

        return DATA

    def __increment_counter(self, counter: int):

        # What if overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str):
        return list(corpus.encode("utf-8"))
```
Project_Model/Libs/BPE/Classes/NanoSocraTrainer.py (new file, 248 lines)

```python
from collections import deque
import datetime
from pathlib import Path
import re
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


class NanoSocraTrainer:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        chunk_size: int,
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__chunk_size = chunk_size
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    def trainBPE(
        self,
        path: Path,
        cache_dir: Path,
        bpe: NanoSocratesBPE | None = None,
        resume_from_iter: int = 0,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if not cache_dir.is_dir():
            raise NotADirectoryError()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        cached = False
        current_iteration = 0
        input_path = path

        NEXT_ITERATION = resume_from_iter + 1 if resume_from_iter != 0 else 0

        PATH_GEN = self.__switch_paths(path, cache_dir, NEXT_ITERATION)
        MEMORY_PATH_GEN = self.__switch_memory(cache_dir, resume_from_iter)

        if resume_from_iter != 0:
            cached = True
            current_iteration = resume_from_iter
            input_path = next(PATH_GEN)
            # UGLY: fixes a bug immediately, unfortunately
            _, _ = next(MEMORY_PATH_GEN)
            _, voc_cache_path = next(MEMORY_PATH_GEN)
            vocabulary = load_nanos_vocabulary(voc_cache_path)
            BPE = NanoSocratesBPE(vocabulary)

        while not exit:

            out_path = next(PATH_GEN)
            internal_cache_path, vocabulary_cache = next(MEMORY_PATH_GEN)

            current_iteration = self.__increment_counter(current_iteration)
            LAST_VOC_SIZE = BPE.vocabulary_size

            FILE = open(out_path, "w")

            last_memory = None

            for _, memory, output in self.__round_train(input_path, BPE, cached):
                last_memory = memory
                FILE.write(output)

            FILE.close()

            internal_cache = {
                "finished_iter": current_iteration,
                "read_from": f"{input_path}",
                "wrote_to": f"{out_path}",
                "at": datetime.datetime.now(datetime.timezone.utc).strftime(
                    "%Y-%m-%d %H:%M:%S.%f"
                )[:-3],
            }

            VOCABULARY = BPE.vocabulary

            save_json(internal_cache, internal_cache_path)
            save_nanos_vocabulary(VOCABULARY, vocabulary_cache)

            cached = True
            input_path = out_path

            NEW_VOC_SIZE = BPE.vocabulary_size

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size}\n",
                        f"\tFrequencies:\n{last_memory.frequencies}\n",  # type: ignore (pretty sure it's not None)
                        f"\tvocabulary:\n{BPE.vocabulary}",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, path: Path, bpe: NanoSocratesBPE, cached: bool):

        CHUNKER = NanoSocratesChunker(self.__chunk_size, self.__special_token_regex)
        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        BPE = bpe
        memory = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        CHUNKER_GENERATOR = iterator_with_checks(CHUNKER.chunk(path))

        for chunk, last_chunk in CHUNKER_GENERATOR:

            PIECE_GENERATOR = iterator_with_checks(SPLITTER.split_text(chunk))

            for piece, last_piece in PIECE_GENERATOR:

                LAST_BATCH = last_chunk and last_piece
                PIECE, TOKEN_TYPE = piece

                if TOKEN_TYPE != TokenType.BPE:
                    _, _, out = BPE.fit([], memory, LAST_BATCH)
                    yield (BPE, memory, PIECE)
                    continue

                PIECE_DATA = self.__make_list_ids(PIECE, cached)

                _, _, out = BPE.fit(PIECE_DATA, memory, LAST_BATCH)

                OUT_STRING = f"{out}"
                yield (BPE, memory, OUT_STRING)

    def __increment_counter(self, counter: int):

        # What if overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str, cached: bool):

        if not cached:
            return list(corpus.encode("utf-8"))

        REDUCED_CORPUS_LEN = len(corpus) - 1

        # Skip these chars: "[" "]"
        INTS = corpus[1:REDUCED_CORPUS_LEN]
        INT_LIST = list(map(int, INTS.split(",")))
        return INT_LIST

    def __switch_paths(self, path: Path, cache_path: Path, initial_iteration: int):

        CORPUS_TMP_1 = cache_path / "corpus-tmp1.txt"
        CORPUS_TMP_2 = cache_path / "corpus-tmp2.txt"

        switch = True

        if initial_iteration % 2 == 1:
            switch = False

        del initial_iteration

        while True:
            if switch:
                yield CORPUS_TMP_1
            else:
                yield CORPUS_TMP_2
            switch = not switch

    def __switch_memory(self, cache_path: Path, initial_iteration: int):

        INTERNAL_TMP_1 = cache_path / "internal-tmp1.json"
        INTERNAL_TMP_2 = cache_path / "internal-tmp2.json"

        VOCAB_TMP_1 = cache_path / "voc-tmp1.json"
        VOCAB_TMP_2 = cache_path / "voc-tmp2.json"

        switch = False

        if initial_iteration % 2 == 1:
            switch = True

        del initial_iteration

        while True:
            if switch:
                yield (INTERNAL_TMP_1, VOCAB_TMP_1)
            else:
                yield (INTERNAL_TMP_2, VOCAB_TMP_2)
            switch = not switch
```
Project_Model/Libs/BPE/Classes/NanoSocraTrainerPool.py (new file, 280 lines)

```python
from collections import deque
import datetime
import itertools
from multiprocessing import Pool
import os
from pathlib import Path
import re
import time
from ..Classes import (
    NanoSocratesBPE,
    NanoSocratesChunker,
    NanoSocratesSplitter,
    NanoSocratesBatchMemoryBPE,
)
from ..Enums import TokenType
from ..Utils import (
    special_regex_maker,
    iterator_with_checks,
    save_nanos_vocabulary,
    load_nanos_vocabulary,
    save_json,
    load_json,
)


def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


def split_fit(object: tuple[NanoSocratesBPE, list[list[int]]]):

    bpe, data = object

    NEW_DATA: list[list[int]] = []

    memory = NanoSocratesBatchMemoryBPE({}, 0)

    while len(data) > 0:

        piece = data.pop()

        bpe, memory, output = bpe.fit(piece, memory, False)

        if len(output) < 2:
            continue

        # We are sure of its type
        NEW_DATA.append(piece)  # type: ignore

    return (bpe, NEW_DATA, memory)


def split_encode(object: tuple[NanoSocratesBPE, list[list[int]]]):

    bpe, data = object

    NEW_DATA: list[list[int]] = []

    for index, piece in zip(range(0, len(data)), data):
        output = bpe.encode_intermediate(piece)

        if len(output) < 2:
            continue

        # We are sure of its type
        NEW_DATA.append(data[index])  # type: ignore

    return NEW_DATA


class NanoSocraTrainerPool:

    def __init__(
        self,
        max_vocabulary: int,
        special_vocabulary: list[str],
        merge_treshold: int = 0,
        max_iterations: int = 0,
        print_after_iterations: int = 1,
    ) -> None:
        # Bytes
        BYTE_RESERVED_TOKENS = 256
        SPECIAL_RESERVED_TOKENS = len(special_vocabulary)
        RESERVED_TOKENS = BYTE_RESERVED_TOKENS + SPECIAL_RESERVED_TOKENS

        self.__max_vocabulary = max_vocabulary - RESERVED_TOKENS
        self.__max_iterations = max_iterations
        self.__merge_treshold = merge_treshold
        self.__special_token_regex = special_regex_maker(special_vocabulary)
        self.__print_after_iterations = print_after_iterations

    # TODO: add a resume function
    def trainBPE(
        self,
        path: Path,
        cache_file: Path,
        bpe: NanoSocratesBPE | None = None,
    ) -> NanoSocratesBPE:

        if not path.is_file():
            raise FileNotFoundError()

        if not cache_file.is_file():
            file = cache_file.open("w")
            file.close()

        if bpe is None:
            bpe = NanoSocratesBPE()
        BPE = bpe

        if BPE.vocabulary_size > self.__max_vocabulary:
            return BPE

        exit = False
        current_iteration = 0
        data = self.__gather_data_from_file(path)
        data = self.__encode_from_cache(BPE, data)

        while not exit:

            current_iteration = self.__increment_counter(current_iteration)

            LAST_VOC_SIZE = BPE.vocabulary_size

            last_memory = None

            start = time.time_ns()
            _, data, last_memory = self.__round_train(BPE, data)
            end = time.time_ns()
            NEW_VOC_SIZE = BPE.vocabulary_size

            VOCABULARY = BPE.vocabulary

            save_nanos_vocabulary(VOCABULARY, cache_file)

            if current_iteration % self.__print_after_iterations == 0:

                DELIMITER = "==============="

                DEBUG = "\n".join(
                    [
                        DELIMITER,
                        f"ITERATION: {current_iteration}",
                        DELIMITER,
                        f"\tVocabulary size: {BPE.vocabulary_size - 256}\n",
                        f"\tTime elapsed: {(end - start)/1E9}s",
                        DELIMITER,
                        "",
                    ]
                )
                print(DEBUG)

            if LAST_VOC_SIZE == NEW_VOC_SIZE:
                exit = True
                continue

            if current_iteration == self.__max_iterations:
                exit = True
                continue

            if BPE.vocabulary_size == self.__max_vocabulary:
                exit = True
                continue

        return BPE

    def __round_train(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        NEW_DATA: list[list[int]] = []

        MEMORY = NanoSocratesBatchMemoryBPE({}, self.__merge_treshold)

        fit_funct = split_fit
        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise Exception()

        VOCABULARY = bpe.vocabulary

        data_chunks = split(data, CPU_COUNT)
        JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]

        JOB_RESULTS: list[
            tuple[NanoSocratesBPE, list[list[int]], NanoSocratesBatchMemoryBPE]
        ]

        with Pool() as pool:
            JOB_RESULTS = pool.map(fit_funct, JOBS)

        for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
            _, job_output, job_memory = res
            NEW_DATA.extend(job_output)

            for key, value in job_memory.frequencies.items():
                frequency = MEMORY.frequencies.get(key)

                if frequency is None:
                    frequency = 0
                    MEMORY.frequencies[key] = 0

                frequency += value
                MEMORY.frequencies[key] = frequency

            del job_output
            del job_memory

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        # Get new token
        bpe.fit([], MEMORY, True)

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return (bpe, NEW_DATA, MEMORY)

    def __gather_data_from_file(self, path: Path) -> list[list[int]]:

        SPLITTER = NanoSocratesSplitter(self.__special_token_regex)

        DATA: list[list[int]] = []

        FILE = open(path, "r", encoding="utf-8")
        file_string = FILE.read()
        FILE.close()

        for piece, type in SPLITTER.split_text(file_string):

            if type != TokenType.BPE:
                continue

            int_list = self.__make_list_ids(piece)
            DATA.append(int_list)

        return DATA

    def __encode_from_cache(self, bpe: NanoSocratesBPE, data: list[list[int]]):

        NEW_DATA: list[list[int]] = []

        CPU_COUNT = os.process_cpu_count()

        if CPU_COUNT is None:
            raise Exception()

        VOCABULARY = bpe.vocabulary

        data_chunks = split(data, CPU_COUNT)
        JOBS = [(NanoSocratesBPE(VOCABULARY), chunk) for chunk in data_chunks]

        JOB_RESULTS: list[list[list[int]]]

        with Pool() as pool:
            JOB_RESULTS = pool.map(split_encode, JOBS)

        for i, res in zip(range(0, CPU_COUNT), JOB_RESULTS):
            job_output = res
            NEW_DATA.extend(job_output)

            del job_output

            print(f"Joined {i + 1} out of {CPU_COUNT}")

        print(f"Sentences from {len(data)} to {len(NEW_DATA)}")

        return NEW_DATA

    def __increment_counter(self, counter: int):

        # What if overflows???
        try:
            counter += 1
        except:
            print("Integer overflow")
            counter = 1

        return counter

    def __make_list_ids(self, corpus: str):
        return list(corpus.encode("utf-8"))
```
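The `split` helper above distributes the corpus across worker processes in nearly equal slices. A minimal standalone sketch of its behaviour (the definition is copied verbatim from the file above):

```python
# Copied from NanoSocraTrainerPool.py: divide a sequence into n nearly equal slices.
def split(a, n):
    k, m = divmod(len(a), n)
    return (a[i * k + min(i, m) : (i + 1) * k + min(i + 1, m)] for i in range(n))


# 10 items over 3 workers: the first slice absorbs the remainder.
print(list(split(list(range(10)), 3)))
# [[0, 1, 2, 3], [4, 5, 6], [7, 8, 9]]
```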
Project_Model/Libs/BPE/Classes/NanoSocratesBPE.py (new file, 219 lines)

```python
from collections import deque
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException, DuplicateWordException


# ABOUT THE DICTIONARY:
# the string is converted into UTF-8 bytes, that is: each char is represented with 1 to 4 bytes.
# each byte gets cast to an integer, such that, if an integer has a value lower than 256,
# then it represents a UTF-8 byte, otherwise it is a token ID.
class NanoSocratesBatchMemoryBPE:
    """Memory for batch training. Keeps token-pair frequencies and the merge threshold."""

    def __init__(
        self, frequencies: dict[tuple[int, int], int], merge_treshold: int
    ) -> None:

        self.frequencies = frequencies
        self.merge_treshold = merge_treshold


class NanoSocratesBPE(Encoder):

    def __init__(self, vocabulary: dict[tuple[int, int], int] | None = None) -> None:
        super().__init__()

        self.__vocabulary: dict[tuple[int, int], int] = {}
        self.__reverse_vocabulary: dict[int, tuple[int, int]] = {}

        if vocabulary is None:
            return

        for key, value in vocabulary.items():
            if value < 256:
                raise OutOfDictionaryException()
            # values under 256 are reserved for single (unpaired) bytes
            # TODO: check if they are in order
            self.__vocabulary[key] = value
            self.__reverse_vocabulary[value] = key

    @property
    def vocabulary_size(self):
        return len(self.__vocabulary) + 256

    @property
    def vocabulary(self):
        return self.__vocabulary

    @property
    def __next_id(self) -> int:
        """
        Gets the next free token ID.
        Returns:
            int:
        """
        return self.vocabulary_size

    # TODO: implement fit
    def fit(
        self,
        chunk_data: list[int],
        memory: NanoSocratesBatchMemoryBPE,
        last_batch: bool,
    ):

        ENCODED_CHUNK = self.encode_intermediate(chunk_data)
        DATA_LEN_BEFORE_LAST = len(ENCODED_CHUNK) - 1

        # update the frequency of each pair of elements
        for i in range(0, DATA_LEN_BEFORE_LAST):
            CANDIDATE_COUPLE = (ENCODED_CHUNK[i], ENCODED_CHUNK[i + 1])

            frequency = memory.frequencies.get(CANDIDATE_COUPLE)

            # Initialize frequency
            if frequency is None:
                frequency = 0
                memory.frequencies[CANDIDATE_COUPLE] = 0

            frequency += 1
            memory.frequencies[CANDIDATE_COUPLE] = frequency

        if not last_batch:
            return (self, memory, ENCODED_CHUNK)

        if len(memory.frequencies) < 1:
            return (self, memory, ENCODED_CHUNK)

        FREQUENCIES = memory.frequencies
        MAX_COUPLE = max(FREQUENCIES.items(), key=lambda item: item[1])[0]
        FREQUENCY = FREQUENCIES[MAX_COUPLE]

        if FREQUENCY < memory.merge_treshold:
            return (self, memory, ENCODED_CHUNK)

        self.__learn_word(MAX_COUPLE)

        return (self, memory, ENCODED_CHUNK)

    def encode(self, piece: str) -> list[int]:
        """Encode a string into token IDs: first converts it to UTF-8 bytes, then passes the list of integers to encode_intermediate()
        Args:
            piece (str):
        Returns:
            list[int]:
        """
        converted_piece = list(piece.encode("utf-8"))
        return self.encode_intermediate(converted_piece)

    def encode_intermediate(self, piece: list[int]) -> list[int]:
        """Encode a piece (as a list of integers) until no further merges apply
        Args:
            piece (list[int]): piece to encode
        Returns:
            list[int]: encoded piece
        """
        current_piece = piece
        new_piece = self.__round_encode(current_piece)

        # keep encoding until the piece stops shrinking
        while len(current_piece) != len(new_piece):
            current_piece = new_piece
            new_piece = self.__round_encode(current_piece)

        return current_piece

    def __round_encode(self, piece: list[int]):
        """A single encoding pass that traverses the whole piece. Multiple rounds are needed for a full encode: \n
        1) "ABAB" -> "XX"
        2) "XX" -> "Y"
        Args:
            piece (list[int]): the piece to encode, as a list of integers

        Returns:
            (list[int]): the piece after one encoding pass
        """

        if len(piece) == 1:
            return piece

        PIECE_LENGTH = len(piece) - 1
        NEW_PIECE: list[int] = []

        index = 0
        while index < PIECE_LENGTH:

            CANDIDATE_WORD = (
                piece[index],
                piece[index + 1],
            )  # take a tuple of consecutive elements [int]
            CANDIDATE_TOKEN = self.__vocabulary.get(CANDIDATE_WORD)

            # if there is no token to substitute the tuple, append the first element
            if CANDIDATE_TOKEN is None:
                NEW_PIECE.append(piece[index])
                index += 1

                # if the latter element of the tuple is the last element of the piece, append it
                if index == PIECE_LENGTH:
                    NEW_PIECE.append(piece[index])

                continue

            # in this case there was a candidate token to substitute the pair of elements
            NEW_PIECE.append(CANDIDATE_TOKEN)

            index += 2

            if index == PIECE_LENGTH:
                NEW_PIECE.append(piece[index])

        return NEW_PIECE

    # TODO: Remake decode to take a list of token IDs
    def decode(self, token_ids: list[int]) -> str:

        # deque: double ended queue
        token_stack: deque[int] = deque(token_ids)
        UTF_8_STRING_ARR: bytearray = bytearray()

        while len(token_stack) > 0:
            TOKEN_ID = token_stack.popleft()

            if TOKEN_ID < 256:
                UTF_8_STRING_ARR.append(TOKEN_ID)
                continue

            left_token, right_token = self.__token_decode(TOKEN_ID)

            token_stack.appendleft(right_token)
            token_stack.appendleft(left_token)

        return UTF_8_STRING_ARR.decode("utf-8")

    def __token_decode(self, token_id: int) -> tuple[int, int]:

        CANDIDATE_DECODED = self.__reverse_vocabulary.get(token_id)

        if CANDIDATE_DECODED is None:
            raise OutOfDictionaryException()

        return CANDIDATE_DECODED

    def __learn_word(self, words: tuple[int, int]):
        """Learn a new pair in the vocabulary
        Args:
            words (tuple[int, int]): the pair of elements to substitute with a new token ID

        Raises:
            DuplicateWordException: raised if the pair already has a token ID in the dictionary
        """
        ID = self.__next_id

        DUPLICATE = self.__vocabulary.get(words)

        if DUPLICATE is not None:
            raise DuplicateWordException()

        self.__vocabulary[words] = ID
        self.__reverse_vocabulary[ID] = words
```
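A minimal usage sketch of `NanoSocratesBPE`, built on the toy vocabulary that the tests further down in this diff use; token IDs below 256 are raw UTF-8 bytes, higher IDs are learned merges:

```python
import Project_Model.Libs.BPE as BPE

# (97, 98) is ("a", "b"); 256, 257, 258 are merge tokens learned on top of the byte range.
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}

bpe = BPE.NanoSocratesBPE(VOCABULARY)

print(bpe.encode("abababab"))       # [258]: "ab" -> 256, (256, 256) -> 257, (257, 257) -> 258
print(bpe.decode([258, ord("c")]))  # "ababababc": IDs below 256 pass through as plain bytes
```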
Project_Model/Libs/BPE/Classes/NanoSocratesChunker.py (new file, 70 lines)

```python
from pathlib import Path
import re
from ..Errors import DelimiterNotFoundException


class NanoSocratesChunker:

    def __init__(self, max_size: int, special_token_regex: re.Pattern) -> None:
        self.__max_size: int = max_size
        self.__special_token_regex: re.Pattern = special_token_regex
        self.__residual: str = ""

    # max theoretical size of chars
    # between special tokens:
    # - min: size - len(longest_token)
    # - MAX: size - len(shortest_token)
    def chunk(self, file_path: Path):
        # read_file
        FILE = open(file_path, "r", encoding="utf-8")
        exit = False

        while not exit:
            REMAINING_SIZE = self.__max_size - len(self.__residual)
            READ_SIZE = min(self.__max_size, REMAINING_SIZE)
            FILE_CHUNK = FILE.read(READ_SIZE)

            if len(FILE_CHUNK) == 0:
                exit = True
                continue

            CHUNK = self.__append_residuals(FILE_CHUNK)

            boundaries = self.__identify_boudaries(CHUNK)

            if boundaries is None:

                # boundaries not found in 2 chunks,
                if len(CHUNK) > self.__max_size - 1:
                    raise DelimiterNotFoundException()

                if exit:
                    yield CHUNK

                self.__set_residual(0, CHUNK)
                continue

            start, end = boundaries
            self.__set_residual(end, CHUNK)
            yield CHUNK[start:end]

    def __identify_boudaries(self, corpus: str) -> tuple[int, int] | None:

        end = 0

        for match in self.__special_token_regex.finditer(corpus):
            # print(match)
            end = match.end()

        if end == 0:
            return None

        return (0, end)

    def __append_residuals(self, corpus: str) -> str:
        RESIDUAL = self.__residual
        self.__residual = ""
        return RESIDUAL + corpus

    def __set_residual(self, index: int, corpus: str):
        self.__residual = corpus[index:]
```
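A small sketch of how the chunker behaves, assuming an `<EOT>` delimiter and a throwaway corpus file (the file path here is illustrative). As written, each yielded chunk ends at the last special token found in the current window, and text after the final delimiter stays in the internal residual:

```python
import re
import tempfile
from pathlib import Path

import Project_Model.Libs.BPE as BPE

# Hypothetical scratch corpus: two segments terminated by <EOT>, plus a trailing "cc".
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write("aa<EOT>bb<EOT>cc")
    corpus_path = Path(f.name)

chunker = BPE.NanoSocratesChunker(8, re.compile("<EOT>"))

for chunk in chunker.chunk(corpus_path):
    print(repr(chunk))
# 'aa<EOT>'
# 'bb<EOT>'
# the trailing 'cc' has no closing delimiter and is not yielded
```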
Project_Model/Libs/BPE/Classes/NanoSocratesSpecial.py (new file, 64 lines)

```python
from .Encoder import Encoder
from ..Errors import OutOfDictionaryException


class NanoSocratesSpecial(Encoder):

    def __init__(
        self, bpe_vocabulary_size: int, special_tokens: list[str] = []
    ) -> None:

        super().__init__()

        self.__bpe_offset = bpe_vocabulary_size
        self.__vocabulary: dict[str, int] = {}
        self.__reverse_vocabulary: dict[int, str] = {}

        if len(special_tokens) == 0:
            return

        for index, TOKEN in zip(range(0, len(special_tokens)), special_tokens):

            CANDIDATE_ID = self.__bpe_offset + index + 1
            self.__vocabulary[TOKEN] = CANDIDATE_ID
            self.__reverse_vocabulary[CANDIDATE_ID] = TOKEN

    @property
    def __next_id(self):
        BPE_OFFSET = self.__bpe_offset
        VOC_LENGTH = len(self.__vocabulary)
        return BPE_OFFSET + VOC_LENGTH + 1

    @property
    def vocabulary(self) -> dict[str, int]:
        return self.__vocabulary

    @property
    def reverse_vocabulary(self) -> dict[int, str]:
        return self.__reverse_vocabulary

    def add_special_word_to_vocabulary(self, word: str):
        CANDIDATE_INDEX = self.__next_id
        self.__vocabulary[word] = CANDIDATE_INDEX
        self.__reverse_vocabulary[CANDIDATE_INDEX] = word

    def encode(self, word: str) -> list[int]:
        ID = self.__vocabulary.get(word)

        if ID is None:
            raise OutOfDictionaryException()

        return [ID]

    def decode(self, token_id: list[int]) -> str:

        if len(token_id) != 1:
            raise OutOfDictionaryException()

        ID = token_id[0]
        WORD = self.__reverse_vocabulary.get(ID)

        if WORD is None:
            raise OutOfDictionaryException()

        return WORD
```
Project_Model/Libs/BPE/Classes/NanoSocratesSplitter.py (new file, 98 lines)

```python
import re
from collections import deque
from typing import Generator
from ..Enums import TokenType


class NanoSocratesSplitter:

    def __init__(
        self, special_token_regex: re.Pattern, max_bpe_token_id: int = 255
    ) -> None:
        # attention: the regex is already compiled
        self.__special_token_regex = special_token_regex
        self.__max_bpe_token_id: int = max_bpe_token_id  # used for decoding

    def split_text(self, corpus: str) -> Generator[tuple[str, TokenType]]:
        """Split a text using the given regex
        Args:
            corpus (str): the whole corpus string to split
        Yields:
            Generator[tuple[str, TokenType]]: each step returns a piece of the split text: the string and its TokenType. \n
            TokenType describes whether the string goes to the BPE or is a special token [BPE, SPECIAL]
        """

        bpe_start = 0
        bpe_end = len(corpus)  # this can be deleted!

        for special_token_start, special_token_end in self.__find_boundaries(corpus):

            # FIND BPE
            bpe_end = special_token_start
            BPE_TOKEN_TEXT = corpus[bpe_start:bpe_end]
            if BPE_TOKEN_TEXT != "":
                for WORD in self.__split_words(BPE_TOKEN_TEXT):
                    yield (WORD, TokenType.BPE)

            # FIND SPECIAL TOKEN
            SPECIAL_TOKEN_TEXT = corpus[special_token_start:special_token_end]
            if SPECIAL_TOKEN_TEXT != "":
                yield (SPECIAL_TOKEN_TEXT, TokenType.SPECIAL)

            # now save the new bpe start point
            # it will be used in the next iteration
            bpe_start = special_token_end

    def __find_boundaries(self, corpus: str) -> Generator[tuple[int, int]]:
        """
        Find the start and end (exclusive) of each special token
        Args:
            corpus (str): the string where the special tokens will be searched
        Yields:
            Generator[tuple[int, int]]: note the end is exclusive
        """
        for match in self.__special_token_regex.finditer(corpus):
            start = match.start()
            end = match.end()

            yield (start, end)

        # make the last boundary be the end of corpus
        # eof = len(corpus)
        # yield(eof,eof)

    def __split_words(self, bpe_piece: str) -> Generator[str]:

        END_OF_STRING = len(bpe_piece)
        bound_start = 0
        bound_end = END_OF_STRING + 1
        for i in range(0, END_OF_STRING):

            CANDIDATE_CHAR = bpe_piece[i]

            if CANDIDATE_CHAR != " ":
                continue

            bound_end = i

            yield bpe_piece[bound_start:bound_end]

            bound_start = bound_end
            bound_end = END_OF_STRING + 1

        yield bpe_piece[bound_start:bound_end]

    def split_tokens(self, corpus: list[int]) -> Generator[tuple[list[int], TokenType]]:

        not_special_token_list: list[int] = []
        for token in corpus:
            if token > self.__max_bpe_token_id:

                if len(not_special_token_list) > 0:
                    yield (not_special_token_list, TokenType.BPE)
                    not_special_token_list = []

                yield ([token], TokenType.SPECIAL)
                continue

            not_special_token_list.append(token)
```
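A short sketch of `split_text` in action, assuming the special-token regex is built with `special_regex_maker` from this same package:

```python
import Project_Model.Libs.BPE as BPE

REGEX = BPE.special_regex_maker(["<ABS>", "<END>"])
splitter = BPE.NanoSocratesSplitter(REGEX)

# Special tokens come out whole; the text between them is split on spaces,
# with the separating space kept on the following word.
for piece, kind in splitter.split_text("<ABS>hello world<END>"):
    print(repr(piece), kind)
# '<ABS>' TokenType.SPECIAL
# 'hello' TokenType.BPE
# ' world' TokenType.BPE
# '<END>' TokenType.SPECIAL
```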
Project_Model/Libs/BPE/Classes/TokeNano.py (new file, 8 lines)

```python
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore


class TokeNano:

    def __init__(self):

        pass
```
Project_Model/Libs/BPE/Classes/TokeNanoCore.py (new file, 62 lines)

```python
from pathlib import Path

from ..Classes import NanoSocratesSplitter
from ..Classes import NanoSocratesBPE
from ..Classes import NanoSocratesSpecial

from ..Utils import special_regex_maker
from ..Enums import TokenType


class TokeNanoCore:
    def __init__(
        self,
        bpe_vocabulary: dict[tuple[int, int], int],
        special_token_list: list[str],
        # special_vocabulary: dict[str, int]
    ):

        self.__bpe_encoder = NanoSocratesBPE(bpe_vocabulary)

        SPECIAL_REGEX = special_regex_maker(special_token_list)
        BPE_VOCABULARY_SIZE = self.__bpe_encoder.vocabulary_size

        self.__splitter = NanoSocratesSplitter(SPECIAL_REGEX, BPE_VOCABULARY_SIZE)
        self.__special_encoder = NanoSocratesSpecial(
            BPE_VOCABULARY_SIZE, special_token_list
        )

    def encode(self, corpus: str) -> list[int]:
        output: list[int] = []
        for piece, token_type in self.__splitter.split_text(corpus):

            if token_type == TokenType.SPECIAL:
                ENCODED_PIECE = self.__special_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                ENCODED_PIECE = self.__bpe_encoder.encode(piece)
                output.extend(ENCODED_PIECE)
                continue

        return output

    def decode(self, corpus: list[int]) -> str:
        output_str = ""
        for token, token_type in self.__splitter.split_tokens(corpus):
            # token is a single-element list if special, a list of integers otherwise
            if token_type == TokenType.SPECIAL:
                output_str += self.__special_encoder.decode(token)
                continue

            # slow but clear
            if token_type == TokenType.BPE:
                output_str += self.__bpe_encoder.decode(token)
                continue
        return output_str
```
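A hedged round-trip sketch with `TokeNanoCore`, reusing the toy vocabulary from the tests. Special-token IDs are allocated right after the BPE range, and the input ends with a special token because `split_tokens` only flushes a pending BPE run when it reaches one:

```python
import Project_Model.Libs.BPE as BPE

VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
SPECIALS = ["<RDF2TXT>", "<END>"]

tokenizer = BPE.TokeNanoCore(VOCABULARY, SPECIALS)

ids = tokenizer.encode("<RDF2TXT>ab ab<END>")
print(ids)                    # e.g. [260, 256, 32, 256, 261] with this toy vocabulary
print(tokenizer.decode(ids))  # "<RDF2TXT>ab ab<END>"
```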
Project_Model/Libs/BPE/Classes/__init__.py (new file, 18 lines)

```python
from .NanoSocratesChunker import NanoSocratesChunker
from .NanoSocratesSplitter import NanoSocratesSplitter
from .NanoSocratesBPE import NanoSocratesBPE, NanoSocratesBatchMemoryBPE
from .NanoSocraTrainer import NanoSocraTrainer
from .NanoSocraTraineRam import NanoSocraTraineRam
from .NanoSocraTrainerPool import NanoSocraTrainerPool
from .NanoSocratesSpecial import NanoSocratesSpecial
from .TokeNanoCore import TokeNanoCore


__all__ = [
    "NanoSocratesChunker",
    "NanoSocratesSplitter",
    "NanoSocratesBPE",
    "NanoSocraTrainer",
    "NanoSocraTraineRam",
    "NanoSocraTrainerPool",
    "TokeNanoCore"
]
```
Project_Model/Libs/BPE/Enums/SpecialToken.py (new file, 21 lines)

```python
from enum import Enum


class SpecialToken(Enum):
    # (Enum, str) -> throws an error
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    CORPUS_END = "<END>"

    ## Tasks' Token
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"

    # BPE Training:
```
Project_Model/Libs/BPE/Enums/TokenType.py (new file, 6 lines)

```python
from enum import Enum, auto


class TokenType(Enum):

    SPECIAL = auto()
    BPE = auto()
```
Project_Model/Libs/BPE/Enums/__init__.py (new file, 1 line)

```python
from .TokenType import TokenType
```
Project_Model/Libs/BPE/Errors/DelimiterNotFoundException.py (new file, 4 lines)

```python
class DelimiterNotFoundException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/DuplicateWordException.py (new file, 4 lines)

```python
class DuplicateWordException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/OutOfDictionaryException.py (new file, 4 lines)

```python
class OutOfDictionaryException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/SentenceTooLongException.py (new file, 4 lines)

```python
class SentenceTooLongException(Exception):

    def __init__(self, *args: object) -> None:
        super().__init__(*args)
```
Project_Model/Libs/BPE/Errors/__init__.py (new file, 11 lines)

```python
from .DelimiterNotFoundException import DelimiterNotFoundException
from .OutOfDictionaryException import OutOfDictionaryException
from .DuplicateWordException import DuplicateWordException
from .SentenceTooLongException import SentenceTooLongException


__all__ = [
    "DelimiterNotFoundException",
    "OutOfDictionaryException",
    "DuplicateWordException",
    "SentenceTooLongException"
]
```
Project_Model/Libs/BPE/Utils/__init__.py (new file, 13 lines)

```python
from .special_regex_maker import special_regex_maker
from .lag_checker_iterator import iterator_with_checks
from .vocabulary import save_nanos_vocabulary, load_nanos_vocabulary
from .json_utils import save_json, load_json
from .special_regex_maker import special_regex_maker


__all__ = [
    "special_regex_maker",
    "iterator_with_checks",
    "save_nanos_vocabulary",
    "load_nanos_vocabulary",
    "save_json", "load_json"
]
```
Project_Model/Libs/BPE/Utils/json_utils.py (new file, 18 lines)

```python
import json
from pathlib import Path


def save_json(dictionary: dict, path: Path):

    json_string = json.dumps(dictionary)
    FILE = open(path, "w")
    FILE.write(json_string)
    FILE.close()


def load_json(path: Path) -> dict:
    FILE = open(path, "r")
    json_string = FILE.read()
    FILE.close()

    return json.loads(json_string)
```
Project_Model/Libs/BPE/Utils/lag_checker_iterator.py (new file, 27 lines)

```python
from collections import deque
from typing import Generator, TypeVar

T1 = TypeVar("T1")
T2 = TypeVar("T2")
T3 = TypeVar("T3")


def iterator_with_checks(
    generator: Generator[T1, T2, T3],
) -> Generator[tuple[T1, bool], T2, T3]:

    # Here we can ignore catching StopIteration:
    # we will propagate it
    last_element = next(generator)

    while True:

        RETURN_ELEMENT = last_element
        try:
            element = next(generator)
            last_element = element
            yield (RETURN_ELEMENT, False)

        except StopIteration:
            yield (RETURN_ELEMENT, True)
            break
```
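`iterator_with_checks` wraps a generator and flags its final element, which is how the trainers know when a batch is the last one. A minimal sketch:

```python
from Project_Model.Libs.BPE.Utils import iterator_with_checks

# The wrapper looks one element ahead, so the True flag arrives with the last item.
for value, is_last in iterator_with_checks(x for x in [1, 2, 3]):
    print(value, is_last)
# 1 False
# 2 False
# 3 True
```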
Project_Model/Libs/BPE/Utils/special_regex_maker.py (new file, 15 lines)

```python
import re


def special_regex_maker(special_tokens: list[str]) -> re.Pattern:
    """Compile a regex for the special tokens
    Args:
        special_tokens (list[str]): the list of special tokens

    Returns:
        re.Pattern:
    """

    REGEX_STR = "|".join(special_tokens)

    return re.compile(REGEX_STR)
```
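A quick sketch of the regex builder. Note that the tokens are joined with `|` verbatim rather than passed through `re.escape`, which is fine for markers like `<SOT>` that contain no regex metacharacters:

```python
from Project_Model.Libs.BPE.Utils import special_regex_maker

REGEX = special_regex_maker(["<SOT>", "<EOT>"])

print(REGEX.pattern)                                           # <SOT>|<EOT>
print([m.group() for m in REGEX.finditer("<SOT>a b<EOT>")])    # ['<SOT>', '<EOT>']
```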
Project_Model/Libs/BPE/Utils/vocabulary.py (new file, 49 lines)

```python
import json
from pathlib import Path
from ..Errors import OutOfDictionaryException


def nanos_vocabulary2json_str(vocabulary: dict[tuple[int, int], int]) -> str:

    JSON: dict[str, int] = {}

    for key, item in vocabulary.items():
        TUPLE_STR = f"{key}"
        JSON[TUPLE_STR] = item

    return json.dumps(JSON)


def nanos_json_str2vocabulary(json_string: str) -> dict[tuple[int, int], int]:

    JSON: dict[str, int] = json.loads(json_string)
    VOCABULARY: dict[tuple[int, int], int] = {}

    for key, item in JSON.items():
        REDUCED_KEY = len(key) - 1
        KEY_STR = key[1:REDUCED_KEY]
        VOC_KEY = tuple(map(int, KEY_STR.split(",")))

        if len(VOC_KEY) != 2:
            raise OutOfDictionaryException()

        # Checked for weird things above
        VOCABULARY[VOC_KEY] = item  # type: ignore

    return VOCABULARY


def save_nanos_vocabulary(vocabulary: dict[tuple[int, int], int], path: Path):

    json_string = nanos_vocabulary2json_str(vocabulary)
    FILE = open(path, "w")
    FILE.write(json_string)
    FILE.close()


def load_nanos_vocabulary(path: Path) -> dict[tuple[int, int], int]:
    FILE = open(path, "r")
    json_string = FILE.read()
    FILE.close()

    return nanos_json_str2vocabulary(json_string)
```
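The vocabulary serializer stringifies each `(left, right)` tuple key, and the loader parses it back by stripping the parentheses and splitting on the comma. A small round-trip sketch using a temporary file (the path is illustrative):

```python
from pathlib import Path
import tempfile

from Project_Model.Libs.BPE.Utils import save_nanos_vocabulary, load_nanos_vocabulary

VOCABULARY = {(97, 98): 256, (256, 256): 257}

# Hypothetical scratch location; any writable path works.
cache_path = Path(tempfile.gettempdir()) / "nanos-voc-demo.json"

save_nanos_vocabulary(VOCABULARY, cache_path)
print(load_nanos_vocabulary(cache_path))  # {(97, 98): 256, (256, 256): 257}
```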
Project_Model/Libs/BPE/__init__.py (new file, 9 lines)

```python
from .Classes import *
from .Enums import *
from .Errors import *
from .Utils import *

from . import Classes
from . import Enums
from . import Errors
from . import Utils
```
Project_Model/Libs/__init__.py (new file, 1 line)

```python
from . import BPE
```
74
Project_Model/Tests/bpe_test.py
Normal file
74
Project_Model/Tests/bpe_test.py
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
from Project_Model.Libs.BPE.Enums import TokenType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
class TestBPE:
|
||||||
|
|
||||||
|
def test_bpe_encoding_simple(self):
|
||||||
|
|
||||||
|
TEXT = "abababab"
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
EXPECTED = [258]
|
||||||
|
|
||||||
|
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
|
||||||
|
|
||||||
|
ENCODED = BPE_ENCODER.encode(TEXT)
|
||||||
|
|
||||||
|
assert len(ENCODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(ENCODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
def test_bpe_decoding_simple(self):
|
||||||
|
|
||||||
|
|
||||||
|
INPUT = [258]
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
EXPECTED = "abababab"
|
||||||
|
|
||||||
|
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
|
||||||
|
|
||||||
|
DECODED = BPE_ENCODER.decode(INPUT)
|
||||||
|
|
||||||
|
assert len(DECODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(DECODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
def test_bpe_decoding_edge_1(self):
|
||||||
|
|
||||||
|
|
||||||
|
INPUT = [258, ord("c")]
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
EXPECTED = "ababababc"
|
||||||
|
|
||||||
|
BPE_ENCODER = BPE.NanoSocratesBPE(VOCABULARY)
|
||||||
|
|
||||||
|
DECODED = BPE_ENCODER.decode(INPUT)
|
||||||
|
|
||||||
|
assert len(DECODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(DECODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
# Useful to debug weird cases
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# TestBPE().test_bpe_decoding_simple()
|
||||||
|
TestBPE().test_bpe_encoding_simple()
|
||||||
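The comments in the tests above encode a three-step merge chain: `(a, b) -> 256`, `(256, 256) -> 257`, `(257, 257) -> 258`, so `"abababab"` collapses to the single id 258. Below is a hand-trace of that chain; it is illustrative only and does not re-implement `NanoSocratesBPE`.

```python
# Hand-trace of the merges the test vocabulary implies for "abababab".
text = "abababab"
tokens = [ord(c) for c in text]          # [97, 98, 97, 98, 97, 98, 97, 98]
merges = {(97, 98): 256, (256, 256): 257, (257, 257): 258}

def merge_once(seq, table):
    # Greedy left-to-right pass: replace each known adjacent pair with its id.
    out, i = [], 0
    while i < len(seq):
        pair = tuple(seq[i:i + 2])
        if len(pair) == 2 and pair in table:
            out.append(table[pair])
            i += 2
        else:
            out.append(seq[i])
            i += 1
    return out

while True:
    merged = merge_once(tokens, merges)
    if merged == tokens:
        break
    tokens = merged

print(tokens)  # [258]
```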
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
77
Project_Model/Tests/bpe_trainer_test.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from Project_Model.Libs.BPE.Enums import TokenType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
CACHE_DIR_PATH = Path("Project_Model/Tests/trainer_files/cache/pool-cache.json")
|
||||||
|
|
||||||
|
class TestTrainBPE:
|
||||||
|
|
||||||
|
def test_bpe_train_encoding_simple(self):
|
||||||
|
|
||||||
|
TRAINER = BPE.NanoSocraTrainerPool(
|
||||||
|
int(32E3),
|
||||||
|
["<SOT>", "<EOT>"]
|
||||||
|
)
|
||||||
|
|
||||||
|
TEXT = "abababab"
|
||||||
|
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_simple.txt")
|
||||||
|
|
||||||
|
EXPECTED = [258]
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
BPE_ENCODER = TRAINER.trainBPE(
|
||||||
|
TEXT_PATH,
|
||||||
|
CACHE_DIR_PATH
|
||||||
|
)
|
||||||
|
|
||||||
|
ENCODED = BPE_ENCODER.encode(TEXT)
|
||||||
|
|
||||||
|
assert len(ENCODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for encoded, expected in zip(ENCODED, EXPECTED):
|
||||||
|
assert encoded == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_bpe_train_encoding_and_decoding(self):
|
||||||
|
|
||||||
|
SPECIAL_LIST = ["<ABS>", "<SOTL>"]
|
||||||
|
TRAINER = BPE.NanoSocraTrainerPool(
|
||||||
|
int(32E3),
|
||||||
|
SPECIAL_LIST
|
||||||
|
)
|
||||||
|
|
||||||
|
TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
|
||||||
|
FILE = open(TEXT_PATH)
|
||||||
|
TEXT = FILE.read()
|
||||||
|
FILE.close()
|
||||||
|
|
||||||
|
EXPECTED = TEXT
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
BPE_ENCODER = TRAINER.trainBPE(
|
||||||
|
TEXT_PATH,
|
||||||
|
CACHE_DIR_PATH
|
||||||
|
)
|
||||||
|
VOCABULARY = BPE_ENCODER.vocabulary
|
||||||
|
TOKENANO = BPE.TokeNanoCore(VOCABULARY,SPECIAL_LIST)
|
||||||
|
|
||||||
|
ENCODED = TOKENANO.encode(TEXT)
|
||||||
|
DECODED = TOKENANO.decode(ENCODED)
|
||||||
|
|
||||||
|
assert len(DECODED) == len(EXPECTED)
|
||||||
|
|
||||||
|
for decoded, expected in zip(DECODED, EXPECTED):
|
||||||
|
assert decoded == expected
|
||||||
|
|
||||||
|
# Useful to debug weird cases
|
||||||
|
if __name__ == "__main__":
|
||||||
|
# TestTrainBPE().test_bpe_train_encoding_simple()
|
||||||
|
TestTrainBPE().test_bpe_train_encoding_and_decoding()
|
||||||
4
Project_Model/Tests/chunker_files/edge-1.txt
Normal file
4
Project_Model/Tests/chunker_files/edge-1.txt
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
<SOT>Lorem <SEP>ipsu<SEP>m d<SEP>olor s<SEP>it ame<SEP>t,
|
||||||
|
<SEP>conse<SEP>cte<SEP>tur adip<SEP>iscin<SEP>g elit.
|
||||||
|
<SEP>Aenean a<SEP>t dui he<SEP>ndrer<SEP>it an<SEP>te soll<SEP>icitud
|
||||||
|
<SEP>in sce<SEP>lerisque<EOT>
|
||||||
2
Project_Model/Tests/chunker_files/simple.txt
Normal file
2
Project_Model/Tests/chunker_files/simple.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
|
||||||
|
<SEP>Aenean at dui <SEP>hendrerit ante <SEP>sollicitudin <SEP>scelerisque<EOT>
|
||||||
3
Project_Model/Tests/chunker_files/stress.txt
Normal file
3
Project_Model/Tests/chunker_files/stress.txt
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
<SOT>Lorem ipsum <SEP>dolor sit amet<SEP>, consectetur <SEP>adipiscing elit.
|
||||||
|
<SEP>Aenean at dui <SEP>hendrerit an te <SEP>sollicitudin <SEP>scelerisque
|
||||||
|
<SEP>dsdsasssdfdsdsfkjddsnfkjdsnfkjdnsjkfndf<EOT>
|
||||||
89
Project_Model/Tests/chunker_test.py
Normal file
89
Project_Model/Tests/chunker_test.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import re
|
||||||
|
import pytest
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
|
||||||
|
SYMBOL_REGEX = re.compile(PATTERN)
|
||||||
|
|
||||||
|
class TestChunker:
|
||||||
|
|
||||||
|
def test_correct_simple(self):
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
|
||||||
|
LEAST_EXPECTED_CHUNKS = 3
|
||||||
|
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
CHUNKS = []
|
||||||
|
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(chunk)
|
||||||
|
CHUNKS.append(
|
||||||
|
chunk
|
||||||
|
)
|
||||||
|
|
||||||
|
NANO_TEXT = "".join(CHUNKS)
|
||||||
|
|
||||||
|
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
|
||||||
|
assert NANO_TEXT == ORIG_TEXT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_correct_edge_1(self):
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/edge-1.txt")
|
||||||
|
LEAST_EXPECTED_CHUNKS = 3
|
||||||
|
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(15, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
CHUNKS = []
|
||||||
|
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(chunk)
|
||||||
|
CHUNKS.append(
|
||||||
|
chunk
|
||||||
|
)
|
||||||
|
|
||||||
|
NANO_TEXT = "".join(CHUNKS)
|
||||||
|
|
||||||
|
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
|
||||||
|
assert NANO_TEXT == ORIG_TEXT
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def test_throwing(self):
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/simple.txt")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(5, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
with pytest.raises(BPE.DelimiterNotFoundException):
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(chunk)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
|
||||||
|
FILE_PATH = Path("Project_Model/Tests/chunker_files/stress.txt")
|
||||||
|
LEAST_EXPECTED_CHUNKS = 3
|
||||||
|
ORIG_TEXT = FILE_PATH.read_text(encoding="utf-8")
|
||||||
|
|
||||||
|
CHUNKER = BPE.NanoSocratesChunker(40, SYMBOL_REGEX)
|
||||||
|
|
||||||
|
CHUNKS = []
|
||||||
|
|
||||||
|
try:
|
||||||
|
for chunk in CHUNKER.chunk(FILE_PATH):
|
||||||
|
print(f"CHUNK START:\n{chunk} - {len(chunk)}\nCHUNK END\n")
|
||||||
|
CHUNKS.append(
|
||||||
|
chunk
|
||||||
|
)
|
||||||
|
except:
|
||||||
|
exit(0)
|
||||||
|
|
||||||
|
NANO_TEXT = "".join(CHUNKS)
|
||||||
|
|
||||||
|
assert len(CHUNKS) > (LEAST_EXPECTED_CHUNKS - 1)
|
||||||
|
assert NANO_TEXT == ORIG_TEXT
|
||||||
182
Project_Model/Tests/splitter_test.py
Normal file
182
Project_Model/Tests/splitter_test.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
from Project_Model.Libs.BPE.Enums import TokenType
|
||||||
|
import Project_Model.Libs.BPE as BPE
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
PATTERN = "<(TOKEN|SOT|SEP|EOT)>"
|
||||||
|
SYMBOL_REGEX = re.compile(PATTERN)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSplitter:
|
||||||
|
|
||||||
|
def test_split(self):
|
||||||
|
|
||||||
|
TEXT = "<SOT>Lorem <SEP>"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("<SOT>", TokenType.SPECIAL),
|
||||||
|
("Lorem", TokenType.BPE),
|
||||||
|
(" ", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_trailing_text(self):
|
||||||
|
|
||||||
|
TEXT = "ipsu<SEP>m d<SEP>olor"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("ipsu", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("m", TokenType.BPE),
|
||||||
|
(" d", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
# ("olor", TokenType.BPE)
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_multi_token(self):
|
||||||
|
|
||||||
|
TEXT = "ipsu<SEP>m d<SEP><SEP><SEP>dsg<SEP>olor"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("ipsu", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("m", TokenType.BPE),
|
||||||
|
(" d", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
("dsg", TokenType.BPE),
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_malformed_1(self):
|
||||||
|
|
||||||
|
TEXT = "<SEP>lerisque"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
("<SEP>", TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_malformed_2(self):
|
||||||
|
|
||||||
|
TEXT = "lerisque"
|
||||||
|
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX)
|
||||||
|
|
||||||
|
EXPECTED_CHUNKS = []
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_text(TEXT))
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_token_decode_simple(self):
|
||||||
|
# to test the token split into special and bpe
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
|
||||||
|
token_list = [100, 101, 1477]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_tokens(token_list))
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
([100, 101], TokenType.BPE),
|
||||||
|
([1477], TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
def test_split_token_decode_simple_malformed(self):
|
||||||
|
# to test the token split into special and bpe
|
||||||
|
SPLITTER = BPE.NanoSocratesSplitter(SYMBOL_REGEX, max_bpe_token_id=1473)
|
||||||
|
token_list = [100, 101, 1477, 100]
|
||||||
|
|
||||||
|
CHUNKS = list(SPLITTER.split_tokens(token_list))
|
||||||
|
EXPECTED_CHUNKS = [
|
||||||
|
([100, 101], TokenType.BPE),
|
||||||
|
([1477], TokenType.SPECIAL),
|
||||||
|
]
|
||||||
|
|
||||||
|
assert len(CHUNKS) == len(EXPECTED_CHUNKS)
|
||||||
|
|
||||||
|
for chunk, expected_chunk in zip(EXPECTED_CHUNKS, CHUNKS):
|
||||||
|
print(f"TEST:\n\tCHUNK:\t\t{chunk}\n\tEXPECTED:\t\t{expected_chunk}")
|
||||||
|
RECEIVED_TOKEN_STRING, RECEIVED_TOKEN_TYPE = chunk
|
||||||
|
EXPECTED_TOKEN_STRING, EXPECTED_TOKEN_TYPE = expected_chunk
|
||||||
|
|
||||||
|
assert RECEIVED_TOKEN_STRING == EXPECTED_TOKEN_STRING
|
||||||
|
assert RECEIVED_TOKEN_TYPE == EXPECTED_TOKEN_TYPE
|
||||||
|
|
||||||
|
|
||||||
|
# Useful to debug weird cases
|
||||||
|
if __name__ == "__main__":
|
||||||
|
TestSplitter().test_split_trailing_text()
|
||||||
21
Project_Model/Tests/tokenano_test.py
Normal file
21
Project_Model/Tests/tokenano_test.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
|
||||||
|
from Project_Model.Libs.BPE.Classes.TokeNanoCore import TokeNanoCore
|
||||||
|
|
||||||
|
class TestTokeNano:
|
||||||
|
|
||||||
|
def test_decode_encode_simple(self):
|
||||||
|
TEXT = "<SOT>abababab<EOT>"
|
||||||
|
|
||||||
|
# ab = 256
|
||||||
|
# 256, 256 = 257
|
||||||
|
# 257, 257 = 258
|
||||||
|
|
||||||
|
VOCABULARY = {(ord("a"), ord("b")): 256, (256, 256): 257, (257, 257): 258}
|
||||||
|
# EXPECTED = [258]
|
||||||
|
|
||||||
|
TOKE_NANO = TokeNanoCore(VOCABULARY, ["<SOT>", "<EOT>"])
|
||||||
|
|
||||||
|
ENCODED = TOKE_NANO.encode(TEXT)
|
||||||
|
DECODED = TOKE_NANO.decode(ENCODED)
|
||||||
|
|
||||||
|
assert TEXT == DECODED
|
||||||
0
Project_Model/Tests/trainer_files/cache/.gitkeep
vendored
Normal file
0
Project_Model/Tests/trainer_files/cache/.gitkeep
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>
|
||||||
1
Project_Model/Tests/trainer_files/train_simple.txt
Normal file
1
Project_Model/Tests/trainer_files/train_simple.txt
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<SOT>abababab<EOT>
|
||||||
695
Project_Model/UML/bpe.excalidraw.json
Normal file
695
Project_Model/UML/bpe.excalidraw.json
Normal file
@@ -0,0 +1,695 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "EcT-dGsjmfW571ov8Gg4F",
|
||||||
|
"type": "text",
|
||||||
|
"x": 425.5,
|
||||||
|
"y": 132,
|
||||||
|
"width": 506,
|
||||||
|
"height": 425,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"4rCC2-N1thmII8_dwNhe1"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3V",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 523521109,
|
||||||
|
"version": 883,
|
||||||
|
"versionNonce": 1590682729,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "OA_NKjb3n3NLtUo_tKmPS",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758881654155,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesBPE(Encoder):\n - vocabulary: Vocabulary\n\n + fit(\n data: [[int]], \n memory: NanoSocratesBPE_BatchMemory,\n last_sentence_chunk: bool, \n last_batch: bool\n ) -> (Self, NanoSocratesBPE_BatchMemory)\n\n + encode(word: [byte]) -> [int]\n\n + decode(token: [int]) -> [byte]\n\n + get_vocabulary_size() -> int\n \n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "74i4oK-JpcM4CgAqhz_x_",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 382.5,
|
||||||
|
"y": 104.5,
|
||||||
|
"width": 592.5,
|
||||||
|
"height": 421,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"4rCC2-N1thmII8_dwNhe1"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a4",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 50827893,
|
||||||
|
"version": 319,
|
||||||
|
"versionNonce": 704459557,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758878226277,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "s8I1JoKulE3Vnti9a374p",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1113.5,
|
||||||
|
"y": 127,
|
||||||
|
"width": 517,
|
||||||
|
"height": 325,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"M6w9efVFwOZHkJGgwkyEw"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a5",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2091174261,
|
||||||
|
"version": 480,
|
||||||
|
"versionNonce": 1964948039,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758881941367,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Vocabulary:\n\n - vocabulary: dict<(int, int), int>\n - reverse_vocabulary: dict<int, (int, int)>\n\n + size -> int\n\n + add_word(int) -> int\n + encode(int) -> int\n + decode(int) -> int\n \n\n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "BY_Why7XDNftdMzPcwjVZ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 1086.5,
|
||||||
|
"y": 105.5,
|
||||||
|
"width": 593.0000000000001,
|
||||||
|
"height": 325.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"M6w9efVFwOZHkJGgwkyEw"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a6",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 153939611,
|
||||||
|
"version": 234,
|
||||||
|
"versionNonce": 2068149129,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "WcDks9DR8UqeZEaxAcRf9",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758881945661,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "JCPDhuTKRx4MN950Q3jL-",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1116.411067193676,
|
||||||
|
"y": 477.3809288774704,
|
||||||
|
"width": 416.74578857421875,
|
||||||
|
"height": 99.70355731225297,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"DbtlKVF_9SjH2-9iMq9zy"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a7",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1326854235,
|
||||||
|
"version": 479,
|
||||||
|
"versionNonce": 595084597,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758902358518,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
|
||||||
|
"fontSize": 19.940711462450594,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocrateBPE_BatchMemory:\n\n + frequency: dict<(int, int), int>\n + merge_treshold: int",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "l-O0rMS3SruV22_MPX9Jz",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 1086.5,
|
||||||
|
"y": 451.4580039762846,
|
||||||
|
"width": 593,
|
||||||
|
"height": 208.0419960474308,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [
|
||||||
|
"DbtlKVF_9SjH2-9iMq9zy"
|
||||||
|
],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a8",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1490898171,
|
||||||
|
"version": 305,
|
||||||
|
"versionNonce": 587306139,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "OA_NKjb3n3NLtUo_tKmPS",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758902358518,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "WcDks9DR8UqeZEaxAcRf9",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 773.5,
|
||||||
|
"y": 167,
|
||||||
|
"width": 297.17936724485867,
|
||||||
|
"height": 30,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"seed": 1681364149,
|
||||||
|
"version": 303,
|
||||||
|
"versionNonce": 1262492265,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758881945661,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
144.5,
|
||||||
|
-1.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
177.5,
|
||||||
|
-30
|
||||||
|
],
|
||||||
|
[
|
||||||
|
297.17936724485867,
|
||||||
|
-29.020420978562214
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "BY_Why7XDNftdMzPcwjVZ",
|
||||||
|
"focus": 0.77319587628866,
|
||||||
|
"gap": 18.25
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "OA_NKjb3n3NLtUo_tKmPS",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 946.0000000000002,
|
||||||
|
"y": 274.95951048200493,
|
||||||
|
"width": 130.016707976343,
|
||||||
|
"height": 209.36808480159067,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aD",
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"seed": 1871768059,
|
||||||
|
"version": 1039,
|
||||||
|
"versionNonce": 213535035,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758902358519,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
54.99999999999977,
|
||||||
|
12.54048951799507
|
||||||
|
],
|
||||||
|
[
|
||||||
|
69.49999999999977,
|
||||||
|
188.54048951799507
|
||||||
|
],
|
||||||
|
[
|
||||||
|
130.016707976343,
|
||||||
|
209.36808480159067
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "EcT-dGsjmfW571ov8Gg4F",
|
||||||
|
"focus": -0.48312180762055096,
|
||||||
|
"gap": 14.500000000000114
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "l-O0rMS3SruV22_MPX9Jz",
|
||||||
|
"focus": -0.16742658425737647,
|
||||||
|
"gap": 11.194126334166185
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "snZ__VDsIlri6NTp8M2Gf",
|
||||||
|
"type": "text",
|
||||||
|
"x": -245.25,
|
||||||
|
"y": 103,
|
||||||
|
"width": 330,
|
||||||
|
"height": 125,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aE",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1758461093,
|
||||||
|
"version": 265,
|
||||||
|
"versionNonce": 1069481861,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758879566916,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesBPETrainer:\n\n - max_iterations: int\n - max_vocabulary_size: int\n - merge_treshold: int",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "PnbmqwEWYkP8oXElKFyTp",
|
||||||
|
"type": "text",
|
||||||
|
"x": -237.75,
|
||||||
|
"y": 544,
|
||||||
|
"width": 561,
|
||||||
|
"height": 125,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 501304683,
|
||||||
|
"version": 241,
|
||||||
|
"versionNonce": 1306401003,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758878748210,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesSplitter:\n + splitter_regex: regex\n\n + split_text(corpus: str) -> [(str, TokenType)]\n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "xR_11IzgXX5O-m6WoRfCL",
|
||||||
|
"type": "text",
|
||||||
|
"x": -233.25,
|
||||||
|
"y": 366.5,
|
||||||
|
"width": 165,
|
||||||
|
"height": 75,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aI",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2025585125,
|
||||||
|
"version": 395,
|
||||||
|
"versionNonce": 1799178985,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758883940168,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "enum TokenType:\n + SPECIAL\n + BPE",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "enum TokenType:\n + SPECIAL\n + BPE",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "lgKSd9qCb94-5e8rd9I3r",
|
||||||
|
"type": "text",
|
||||||
|
"x": -219.75,
|
||||||
|
"y": 764.5,
|
||||||
|
"width": 462,
|
||||||
|
"height": 275,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aJ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1963214021,
|
||||||
|
"version": 464,
|
||||||
|
"versionNonce": 1104453739,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759053302739,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class TokeNanoCore:\n\n - splitter: NanoSocratesSplitter\n - bpe_encoder: NanoSocratesBPE\n - special_encoder: NanoSocratesSpecial\n\n + encode(corpus: str) -> [int]\n\n - encode_special(piece: str) -> int\n\n - encode_bpe(piece: str) -> [int]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "DwFJoUpVT2YAEe9qPYAXa",
|
||||||
|
"type": "text",
|
||||||
|
"x": 496.75,
|
||||||
|
"y": 666,
|
||||||
|
"width": 440,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1317596203,
|
||||||
|
"version": 152,
|
||||||
|
"versionNonce": 1840679687,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758880107704,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesSpecial(Encoder):\n\n + vocabulary: dict<str, int>\n + reverse_vocabulary: dict<int, str>",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "78gC46xatoO1_cRtaN8EC",
|
||||||
|
"type": "text",
|
||||||
|
"x": 396.375,
|
||||||
|
"y": -107.75,
|
||||||
|
"width": 396,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1187595241,
|
||||||
|
"version": 130,
|
||||||
|
"versionNonce": 1273030504,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759070012771,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Encoder(ABC):\n\n + encode(corpus: str) -> [int]\n + decode(encoded: [int]) -> str ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "3j50Ds74uU7oXoJ9kMOYJ",
|
||||||
|
"type": "text",
|
||||||
|
"x": 457.375,
|
||||||
|
"y": 903.75,
|
||||||
|
"width": 949.7594604492188,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aN",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1994335529,
|
||||||
|
"version": 198,
|
||||||
|
"versionNonce": 1492696519,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758882694747,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "@@mamma@@è bell^^issima e @@^^le voglio molto b^^ene @--@ replit ^^ è molto ^^bello e^^ lo amo",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "yg-TvQvz4MwJZ0y8K7Ix0",
|
||||||
|
"type": "text",
|
||||||
|
"x": 435.375,
|
||||||
|
"y": 1026.25,
|
||||||
|
"width": 352,
|
||||||
|
"height": 250,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aP",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1877486407,
|
||||||
|
"version": 344,
|
||||||
|
"versionNonce": 25830153,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758883468886,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class NanoSocratesChunker:\n\n - max_bytes: int\n - max_special_length: int\n - special_token_regex: regex\n\n - residuals: str\n\n # This must be an iterator\n + read(path: Path) -> str",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "2UXjWdE_jMcsCE2oQgTXn",
|
||||||
|
"type": "text",
|
||||||
|
"x": -334.75,
|
||||||
|
"y": 1112.5,
|
||||||
|
"width": 165,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aQ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 700532363,
|
||||||
|
"version": 76,
|
||||||
|
"versionNonce": 1671597672,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759070020002,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class TokeNano:",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class TokeNano:",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
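The UML above describes an `Encoder` abstract base class that both `NanoSocratesBPE` and `NanoSocratesSpecial` implement. A minimal sketch of that interface, following the diagram rather than the actual library source:

```python
from abc import ABC, abstractmethod

# Interface as drawn in the UML; signatures are taken from the diagram,
# not from the project's real code.
class Encoder(ABC):

    @abstractmethod
    def encode(self, corpus: str) -> list[int]:
        """Turn text into a list of token ids."""

    @abstractmethod
    def decode(self, encoded: list[int]) -> str:
        """Turn a list of token ids back into text."""
```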
21
README.md
21
README.md
@@ -12,11 +12,30 @@ Create and activate you Conda enviroment with:
|
|||||||
|
|
||||||
conda env create -f environment.yaml
|
conda env create -f environment.yaml
|
||||||
conda activate deep_learning
|
conda activate deep_learning
|
||||||
|
|
||||||
Now install dependencies on pip:
|
Now install dependencies on pip:
|
||||||
|
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
Add the following to .vscode/settings.json
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
// For linux
|
||||||
|
"terminal.integrated.env.linux": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
// For OSX
|
||||||
|
"terminal.integrated.env.osx": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
// For Windows
|
||||||
|
"terminal.integrated.env.windows": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
Sometimes, when uploading a really large batch of data, git can abort the upload due to a timeout.
|
Sometimes, when uploading a really large batch of data, git can abort the upload due to a timeout.
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
|
|
||||||
class Debug_csv():
|
|
||||||
def __init__(self, output_path:str):
|
|
||||||
|
|
||||||
|
|
||||||
self.output = open(output_path, "w")
|
|
||||||
# then the first row as header
|
|
||||||
header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
|
||||||
self.output.write(",".join(header) + "\n")
|
|
||||||
|
|
||||||
def close(self):
|
|
||||||
self.output.close()
|
|
||||||
|
|
||||||
def write(self, RDF: pd.DataFrame):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
|
||||||
"""
|
|
||||||
|
|
||||||
RDF.to_csv(self.output, index=False, header=False)
|
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
# do not worry about circular dependencies, this class will never call something else
|
# do not worry about circular dependencies, this class will never call something else
|
||||||
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
from Scripts.DataCleaning.filter import PipelineApplier
|
||||||
|
|
||||||
class RDF_mask_task_dataset():
|
class RDF_mask_task_dataset():
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ class PipelineApplier():
|
|||||||
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
|
"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""
|
||||||
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
|
return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]
|
||||||
|
|
||||||
# def filter_movie_by_rel_uri_frequence()
|
|
||||||
|
|
||||||
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
|
def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):
|
||||||
"""
|
"""
|
||||||
@@ -74,6 +73,10 @@ class PipelineApplier():
|
|||||||
return RDF
|
return RDF
|
||||||
|
|
||||||
|
|
||||||
|
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
|
||||||
|
end = min(len(self.MOVIE_FILTER), ending_offset)
|
||||||
|
self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()
|
||||||
|
|
||||||
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
||||||
# dataset has SubjectURI RelationshipURI ObjectURI
|
# dataset has SubjectURI RelationshipURI ObjectURI
|
||||||
# want to drop the '' in them
|
# want to drop the '' in them
|
||||||
@@ -183,9 +186,3 @@ class PipelineApplier():
|
|||||||
# as input two dataframe, one with 2 column
|
# as input two dataframe, one with 2 column
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
|
|
||||||
RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
|
|
||||||
.str.replace(r"\r?\n+", ", ", regex=True) # newlines -> ", "
|
|
||||||
.str.replace(r"\*", "", regex=True)) # delete all asterisks
|
|
||||||
|
|
||||||
return RDF
|
|
||||||
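The removed `regex_on_objects` helper above normalises the `ObjectURI` column: runs of newlines become `", "` and asterisks are stripped. A tiny pandas check of those two replacements:

```python
import pandas as pd

# Same two substitutions as regex_on_objects above, on a toy frame.
RDF = pd.DataFrame({"ObjectURI": ["line one\nline two", "*starred*"]})
RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                    .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
                    .str.replace(r"\*", "", regex=True))        # delete all asterisks

print(RDF["ObjectURI"].tolist())  # ['line one, line two', 'starred']
```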
@@ -1,29 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
|
|
||||||
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
|
|
||||||
# 1) Read and shuffle rows with a fixed seed for reproducibility
|
|
||||||
df = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)
|
|
||||||
|
|
||||||
# 2) Turn the three inputs into proportions relative to their sum
|
|
||||||
total = train + val + test # eheh you got it there :p
|
|
||||||
n = len(df)
|
|
||||||
n_train = int(n * train / total) # floor to keep indices integral
|
|
||||||
n_val = int(n * val / total)
|
|
||||||
# 3) Give the remainder to test to ensure every row is assigned
|
|
||||||
# (this naturally absorbs any rounding loss)
|
|
||||||
train_df = df.iloc[:n_train].reset_index(drop=True)
|
|
||||||
val_df = df.iloc[n_train:n_train + n_val].reset_index(drop=True)
|
|
||||||
test_df = df.iloc[n_train + n_val:].reset_index(drop=True)
|
|
||||||
|
|
||||||
return train_df, val_df, test_df
|
|
||||||
|
|
||||||
# usage:
|
|
||||||
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
|
|
||||||
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
|
|
||||||
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
|
|
||||||
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"
|
|
||||||
train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)
|
|
||||||
|
|
||||||
train_df.to_csv(TRAIN)
|
|
||||||
val_df.to_csv(EVALUATION)
|
|
||||||
test_df.to_csv(TEST)
|
|
||||||
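The removed `split_csv_by_percent` above floors the train and validation sizes and gives the remainder to test, so every row is assigned. A quick worked check of that arithmetic:

```python
# Worked check of the proportional split logic above (floor + remainder to test).
n, train, val, test = 10, 70, 15, 15
total = train + val + test
n_train = int(n * train / total)   # 7
n_val = int(n * val / total)       # 1
n_test = n - n_train - n_val       # 2 (absorbs the rounding loss)
print(n_train, n_val, n_test)
```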
@@ -1,381 +0,0 @@
|
|||||||
# This file deletes in the pipeline the unwanted relationship by different rules
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# SQL-FIRST VERSION
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# In the original (pandas) version this module:
|
|
||||||
# - stored frequency filters in DataFrames,
|
|
||||||
# - filtered/cleaned DataFrames in-memory,
|
|
||||||
# - added special tokens via string ops,
|
|
||||||
# - rebuilt one row per movie using groupby/aggregation.
|
|
||||||
#
|
|
||||||
# In this rewrite:
|
|
||||||
# - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
|
|
||||||
# - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
|
|
||||||
# composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
|
|
||||||
# - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
|
|
||||||
#
|
|
||||||
# Notes:
|
|
||||||
# - We keep the same CLASS and METHOD NAMES to preserve call sites.
|
|
||||||
# - Method comments/docstrings from your original file are carried over and updated
|
|
||||||
# to reflect Select-based behavior and return types.
|
|
||||||
# - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
|
|
||||||
# - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
|
|
||||||
# swap with an equivalent string-agg function.
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from sqlalchemy import select, func, literal
|
|
||||||
from sqlalchemy.sql import Select
|
|
||||||
|
|
||||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
|
||||||
|
|
||||||
|
|
||||||
class PipelineApplier():
|
|
||||||
"""
|
|
||||||
SQL-first pipeline applier.
|
|
||||||
|
|
||||||
In the pandas version, frequency filters were stored as DataFrames (self.MOVIE_FILTER / self.REL_FILTER)
|
|
||||||
and every method worked with/returned pandas.DataFrame. In this SQLAlchemy rewrite:
|
|
||||||
|
|
||||||
- self.MOVIE_FILTER and self.REL_FILTER become *subselects* (Select objects) that yield a single
|
|
||||||
column each (MovieID or RelationshipURI). These subselects can be applied via `WHERE IN (subquery)`.
|
|
||||||
|
|
||||||
- Every method that previously returned a DataFrame now returns a *Select* that represents the same
|
|
||||||
logical transformation, but pushed into the database engine.
|
|
||||||
|
|
||||||
- Comments and docstrings are updated to reflect SQL semantics while preserving your original intent.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
# In the pandas version these were DataFrames storing allowed keys.
|
|
||||||
# Here they are Select objects (single-column subselects) or None.
|
|
||||||
# Expected column names:
|
|
||||||
# - self.MOVIE_FILTER: "MovieID"
|
|
||||||
# - self.REL_FILTER: "RelationshipURI"
|
|
||||||
self.MOVIE_FILTER: Optional[Select] = None
|
|
||||||
self.REL_FILTER: Optional[Select] = None
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Relationship deletion
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
|
|
||||||
"""
|
|
||||||
Return a Select where rows having the given relationship URI are removed.
|
|
||||||
|
|
||||||
Original signature (pandas):
|
|
||||||
def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame
|
|
||||||
|
|
||||||
Updated behavior:
|
|
||||||
- RDF is a Select with columns: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
|
|
||||||
- We apply a WHERE clause: RelationshipURI != <uri>
|
|
||||||
- Returns a Select you can continue composing.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
RDF (Select): a selectable representing the RDF joined view
|
|
||||||
uri (str): RelationshipURI to exclude
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Select: filtered selectable (no execution yet)
|
|
||||||
"""
|
|
||||||
sc = RDF.selected_columns
|
|
||||||
return RDF.where(sc.RelationshipURI != literal(uri))
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Frequency filter: MOVIE
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
|
|
||||||
"""
|
|
||||||
You MUST call this before filtering by movie frequency [filter_by_frequency_movie_id()],
|
|
||||||
since this method creates such filter.
|
|
||||||
|
|
||||||
Original behavior:
|
|
||||||
- Input MOVIE_COUNT as DataFrame ["MovieID","Count"]
|
|
||||||
- Keep rows where Count in [min_treshold, max_treshold)
|
|
||||||
- Store the filtered keys in self.MOVIE_FILTER
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- MOVIE_COUNT is a Select that yields ["MovieID","Count"].
|
|
||||||
- We build and store a *subselect* of allowed MovieID (single column) to be used by WHERE IN.
|
|
||||||
- No query is executed here; we only create a new Select.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
MOVIE_COUNT (Select): yields columns MovieID, Count
|
|
||||||
min_treshold (int):
|
|
||||||
max_treshold (int):
|
|
||||||
"""
|
|
||||||
sc = MOVIE_COUNT.selected_columns
|
|
||||||
filtered = MOVIE_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
|
|
||||||
# Keep only the key column so it can be used in an IN (subquery)
|
|
||||||
self.MOVIE_FILTER = select(filtered.selected_columns.MovieID)
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Frequency filter: RELATIONSHIP
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
|
|
||||||
"""
|
|
||||||
Original behavior:
|
|
||||||
- Input REL_COUNT as DataFrame ["RelationshipURI","Count"]
|
|
||||||
- Keep rows where Count in [min_treshold, max_treshold)
|
|
||||||
- Store the filtered keys in self.REL_FILTER
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- REL_COUNT is a Select that yields ["RelationshipURI","Count"].
|
|
||||||
- We build and store a *subselect* of allowed RelationshipURI (single column) to be used by WHERE IN.
|
|
||||||
- No query is executed here; we only create a new Select.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
REL_COUNT (Select): yields columns RelationshipURI, Count
|
|
||||||
min_treshold (int):
|
|
||||||
max_treshold (int):
|
|
||||||
"""
|
|
||||||
sc = REL_COUNT.selected_columns
|
|
||||||
filtered = REL_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
|
|
||||||
self.REL_FILTER = select(filtered.selected_columns.RelationshipURI)
|
|
||||||
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
# Apply frequency filters
|
|
||||||
# -------------------------------------------------------------------------
|
|
||||||
def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
|
|
||||||
"""
|
|
||||||
Original behavior (pandas):
|
|
||||||
RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- If self.MOVIE_FILTER is present, apply: WHERE MovieID IN ( <subselect> )
|
|
||||||
- Otherwise, return RDF unchanged.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
RDF (Select): current dataset
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Select: filtered dataset (or unchanged if no filter exists)
|
|
||||||
"""
|
|
||||||
if self.MOVIE_FILTER is None:
|
|
||||||
return RDF
|
|
||||||
sc = RDF.selected_columns
|
|
||||||
return RDF.where(sc.MovieID.in_(self.MOVIE_FILTER))
|
|
||||||
|
|
||||||
def filter_by_frequency_relationship(self, RDF: Select) -> Select:
|
|
||||||
"""
|
|
||||||
Original behavior (pandas):
|
|
||||||
RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
|
|
||||||
|
|
||||||
Updated behavior (SQL):
|
|
||||||
- If self.REL_FILTER is present, apply: WHERE RelationshipURI IN ( <subselect> )
|
|
||||||
- Otherwise, return RDF unchanged.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
RDF (Select): current dataset
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Select: filtered dataset (or unchanged if no filter exists)
|
|
||||||
"""
|
|
||||||
if self.REL_FILTER is None:
|
|
||||||
return RDF
|
|
||||||
sc = RDF.selected_columns
|
|
||||||
return RDF.where(sc.RelationshipURI.in_(self.REL_FILTER))
|
|
||||||
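A hedged sketch of how the Select-returning methods above compose: each call just wraps another WHERE clause around the "dataview", and SQL only runs at `execute`. It assumes the `PipelineApplier` defined above is importable; the table and column names below are illustrative, not the project's actual schema.

```python
from sqlalchemy import Column, Integer, MetaData, String, Table, create_engine, select

metadata = MetaData()
rdf_table = Table(
    "rdf", metadata,
    Column("MovieID", Integer),
    Column("SubjectURI", String),
    Column("RelationshipURI", String),
    Column("ObjectURI", String),
    Column("Abstract", String),
)

applier = PipelineApplier()
view = select(rdf_table)                                   # base "dataview", nothing executed
view = applier.delete_relationship_by_str(view, "http://example.org/ignored")
view = applier.filter_by_frequency_relationship(view)      # no-op until a REL filter is generated

engine = create_engine("sqlite:///:memory:")
metadata.create_all(engine)
with engine.connect() as conn:
    rows = conn.execute(view).all()                        # the composed SQL runs only here
```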
|
|
||||||
    # -------------------------------------------------------------------------
    # Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
    # -------------------------------------------------------------------------
    def rdf_add_special_token(self, RDF: Select) -> Select:
        """
        Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI,
        OBJ to ObjectURI, REL to RelationshipURI. Check
        Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.

        It only adds the special token of the three elements of the RDF; no other special token.

        Original behavior (pandas):
            - String concatenation with columns in a DataFrame.
            - Returned a new DataFrame.

        Updated behavior (SQL):
            - Build projected columns using SQL string concatenation.
            - Return a new Select with the same output column names:
              ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"].

        Args:
            RDF (Select): current dataset

        Returns:
            Select: projected dataset with tokenized SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        subj_tok = literal(SpecialToken.SUBJECT.value) + sc.SubjectURI
        rel_tok = literal(SpecialToken.RELATIONSHIP.value) + sc.RelationshipURI
        obj_tok = literal(SpecialToken.OBJECT.value) + sc.ObjectURI

        return RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            subj_tok.label("SubjectURI"),
            rel_tok.label("RelationshipURI"),
            obj_tok.label("ObjectURI"),
            sc.Abstract.label("Abstract"),
        )
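The literal(token) + column expressions above render as SQL string concatenation; a small sketch of how one such projected column compiles on SQLite (column name illustrative):

from sqlalchemy import String, column, literal, select
from sqlalchemy.dialects import sqlite

subject = column("SubjectURI", String)
tokenized = (literal("<SUBJ>") + subject).label("SubjectURI")

stmt = select(tokenized)
# Compiles to roughly: SELECT '<SUBJ>' || "SubjectURI" AS "SubjectURI"
print(stmt.compile(dialect=sqlite.dialect(), compile_kwargs={"literal_binds": True}))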
    # -------------------------------------------------------------------------
    # NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
    # -------------------------------------------------------------------------
    def drop_na_from_dataset(self, RDF: Select) -> Select:
        """
        Dataset has SubjectURI, RelationshipURI, ObjectURI. We want to drop rows
        where any of these is empty or NULL.

        Original behavior (pandas):
            - Replace '' with NaN and dropna on the three columns.

        Updated behavior (SQL):
            - Apply WHERE clauses checking for NOT NULL and not empty string.

        Args:
            RDF (Select): current dataset

        Returns:
            Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        return RDF.where(
            (sc.SubjectURI.is_not(None)) & (sc.SubjectURI != "") &
            (sc.RelationshipURI.is_not(None)) & (sc.RelationshipURI != "") &
            (sc.ObjectURI.is_not(None)) & (sc.ObjectURI != "")
        )
    # -------------------------------------------------------------------------
    # Rebuild by movie (one row per movie)
    # -------------------------------------------------------------------------
    def rebuild_by_movie(self, RDF: Select) -> Select:
        """
        This method assumes the data is processed movie by movie, because by design
        we want one row per movie at the end.

        Original behavior (pandas):
            - Build per-row "Triple" as SubjectURI + RelationshipURI + ObjectURI,
              wrapped with START_TRIPLE/END_TRIPLE.
            - Group by ["MovieID", "Abstract"] and join ("".join) all Triple strings into one.
            - Prefix the whole list with START_TRIPLE_LIST and Abstract with ABSTRACT.
            - Return DataFrame [["MovieID","Triple","Abstract"]].

        Updated behavior (SQL):
            - Build per-row Triple using SQL string concatenation and constants.
            - Use GROUP_CONCAT (empty separator) to aggregate per-movie.
            - Prefix with START_TRIPLE_LIST and ABSTRACT in SQL.
            - Return a Select with columns: ["MovieID","Triple","Abstract"].

        Args:
            RDF (Select): current dataset with columns
                MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract

        Returns:
            Select: aggregated dataset with one row per movie
        """
        sc = RDF.selected_columns

        # Per-row triple with START/END_TRIPLE tokens
        row_triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")

        # Prefixed abstract
        abstract_tok = (literal(SpecialToken.ABSTRACT.value) + sc.Abstract).label("Abstract")

        # Subquery of per-row triples / abstracts
        row_view = RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            row_triple,
            abstract_tok,
        ).subquery()

        # Concatenate all triples for each movie (SQLite syntax; adjust for other DBs)
        triple_concat = (
            literal(SpecialToken.START_TRIPLE_LIST.value) +
            func.group_concat(row_view.c.Triple, literal(""))
        ).label("Triple")

        return (
            select(
                row_view.c.MovieID.label("MovieID"),
                triple_concat,
                row_view.c.Abstract.label("Abstract"),
            )
            .group_by(row_view.c.MovieID, row_view.c.Abstract)
        )
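A minimal sketch of the GROUP_CONCAT aggregation this relies on, using an in-memory SQLite table with invented values; note that SQLite does not guarantee concatenation order without extra work:

import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE rows (MovieID INT, Triple TEXT, Abstract TEXT)")
con.executemany(
    "INSERT INTO rows VALUES (?, ?, ?)",
    [
        (1, "<SOT>s1 p1 o1<EOT>", "<ABS>abstract one"),
        (1, "<SOT>s2 p2 o2<EOT>", "<ABS>abstract one"),
    ],
)

# Empty-string separator mirrors func.group_concat(row_view.c.Triple, literal(""))
row = con.execute(
    "SELECT MovieID, GROUP_CONCAT(Triple, ''), Abstract "
    "FROM rows GROUP BY MovieID, Abstract"
).fetchone()
print(row)  # typically (1, '<SOT>s1 p1 o1<EOT><SOT>s2 p2 o2<EOT>', '<ABS>abstract one')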
    # -------------------------------------------------------------------------
    # Build triple(s) projection
    # -------------------------------------------------------------------------
    @staticmethod
    def build_triple(RDF: Select) -> Select:
        """
        Obtains joined RDF triple in one element, together with START and END special tokens.

        Original behavior (pandas):
            - Returned a Series/DataFrame column "Triple" built from three string columns.

        Updated behavior (SQL):
            - Returns a Select with a single column "Triple" built in SQL.

        Args:
            RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: a projection containing one column named "Triple"
        """
        sc = RDF.selected_columns
        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)
    @staticmethod
    def build_incomplete_triple(RDF: Select) -> Select:
        """
        Method helper used for the third task: "Predicting a masked component within an RDF triple".
        Obtains joined RDF triple in one element, together with START and END special tokens.
        The MISSING element will be replaced by the special token <MASK>.

        Original behavior (pandas):
            - Created a Series "Triple" using fallback values for missing columns.

        Updated behavior (SQL):
            - Uses COALESCE to replace NULLs with <MASK> directly in SQL.
            - Returns a Select with a single column "Triple".

        Args:
            RDF (Select): 2 of the following columns present ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: projection with column "Triple"
        """
        sc = RDF.selected_columns
        mask = literal(SpecialToken.MASK.value)

        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (func.coalesce(sc.SubjectURI, mask) +
             func.coalesce(sc.RelationshipURI, mask) +
             func.coalesce(sc.ObjectURI, mask)) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)
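The COALESCE-based masking can be exercised on its own; a sketch that fakes the "2 of 3 columns present" case by selecting a NULL object (column names illustrative):

from sqlalchemy import String, cast, column, func, literal, null, select
from sqlalchemy.dialects import sqlite

incomplete = select(
    column("SubjectURI", String).label("SubjectURI"),
    column("RelationshipURI", String).label("RelationshipURI"),
    cast(null(), String).label("ObjectURI"),   # the missing component
)

sc = incomplete.selected_columns
masked = (
    func.coalesce(sc.SubjectURI, literal("<MASK>")) +
    func.coalesce(sc.RelationshipURI, literal("<MASK>")) +
    func.coalesce(sc.ObjectURI, literal("<MASK>"))
).label("Triple")

# NULL ObjectURI becomes '<MASK>' inside the concatenated triple.
print(incomplete.with_only_columns(masked).compile(dialect=sqlite.dialect()))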
    @staticmethod
    def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
        """
        Currently not used.

        Original intention:
            Given two DataFrames (one incomplete RDF and another with just the missing component),
            apply special tokens accordingly.

        Updated note:
            This stub remains for API parity. If needed in the future, it can be implemented
            as a Select-building helper that merges/COALESCEs columns from different selects.
        """
        return None
@@ -1,148 +0,0 @@
# This file deletes in the pipeline the unwanted relationship by different rules
import pandas as pd
import sqlite3  # kept for compatibility
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier:
    def __init__(self):
        # Fast internal caches for O(1) membership checks
        self._MOVIE_FILTER_SET = set()
        self._REL_FILTER_SET = set()

    # ------------------------------
    # Filters
    # ------------------------------
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        # Vectorized boolean mask
        return RDF.loc[RDF["RelationshipURI"] != uri]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency [filter_by_frequency_movie_id()],
        since this method creates such filter.
        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
        """
        sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
        self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
        self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin is the fastest path
        return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]

    # ------------------------------
    # Cleaning & preprocessing
    # ------------------------------
    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Adds RDF special token to SubjectURI / RelationshipURI / ObjectURI.
        Returns a new DataFrame (no inplace modification of the caller's object).
        """
        subj = np.char.add(SpecialToken.SUBJECT.value, RDF["SubjectURI"].to_numpy(dtype=object))
        rel = np.char.add(SpecialToken.RELATIONSHIP.value, RDF["RelationshipURI"].to_numpy(dtype=object))
        obj = np.char.add(SpecialToken.OBJECT.value, RDF["ObjectURI"].to_numpy(dtype=object))
        return RDF.assign(SubjectURI=subj, RelationshipURI=rel, ObjectURI=obj)

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Replace '' with NaN only on key columns, then drop rows missing any of them.
        """
        cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
        rdf = RDF.copy()
        for c in cols:
            m = rdf[c] == ""
            if m.any():
                rdf.loc[m, c] = np.nan
        return rdf.dropna(subset=cols)

    # ------------------------------
    # Building triples
    # ------------------------------
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains joined RDF triple in one element, together with START and END special token.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value

        subj = RDF["SubjectURI"].to_numpy(dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object)

        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Accepts any subset of ["SubjectURI","RelationshipURI","ObjectURI"] (typically 2 of 3).
        Missing components are replaced by <MASK>.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value
        maskv = SpecialToken.MASK.value
        n = len(RDF.index)

        subj = RDF["SubjectURI"].to_numpy(dtype=object) if "SubjectURI" in RDF else np.full(n, maskv, dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object) if "RelationshipURI" in RDF else np.full(n, maskv, dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object) if "ObjectURI" in RDF else np.full(n, maskv, dtype=object)

        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    def rebuild_by_movie(self, RDF: pd.DataFrame):
        """
        Collapse triples + abstract into a single row per movie.
        Returns: ["MovieID","Triple","Abstract"]
        """
        # Build triples once (vectorized); method also sets RDF["Triple"]
        triples = self.build_triple(RDF)

        # Minimal frame for grouping (avoid carrying extra columns)
        tmp = pd.DataFrame({
            "MovieID": RDF["MovieID"].to_numpy(),
            "Abstract": RDF["Abstract"].to_numpy(),
            "Triple": triples.to_numpy(),
        })

        # Factorize high-cardinality keys to fast integer codes, group on codes,
        # then map back to labels; sum concatenates strings for object dtype.
        mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
        abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)

        tmp["_mid"] = mid_codes
        tmp["_abs"] = abs_codes

        grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()

        grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
        grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])

        # Final tokens
        grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
        grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"]

        return grouped[["MovieID", "Triple", "Abstract"]]
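As a quick illustration of the factorize-and-sum trick in rebuild_by_movie above (summing an object column concatenates strings), a tiny example with made-up values:

import pandas as pd

tmp = pd.DataFrame({
    "MovieID": [1, 1, 2],
    "Abstract": ["<ABS>a1", "<ABS>a1", "<ABS>a2"],
    "Triple": ["<SOT>x<EOT>", "<SOT>y<EOT>", "<SOT>z<EOT>"],
})

# Integer codes are cheaper to group on than long strings.
mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)
tmp["_mid"], tmp["_abs"] = mid_codes, abs_codes

grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()
grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])

print(grouped[["MovieID", "Triple", "Abstract"]])
# MovieID 1 -> "<SOT>x<EOT><SOT>y<EOT>", MovieID 2 -> "<SOT>z<EOT>"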
@@ -1,23 +1,28 @@
 import re
 from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
-from Scripts.DataCleaning.legacy.filter import PipelineApplier
+from Scripts.DataCleaning.filter import PipelineApplier
 # tasks dataset builder
 from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
 from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
 from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
 from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
-from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv

 import pandas as pd

 class Pipeline():
-    def __init__(self):
+    def __init__(self,
+                 mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv",
+                 bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt",
+                 text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv",
+                 completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv",
+                 ):
         self.sql_endpoint = SqlEndpoint()
         # classes to manage taskes' datasets
-        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
+        self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
-        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
+        self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
-        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
+        self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
-        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
+        self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path)

         # prepare the filter
         # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset
@@ -25,16 +30,13 @@ class Pipeline():
         MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
         REL_COUNT = self.sql_endpoint.get_relationship_count()
         self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
-        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069
+        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
         # prepare the filter on the relationshipURI you want to delete:
         relationship_uri_banned_list = [
             "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
             "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
             "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
-            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
-            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
-            "dbp-dbo:soundRecording"
-        ]
+            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
         self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

@@ -96,8 +98,6 @@ class Pipeline():
            # other filter
            #
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
-           # regex on ObjectURI
-           RDF = self.filter_applier.regex_on_objects(RDF)
            if RDF.empty:
                continue
            RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
@@ -119,13 +119,9 @@ class Pipeline():
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

-    def generate_csv_debug_file(self, debug_path:str):
-        debug_csv = Debug_csv(debug_path)
-
-        for RDF in self._get_cleaned_movie_rows():
-            debug_csv.write(RDF)
-
-        debug_csv.close()
+    def reduce_movie_list(self, starting_offset:int , ending_offset:int):
+        self.filter_applier.reduce_movie_list(starting_offset,ending_offset)

 # there are a lot of settings to manage
@@ -134,12 +130,11 @@ class Pipeline():
 # in the use_toy_dataset , to change the toy dataset
 # in _get_cleaned_movie_rows: to change how the pipeline behave

-pipeline = Pipeline()
+#pipeline = Pipeline()

-pipeline.use_toy_dataset()
+# pipeline.use_toy_dataset()
 # pipeline.execute_task_bpe_corpus()
 # pipeline.execute_task_rdf_mask()
 # pipeline.execute_tasks_rdf_text()
 # pipeline.execute_task_rdf_completation()
 # pipeline.execute_all_task()
-pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
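With the constructor now parameterized, callers can point the task outputs anywhere; a short usage sketch (the run_01 scratch directory is made up for the example, the execute_* calls mirror the commented-out examples above):

from Scripts.DataCleaning.pipeline import Pipeline

# Write the task datasets to a scratch directory instead of the defaults.
pipeline = Pipeline(
    mask_task_dataset_path="./Assets/Dataset/Tmp/run_01/rdf_mask.csv",
    bpe_corpus_path="./Assets/Dataset/Tmp/run_01/corpus.txt",
    text_to_rdf_task_dataset_path="./Assets/Dataset/Tmp/run_01/rdf_text.csv",
    completation_rdf_task_dataset_path="./Assets/Dataset/Tmp/run_01/rdf_completation.csv",
)

pipeline.use_toy_dataset()      # restrict to the 10 hand-picked movies
pipeline.execute_all_task()     # or execute_task_bpe_corpus(), execute_tasks_rdf_text(), ...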
@@ -1,86 +0,0 @@
# This file deletes in the pipeline the unwanted relationship by different rules
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier():

    def __init__(self):
        pass

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Adds RDF special token to each element of the tuple, i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
        It only adds the special token of the three elements of the RDF, no other special token.
        Args:
            RDF (pd.DataFrame):
        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # if the filter that ran before sliced the RDF and created a view, the problem is resolved here
        # for more context: SettingWithCopyWarning
        RDF = RDF.copy()
        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF


    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # to execute this method you have to have iterated by movie_id,
        # because by design we want one row for each movie at the end
        # MovieID and abstract can be given as input for a more generic method
        # first let's combine each row, creating column Triple as the join of the RDF
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # combine rows into one
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add special token for: start of triple, end of triple and start of abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + SpecialToken.END_OF_SENTENCE.value
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return RDF[["MovieID","Triple","Abstract"]]


    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains joined RDF triple in one element, together with START and END special token
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.DataFrame: RDF["Triple"] (just this column)
        """
        # let's combine each row, creating column Triple as the join of the RDF
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]


    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks

        return RDF
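A tiny illustration of what regex_on_objects does to object literals (the values are invented for the example):

import pandas as pd

RDF = pd.DataFrame({"ObjectURI": ["line one\nline two", "*starred* value"]})

RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                    .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
                    .str.replace(r"\*", "", regex=True))        # drop asterisks

print(RDF["ObjectURI"].tolist())
# ['line one, line two', 'starred value']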
@@ -1,103 +0,0 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class MovieFilter:

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all movie_id
        movie_query = "SELECT MovieID FROM Movies"
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)


    def frequency_filter(self, min_treshold:int, max_treshold:int):
        movie_list_placeholder = ",".join(["?"] * len(self.MOVIE_FILTER))

        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
        """
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.MOVIE_FILTER["MovieID"].to_list()))


    def get_movie_id(self):
        return self.MOVIE_FILTER


    def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = '{parsed_rel_uri}' THEN 1 ELSE 0 END)
            BETWEEN {min_treshold} AND {max_treshold};
        """

        params = tuple(movie_ids)  # + (parsed_rel_uri, min_treshold, max_treshold)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)


    def filter_by_director(self):
        director_list = ['dbp-dbo:director','dbp-dbp:director']

        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        JOIN ParsedRelationships USING (RelationshipID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
            AND ParsedRelationships.RelationshipURI IN {tuple(director_list)};
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)


    def filter_by_english_movies(self):
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        relationship = ["dbp-dbp:language"]
        objects_list = ["English", "dbp-dbr:English_language"]

        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
            AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
            AND ParsedObjects.ObjectURI in {tuple(objects_list)};
        """

        other_query = f"""
        SELECT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
            AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
        GROUP BY RDFs.MovieID
        HAVING
            SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
            AND
            SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)



# movie_filter = MovieFilter()
# movie_filter.frequency_filter(5,10)
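The ",".join(["?"] * len(ids)) trick above builds a placeholder list so the IDs travel as bound parameters instead of being interpolated into the SQL; a minimal standalone sketch with sqlite3 (the table and values are invented for the example):

import sqlite3
import pandas as pd

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE RDFs (MovieID INT)")
con.executemany("INSERT INTO RDFs VALUES (?)", [(1,), (1,), (2,), (3,)])

movie_ids = [1, 2]
placeholders = ",".join(["?"] * len(movie_ids))   # "?,?"

query = f"""
SELECT MovieID, COUNT(*) AS Count
FROM RDFs
WHERE MovieID IN ({placeholders})
GROUP BY MovieID;
"""

# Values are passed separately, so the driver handles the escaping.
counts = pd.read_sql_query(query, con, params=tuple(movie_ids))
print(counts)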
@@ -1,155 +0,0 @@
from movie_filter import MovieFilter
from relationship_filter import RelationshipFilter
from rdf_filter import RdfFilter
from cleaner import PipelineApplier

from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv

import pandas as pd

RELATIONSHIP_FILTER_LIST = [
    "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
    "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
    "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
    "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
    "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
    "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
    "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
    "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
    "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
    "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
    "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
    "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
    "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
    "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
    "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
    "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
    "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
    "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
    "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
    "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
    "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
    "dbp-dbp:website"
]

RELATIONSHIP_WHITE_LIST = [
    "dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
]
"""
SELECT DISTINCT field3
FROM debug
"""

class Pipeline():

    def __init__(self) -> None:
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()

        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        self._movie_filter.frequency_filter(50,3000)
        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)

    def other_filter(self):
        self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
        self._movie_filter.filter_by_director()
        self._movie_filter.filter_by_english_movies()
        self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important films have the budget relationship
        self._movie_filter.relation_filter("dbp-dbp:released",1,100) # to cut to 2000 :(

    def _get_cleaned_movie_rows(self):
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()
        # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)

        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
            RDF = self._pipeline.drop_na_from_dataset(RDF)
            RDF = self._pipeline.regex_on_objects(RDF)
            RDF = self._pipeline.rdf_add_special_token(RDF)

            if RDF.empty:
                continue
            yield RDF


    def execute_task_bpe_corpus(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            RDF = RDF[["Triple","Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()


    def execute_tasks_rdf_text(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()


    def execute_task_rdf_completation(self):
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self._pipeline.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        self._end_file_handler()


    def _end_file_handler(self):
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()


    def execute_all_task(self):
        for RDF in self._get_cleaned_movie_rows():
            completation_RDF = RDF.copy()
            completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
            self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])

            RDF = self._pipeline.rebuild_by_movie(RDF)

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])

        self._end_file_handler()


    def use_toy_dataset(self):
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})

    def generate_csv_debug_file(self, debug_path:str):
        debug_csv = Debug_csv(debug_path)

        for RDF in self._get_cleaned_movie_rows():
            debug_csv.write(RDF)

        debug_csv.close()


pipe = Pipeline()
#pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
@@ -1,32 +0,0 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class RdfFilter:

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()


    # def delete_hyperum_when_movie(self):
    #     purl:linguistics/gold/hypernym
    #     is almost always "dbp-dbr:Movie" or "dbp-dbr:Film"
    #     banned triple

    def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
        relationship_placeholder = ",".join(["?"] * len(REL_ID))

        param = tuple(REL_ID["RelationshipID"].to_list())

        QUERY = f"""
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?) AND RelationshipID IN ({relationship_placeholder});
        """

        for movie_id in MOVIE_ID["MovieID"].to_list():
            params = (movie_id,) + param
            yield self.sql_endpoint.get_dataframe_from_query(QUERY, params=params)
@@ -1,54 +0,0 @@
import pandas as pd
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class RelationshipFilter:

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all relationship_id
        relationship_query = "SELECT RelationshipID FROM Relationships"
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)


    def frequency_filter(self, min_treshold:int, max_treshold:int):
        movie_list_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))

        filter_query = f"""
        SELECT RelationshipID
        FROM RDFs
        WHERE RelationshipID IN ({movie_list_placeholder})
        GROUP BY RelationshipID
        HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
        """
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()))


    def get_relationship_id(self):
        return self.RELATIONSHIP_FILTER

    def get_relationship_id_from_white_list(self, relationship_list: list[str]):
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(relationship_list))
        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
            AND RelationshipURI IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list)
        return self.sql_endpoint.get_dataframe_from_query(filter_query, params)



    def delete_relationship_uri_by_list(self, filter_list: list[str]):
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(filter_list))

        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
            AND RelationshipURI NOT IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(filter_list)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
Scripts/Experiments/change_me/use_bpe_pipeline.py (new file, 21 lines)
@@ -0,0 +1,21 @@
import Project_Model.Libs.BPE as BPE
from pathlib import Path
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

VOCABULARY_path = "Assets/Model/toy_10/toy_dictionary.json"
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))

SPECIAL_TOKEN_LIST = [token.value for token in SpecialToken]

# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>dbp-dbp:title<OBJ>dbp-dbr:The_Dark_Knight<EOT>"
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
# INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan, from a screenplay co-written with his brother Jonathan. Based on the DC Comics superhero Batman, it is the sequel to Batman Begins (2005), and the second installment in The Dark Knight trilogy. The plot follows the vigilante Batman, police lieutenant James Gordon, and district attorney Harvey Dent, who form an alliance to dismantle organized crime in Gotham City. Their efforts are derailed by the Joker, an anarchistic mastermind who seeks to test how far Batman will go to save the city from chaos. The ensemble cast includes Christian Bale, Michael Caine, Heath Ledger, Gary Oldman, Aaron Eckhart, Maggie Gyllenhaal, and Morgan Freeman.Warner Bros. Pictures prioritized a sequel following the successful reinvention of the Batman film series with Batman Begins. Christopher and Batman Begins co-writer David S. Goyer developed the story elements, making Dent the central protagonist caught up in the battle between Batman and the Joker. In writing the screenplay, the Nolans were influenced by 1980s Batman comics and crime drama films, and sought to continue Batman Begins' heightened sense of realism. From April to November 2007, filming took place with a $185 million budget in Chicago and Hong Kong, and on sets in England. The Dark Knight was the first major motion picture to be filmed with high-resolution IMAX cameras. Christopher avoided using computer-generated imagery unless necessary, insisting on practical stunts such as flipping an 18-wheel truck and blowing up a factory.The Dark Knight was marketed with an innovative interactive viral campaign that initially focused on countering criticism of Ledger's casting by those who believed he was a poor choice to portray the Joker. Ledger died from an accidental prescription drug overdose in January 2008, leading to widespread interest from the press and public regarding his performance. When it was released in July, The Dark Knight received acclaim for its mature tone and themes, visual style, and performances—particularly that of Ledger, who received many posthumous awards including Academy, BAFTA, and Golden Globe awards for Best Supporting Actor, making The Dark Knight the first comic-book film to receive major industry awards. It broke several box-office records and became the highest-grossing 2008 film, the fourth-highest-grossing film to that time, and the highest-grossing superhero film of the time.Since its release, The Dark Knight has been assessed as one of the greatest superhero films ever, one of the best movies of the 2000s, and one of the best films ever made. It is considered the \"blueprint\" for many modern superhero films, particularly for its rejection of a typical comic-book movie style in favor of a crime film that features comic-book characters. Many filmmakers sought to repeat its success by emulating its gritty, realistic tone to varying degrees of success. The Dark Knight has been analyzed for its themes of terrorism and the limitations of morality and ethics. The United States Library of Congress selected it for preservation in the National Film Registry in 2020. A sequel, The Dark Knight Rises, concluded The Dark Knight trilogy in 2012.<SOTL>"
INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>"
# INPUT = "<ABS> Nolan,<SOTL>"
# 32: " "
TOKENANO = BPE.Classes.TokeNanoCore(VOCABULARY, SPECIAL_TOKEN_LIST)
print(f"input: {INPUT} \ninput length: {len(INPUT)}")
encoded = TOKENANO.encode(INPUT)
print(f"encode: {encoded} \nencode length: {len(encoded)}")
decoded = TOKENANO.decode(encoded)
print(f"decode: {decoded} \ndecode length: {len(decoded)}")
@@ -1,6 +1,7 @@
 from enum import Enum
+
-class SpecialToken(str, Enum):
+class SpecialToken(Enum):
     # (Enum, str) -> throws an error
     START_TRIPLE_LIST = "<SOTL>"
     START_TRIPLE = "<SOT>"
@@ -9,7 +10,6 @@ class SpecialToken(str, Enum):
     RELATIONSHIP = "<PRED>"
     OBJECT = "<OBJ>"
     ABSTRACT = "<ABS>"
-    END_OF_SENTENCE = "<EOS>"
     CORPUS_END = "<END>"

     ## Tasks' Token
@@ -18,5 +18,4 @@ class SpecialToken(str, Enum):
     CONTINUE_RDF = "<CONTINUERDF>"
     MASK = "<MASK>"

-    #BPE Training:
+    # BPE Training:
@@ -133,11 +133,6 @@ class SqlEndpoint():
         GROUP BY RelationshipURI;
         """
         return pd.read_sql_query(QUERY, self.sql_engine)

-    def get_dataframe_from_query(self, query: str, params=None):
-        if params is None:
-            return pd.read_sql_query(query, self.sql_engine)
-        return pd.read_sql_query(query, self.sql_engine, params=params)
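Since get_dataframe_from_query is removed here, call sites that still need ad-hoc parameterized reads can go through pandas directly; a minimal sketch with a DBAPI connection (the database path is illustrative, the table and the movie IDs come from the examples elsewhere in this changeset):

import sqlite3
import pandas as pd

con = sqlite3.connect("Assets/Dataset/dataset.db")   # path is a placeholder

query = "SELECT MovieID FROM RDFs WHERE MovieID IN (?, ?)"
# "?" placeholders plus a separate params tuple, same style as the removed helper
frame = pd.read_sql_query(query, con, params=(117248, 147074))
print(frame.head())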
Scripts/Training/bpe_trainer.py (new file, 101 lines)
@@ -0,0 +1,101 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

DEFAULT_CHUNK_SIZE = int(18e4)
DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        cache_dir: str,
        output_file: str,
        resume_at: int,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        chunk_size: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.cache_dir = cache_dir
        self.output_file = output_file
        self.resume_at = resume_at
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.chunk_size = chunk_size
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--cache-dir", "--cache", "-c", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--resume-at", "--resume", "-r", default=0, type=int)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--chunk-size", default=DEFAULT_CHUNK_SIZE, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.cache_dir,
        parsed_args.output_file,
        parsed_args.resume_at,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.chunk_size,
        parsed_args.debug_after,
    ) # type ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTrainer(
        args.max_vocabulary,
        TOKEN_LIST,
        args.chunk_size,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    CACHE_DIR = Path(args.cache_dir)
    VOCABULARY_PATH = Path(args.output_file)

    print(f"Training BPE")
    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH,
        CACHE_DIR,
        resume_from_iter=args.resume_at
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")
    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
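A quick way to drive this trainer without the command line is to hand an argv-style list straight to get_args; the paths below are placeholders except for the corpus and toy dictionary locations, which appear elsewhere in this changeset:

from Scripts.Training.bpe_trainer import get_args, train

# Equivalent to:
#   python -m Scripts.Training.bpe_trainer -i corpus.txt -c cache/ -o vocab.json --max-voc 32000
ARGS = get_args([
    "--input-file", "Assets/Dataset/Tmp/corpus.txt",            # produced by the cleaning pipeline
    "--cache-dir", "Assets/Dataset/Tmp/cache",                   # placeholder cache directory
    "--output-file", "Assets/Model/toy_10/toy_dictionary.json",
    "--max-vocabulary", "32000",
])
train(ARGS)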
Scripts/Training/bpe_trainer_pool.py (new file, 96 lines)
@@ -0,0 +1,96 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        output_file: str,
        cache_file: str,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.output_file = output_file
        self.cache_file = cache_file
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--cache-file", "--cache", "-c", required=True, type=str)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.cache_file,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.debug_after,
    ) # type ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTrainerPool(
        args.max_vocabulary,
        TOKEN_LIST,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    VOCABULARY_PATH = Path(args.output_file)
    CACHE_PATH = Path(args.cache_file)

    start_bpe = BPE.NanoSocratesBPE()
    if CACHE_PATH.is_file():
        voc = BPE.load_nanos_vocabulary(CACHE_PATH)
        start_bpe = BPE.NanoSocratesBPE(voc)

    print(f"Training BPE")
    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH,
        CACHE_PATH,
        start_bpe
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")
    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
Scripts/Training/bpe_trainer_ram.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import argparse
import json
from pathlib import Path
import sys
# TODO: make relative imports
import Project_Model.Libs.BPE as BPE
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken

DEFAULT_DEBUG_AFTER_ITER = 1
DEFAULT_MAX_VOCABULARY = int(32E3)
DEFAULT_MERGE_TRESHOLD = 1
DEFAULT_MAX_ITERATIONS = 0
TOKEN_LIST = [token.value for token in SpecialToken]


class ProgramArgs:

    def __init__(
        self,
        input_file: str,
        output_file: str,
        max_vocabulary: int,
        max_iterations: int,
        merge_treshold: int,
        debug_after: int,
    ) -> None:
        self.input_file = input_file
        self.output_file = output_file
        self.max_vocabulary = max_vocabulary
        self.max_iterations = max_iterations
        self.merge_treshold = merge_treshold
        self.debug_after = debug_after


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--max-vocabulary", "--max-voc", default=DEFAULT_MAX_VOCABULARY, type=int)
    PARSER.add_argument("--max-iterations", "--max-iter", default=DEFAULT_MAX_ITERATIONS, type=int)
    PARSER.add_argument("--merge-treshold", "--tresh", default=DEFAULT_MERGE_TRESHOLD, type=int)
    PARSER.add_argument("--debug-after", default=DEFAULT_DEBUG_AFTER_ITER, type=int)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.max_vocabulary,
        parsed_args.max_iterations,
        parsed_args.merge_treshold,
        parsed_args.debug_after,
    ) # type ignore


def train(args: ProgramArgs):

    TRAINER = BPE.NanoSocraTraineRam(
        args.max_vocabulary,
        TOKEN_LIST,
        args.merge_treshold,
        args.max_iterations,
        args.debug_after
    )

    DATASET_PATH = Path(args.input_file)
    VOCABULARY_PATH = Path(args.output_file)

    print(f"Training BPE")
    BPE_ENCODER = TRAINER.trainBPE(
        DATASET_PATH
    )

    VOCABULARY = BPE_ENCODER.vocabulary

    print(f"Saving Vocabulary in {VOCABULARY_PATH}")
    BPE.save_nanos_vocabulary(VOCABULARY, VOCABULARY_PATH)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    train(ARGS)
12
Scripts/Training/dictionary_adjuster.py
Normal file
12
Scripts/Training/dictionary_adjuster.py
Normal file
@@ -0,0 +1,12 @@
# Trim the "mad"-trained dictionary down to a fixed-size vocabulary.
from Project_Model.Libs.BPE.Utils.vocabulary import load_nanos_vocabulary, save_nanos_vocabulary
from pathlib import Path

DICTIONARY_PATH = "Assets/Dataset/Tmp/mad_cache.json"
OUTPUT_PATH = "Assets/Dataset/Tmp/trimmed.json"


big_dict = load_nanos_vocabulary(Path(DICTIONARY_PATH))
# Keep only the first 31744 entries.
big_dict = dict(list(big_dict.items())[:31744])

save_nanos_vocabulary(big_dict, Path(OUTPUT_PATH))
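Since Python dicts preserve insertion order, slicing `items()` keeps the first 31744 entries exactly as they were serialized. A minimal, self-contained sketch of the same trimming step on a synthetic dict (illustrative only; the real data comes from the cached vocabulary JSON):

```python
# Synthetic stand-in for the loaded vocabulary; keys and values are made up.
full = {f"token_{i}": i for i in range(40000)}

trimmed = dict(list(full.items())[:31744])

assert len(trimmed) == 31744
assert list(trimmed) == list(full)[:31744]  # insertion order is preserved
```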
48
Scripts/Training/mad_traininng.py
Normal file
48
Scripts/Training/mad_traininng.py
Normal file
@@ -0,0 +1,48 @@
# Each round, generate a corpus larger than the previous one (without re-using the old data),
# then keep training the BPE on the same cached vocabulary.

from Scripts.DataCleaning.pipeline import Pipeline
from Scripts.Training.bpe_trainer_pool import train, get_args
from pathlib import Path
import os, shutil

CORPUS_PATH = "Assets/Dataset/Tmp/toy_corpus.txt"
VOCABULARY_PATH = "Assets/Dataset/Tmp/mad_vocabulary.json"
CACHE_PATH = "Assets/Dataset/Tmp/mad_cache.json"


def mad_corpus_generator(corpus_size: int, corpus_offset: int):
    print("New Corpus")
    pipe = Pipeline(bpe_corpus_path=CORPUS_PATH)
    print("Pipeline Created")
    corpus_ending_offset = corpus_size + corpus_offset
    pipe.reduce_movie_list(corpus_offset, corpus_ending_offset)
    print("Starting building corpus")
    pipe.execute_task_bpe_corpus()
    print("Corpus created")


def mad_bpe_trainer():
    argv = [
        "--input-file", CORPUS_PATH,
        "--output-file", VOCABULARY_PATH,
        "--cache-file", CACHE_PATH,
    ]
    args = get_args(argv)
    train(args)


def mad_hatter():
    # Corpus sizes (number of movies) for each training round.
    film_list = [10, 100, 500, 1000, 1500, 2000, 3000, 4000, 5000, 10000]
    starting_offset = 0
    for corpus_size in film_list:

        # mad_corpus_generator(corpus_size, starting_offset)
        # starting_offset = starting_offset + corpus_size

        mad_bpe_trainer()
        # Copy the freshly trained vocabulary into the cache for the next round.
        shutil.copyfile(Path(VOCABULARY_PATH), Path(CACHE_PATH))


mad_hatter()
22
docs/BPE.md
Normal file
22
docs/BPE.md
Normal file
@@ -0,0 +1,22 @@
# BPE

## Research Material

- [BPE Wikipedia](https://en.wikipedia.org/wiki/Byte-pair_encoding)
- [BPE Hugging Face](https://huggingface.co/learn/llm-course/chapter6/5)
- [BPE GeeksForGeeks](https://www.geeksforgeeks.org/nlp/byte-pair-encoding-bpe-in-nlp/)
- [BPE Medium Chetna Khanna](https://medium.com/data-science/byte-pair-encoding-subword-based-tokenization-algorithm-77828a70bee0)
- [Stack Overflow "Explain bpe (Byte Pair Encoding) with examples?"](https://stackoverflow.com/questions/50583254/explain-bpe-byte-pair-encoding-with-examples)
- [Implementing a byte pair encoding (BPE) tokenizer from scratch](https://sebastianraschka.com/blog/2025/bpe-from-scratch.html)
- [Theoretical Analysis of Byte-Pair Encoding](https://arxiv.org/pdf/2411.08671)
- [A Formal Perspective on Byte-Pair Encoding](https://aclanthology.org/2023.findings-acl.38v2.pdf)
- [Byte Pair Encoding is Suboptimal for Language Model Pretraining](https://arxiv.org/pdf/2004.03720)
- [Byte pair encoding: a text compression scheme that accelerates pattern matching](https://www.researchgate.net/profile/Takeshi-Shinohara/publication/2310624_Byte_Pair_Encoding_A_Text_Compression_Scheme_That_Accelerates_Pattern_Matching/links/02e7e522f8ea00c318000000/Byte-Pair-Encoding-A-Text-Compression-Scheme-That-Accelerates-Pattern-Matching.pdf)
- [A Formal Perspective on Byte-Pair Encoding](https://arxiv.org/pdf/2306.16837)
- [Controlling byte pair encoding for neural machine translation](https://ieeexplore.ieee.org/abstract/document/8300571)
- [Scaffold-BPE: Enhancing Byte Pair Encoding for Large Language Models with Simple and Effective Scaffold Token Removal](https://ojs.aaai.org/index.php/AAAI/article/view/34633)
- [Parity-Aware Byte-Pair Encoding: Improving Cross-lingual Fairness in Tokenization](https://arxiv.org/pdf/2508.04796)
- [Code Completion using Neural Attention and Byte Pair Encoding](https://arxiv.org/pdf/2004.06343)
- [Getting the most out of your tokenizer for pre-training and domain adaptation](https://arxiv.org/html/2402.01035v2)
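For readers new to the topic, a toy, repository-independent illustration of one BPE training round (count adjacent symbol pairs over a tiny word-frequency table, then merge the most frequent pair); the word table and symbol names are made up and it makes no assumptions about the NanoSocrates classes used elsewhere in this change:

```python
# Toy BPE round: count adjacent pairs, then merge the most frequent one.
from collections import Counter

def most_frequent_pair(words: dict[tuple[str, ...], int]) -> tuple[str, str]:
    # Count every adjacent symbol pair, weighted by word frequency.
    pairs = Counter()
    for symbols, freq in words.items():
        for a, b in zip(symbols, symbols[1:]):
            pairs[(a, b)] += freq
    return max(pairs, key=pairs.get)

def merge_pair(words: dict[tuple[str, ...], int], pair: tuple[str, str]):
    # Rewrite every word, fusing each occurrence of `pair` into one symbol.
    merged = {}
    for symbols, freq in words.items():
        out, i = [], 0
        while i < len(symbols):
            if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == pair:
                out.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        merged[tuple(out)] = freq
    return merged

# Tiny made-up corpus: word (as a symbol tuple) -> frequency.
words = {("l", "o", "w"): 5, ("l", "o", "v", "e"): 3, ("n", "e", "w"): 2}
pair = most_frequent_pair(words)   # ("l", "o"), seen 8 times
words = merge_pair(words, pair)    # "lo" is now a single vocabulary symbol
print(pair, words)
```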