Merge branch 'dev' into dev.embedder

This commit is contained in:
Christian Risi 2025-10-04 19:04:03 +02:00
commit 1eef25a697
5 changed files with 79 additions and 8 deletions

View File

@ -36,6 +36,42 @@ class TestTrainBPE:
for encoded, expected in zip(ENCODED, EXPECTED): for encoded, expected in zip(ENCODED, EXPECTED):
assert encoded == expected assert encoded == expected
def test_bpe_train_encoding_and_decoding(self):
    """Train a BPE vocabulary on a fixture file and check the encode/decode round trip.

    The fixture text is read from disk, a vocabulary is trained on that same
    file, and a ``TokeNanoCore`` built from the trained vocabulary must decode
    its own encoding back to the original text, character by character.
    """
    SPECIAL_LIST = ["<ABS>", "<SOTL>"]
    # First argument is presumably the maximum vocabulary size -- TODO confirm
    # against NanoSocraTrainerPool's signature.
    TRAINER = BPE.NanoSocraTrainerPool(
        int(32E3),
        SPECIAL_LIST
    )

    TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt")
    # The context manager closes the handle even if read() raises, unlike the
    # manual open()/close() pair it replaces.
    with open(TEXT_PATH) as FILE:
        TEXT = FILE.read()
    EXPECTED = TEXT

    # NOTE(review): the merge example below looks copied from the simple
    # encoding test; confirm it is still relevant for this fixture.
    # ab = 256
    # 256, 256 = 257
    # 257, 257 = 258

    BPE_ENCODER = TRAINER.trainBPE(
        TEXT_PATH,
        CACHE_DIR_PATH
    )
    VOCABULARY = BPE_ENCODER.vocabulary
    TOKENANO = BPE.TokeNanoCore(VOCABULARY, SPECIAL_LIST)

    ENCODED = TOKENANO.encode(TEXT)
    DECODED = TOKENANO.decode(ENCODED)

    # Compare lengths first: zip() stops at the shorter sequence, so a
    # truncated decode would otherwise pass the element-wise loop.
    assert len(DECODED) == len(EXPECTED)
    for decoded, expected in zip(DECODED, EXPECTED):
        assert decoded == expected
# Useful to debug weird cases # Useful to debug weird cases
if __name__ == "__main__": if __name__ == "__main__":
TestTrainBPE().test_bpe_train_encoding_simple() # TestTrainBPE().test_bpe_train_encoding_simple()
TestTrainBPE().test_bpe_train_encoding_and_decoding()

View File

@ -0,0 +1 @@
<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>

View File

@ -73,6 +73,10 @@ class PipelineApplier():
return RDF return RDF
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
    """Keep only the rows of ``self.MOVIE_FILTER`` in a positional window.

    The window is ``[starting_offset, ending_offset)``; the upper bound is
    clamped to the current number of rows. ``self.MOVIE_FILTER`` is replaced
    by an independent copy of the selected rows.
    """
    movie_filter = self.MOVIE_FILTER
    row_count = len(movie_filter)
    # Never slice past the last available row.
    stop = ending_offset if ending_offset < row_count else row_count
    window = movie_filter.iloc[starting_offset:stop]
    self.MOVIE_FILTER = window.copy()
def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame: def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
# dataset has SubjectURI RelationshipURI ObjectURI # dataset has SubjectURI RelationshipURI ObjectURI
# want to drop the '' in them # want to drop the '' in them

View File

@ -10,13 +10,19 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co
import pandas as pd import pandas as pd
class Pipeline(): class Pipeline():
def __init__(self): def __init__(self,
mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv",
bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt",
text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv",
completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv",
):
self.sql_endpoint = SqlEndpoint() self.sql_endpoint = SqlEndpoint()
# classes to manage tasks' datasets # classes to manage tasks' datasets
self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv") self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path)
self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt") self.task_bpe_corpus = BPE_corpus(bpe_corpus_path)
self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path)
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path)
# prepare the filter # prepare the filter
# the filter applier needs to know the frequency of Movies and Relationship among all the Dataset # the filter applier needs to know the frequency of Movies and Relationship among all the Dataset
@ -113,6 +119,9 @@ class Pipeline():
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
self.sql_endpoint.movie_ids = movie_list self.sql_endpoint.movie_ids = movie_list
def reduce_movie_list(self, starting_offset:int , ending_offset:int):
    """Forward the requested movie window to ``self.filter_applier``.

    Both offsets are passed through unchanged, in the same order, to the
    applier's own ``reduce_movie_list``; nothing is returned.
    """
    shrink = self.filter_applier.reduce_movie_list
    shrink(starting_offset, ending_offset)
# there are a lot of settings to manage # there are a lot of settings to manage
@ -121,11 +130,11 @@ class Pipeline():
# in the use_toy_dataset , to change the toy dataset # in the use_toy_dataset , to change the toy dataset
# in _get_cleaned_movie_rows: to change how the pipeline behave # in _get_cleaned_movie_rows: to change how the pipeline behave
pipeline = Pipeline() #pipeline = Pipeline()
# pipeline.use_toy_dataset() # pipeline.use_toy_dataset()
# pipeline.execute_task_bpe_corpus() # pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask() # pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text() # pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation() # pipeline.execute_task_rdf_completation()
pipeline.execute_all_task() # pipeline.execute_all_task()

View File

@ -0,0 +1,21 @@
import Project_Model.Libs.BPE as BPE
from pathlib import Path
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
# Relative path of the JSON vocabulary file used by this debugging script.
VOCABULARY_path = "Assets/Model/toy_10/toy_dictionary.json"
# Load the vocabulary through the BPE library helper.
VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path))
# Collect the value of every SpecialToken member, in iteration order.
SPECIAL_TOKEN_LIST = [token.value for token in SpecialToken]
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>dbp-dbp:title<OBJ>dbp-dbr:The_Dark_Knight<EOT>"
# INPUT = "<SOTL><SOT><SUBJ>dbp-dbr:How_It_Should_Have_Ended<PRED>"
# INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan, from a screenplay co-written with his brother Jonathan. Based on the DC Comics superhero Batman, it is the sequel to Batman Begins (2005), and the second installment in The Dark Knight trilogy. The plot follows the vigilante Batman, police lieutenant James Gordon, and district attorney Harvey Dent, who form an alliance to dismantle organized crime in Gotham City. Their efforts are derailed by the Joker, an anarchistic mastermind who seeks to test how far Batman will go to save the city from chaos. The ensemble cast includes Christian Bale, Michael Caine, Heath Ledger, Gary Oldman, Aaron Eckhart, Maggie Gyllenhaal, and Morgan Freeman.Warner Bros. Pictures prioritized a sequel following the successful reinvention of the Batman film series with Batman Begins. Christopher and Batman Begins co-writer David S. Goyer developed the story elements, making Dent the central protagonist caught up in the battle between Batman and the Joker. In writing the screenplay, the Nolans were influenced by 1980s Batman comics and crime drama films, and sought to continue Batman Begins' heightened sense of realism. From April to November 2007, filming took place with a $185 million budget in Chicago and Hong Kong, and on sets in England. The Dark Knight was the first major motion picture to be filmed with high-resolution IMAX cameras. Christopher avoided using computer-generated imagery unless necessary, insisting on practical stunts such as flipping an 18-wheel truck and blowing up a factory.The Dark Knight was marketed with an innovative interactive viral campaign that initially focused on countering criticism of Ledger's casting by those who believed he was a poor choice to portray the Joker. Ledger died from an accidental prescription drug overdose in January 2008, leading to widespread interest from the press and public regarding his performance. 
When it was released in July, The Dark Knight received acclaim for its mature tone and themes, visual style, and performances—particularly that of Ledger, who received many posthumous awards including Academy, BAFTA, and Golden Globe awards for Best Supporting Actor, making The Dark Knight the first comic-book film to receive major industry awards. It broke several box-office records and became the highest-grossing 2008 film, the fourth-highest-grossing film to that time, and the highest-grossing superhero film of the time.Since its release, The Dark Knight has been assessed as one of the greatest superhero films ever, one of the best movies of the 2000s, and one of the best films ever made. It is considered the \"blueprint\" for many modern superhero films, particularly for its rejection of a typical comic-book movie style in favor of a crime film that features comic-book characters. Many filmmakers sought to repeat its success by emulating its gritty, realistic tone to varying degrees of success. The Dark Knight has been analyzed for its themes of terrorism and the limitations of morality and ethics. The United States Library of Congress selected it for preservation in the National Film Registry in 2020. A sequel, The Dark Knight Rises, concluded The Dark Knight trilogy in 2012.<SOTL>"
# Active sample: a short abstract wrapped in the <ABS> ... <SOTL> markers.
INPUT = "<ABS>The Dark Knight is a 2008 superhero film directed by Christopher Nolan,<SOTL>"
# INPUT = "<ABS> Nolan,<SOTL>"
# 32: " "
# Build the tokenizer from the loaded vocabulary and the special-token strings.
TOKENANO = BPE.Classes.TokeNanoCore(VOCABULARY, SPECIAL_TOKEN_LIST)
# Print the raw input and its character count for comparison with the decode.
print(f"input: {INPUT} \ninput lenght: {len(INPUT)}")
# Encode the input and print the result together with its length.
encoded = TOKENANO.encode(INPUT)
print(f"encode: {encoded} \nencode lenght: {len(encoded)}")
# Decode the encoded result and print it with its length, so it can be
# compared by eye with the input printed above.
decoded = TOKENANO.decode(encoded)
print(f"decode: {decoded} \ndecode lenght: {len(decoded)}")