From 25f3a5d22148fe1ef0e0de2f3315dd49c97d206d Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Sat, 4 Oct 2025 18:58:04 +0200 Subject: [PATCH 1/3] Logic to test BPE --- Project_Model/Tests/bpe_trainer_test.py | 38 ++++++++++++++++++- .../Experiments/change_me/use_bpe_pipeline.py | 21 ++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 Scripts/Experiments/change_me/use_bpe_pipeline.py diff --git a/Project_Model/Tests/bpe_trainer_test.py b/Project_Model/Tests/bpe_trainer_test.py index 1f9f7fb..0ef36a9 100644 --- a/Project_Model/Tests/bpe_trainer_test.py +++ b/Project_Model/Tests/bpe_trainer_test.py @@ -36,6 +36,42 @@ class TestTrainBPE: for encoded, expected in zip(ENCODED, EXPECTED): assert encoded == expected + + def test_bpe_train_encoding_and_decoding(self): + + SPECIAL_LIST = ["", ""] + TRAINER = BPE.NanoSocraTrainerPool( + int(32E3), + SPECIAL_LIST + ) + + TEXT_PATH = Path("Project_Model/Tests/trainer_files/train_encode_decode.txt") + FILE = open(TEXT_PATH) + TEXT = FILE.read() + FILE.close() + + EXPECTED = TEXT + + # ab = 256 + # 256, 256 = 257 + # 257, 257 = 258 + + BPE_ENCODER = TRAINER.trainBPE( + TEXT_PATH, + CACHE_DIR_PATH + ) + VOCABULARY = BPE_ENCODER.vocabulary + TOKENANO = BPE.TokeNanoCore(VOCABULARY,SPECIAL_LIST) + + ENCODED = TOKENANO.encode(TEXT) + DECODED = TOKENANO.decode(ENCODED) + + assert len(DECODED) == len(EXPECTED) + + for decoded, expected in zip(DECODED, EXPECTED): + assert decoded == expected + # Useful to debug weird cases if __name__ == "__main__": - TestTrainBPE().test_bpe_train_encoding_simple() + # TestTrainBPE().test_bpe_train_encoding_simple() + TestTrainBPE().test_bpe_train_encoding_and_decoding() diff --git a/Scripts/Experiments/change_me/use_bpe_pipeline.py b/Scripts/Experiments/change_me/use_bpe_pipeline.py new file mode 100644 index 0000000..3b56bd3 --- /dev/null +++ b/Scripts/Experiments/change_me/use_bpe_pipeline.py @@ -0,0 +1,21 @@ +import Project_Model.Libs.BPE as BPE +from pathlib import Path +from Scripts.Libs.CleaningPipeline.special_token import SpecialToken + +VOCABULARY_path = "Assets/Model/toy_10/toy_dictionary.json" +VOCABULARY = BPE.load_nanos_vocabulary(Path(VOCABULARY_path)) + +SPECIAL_TOKEN_LIST = [token.value for token in SpecialToken] + +# INPUT = "dbp-dbr:How_It_Should_Have_Endeddbp-dbp:titledbp-dbr:The_Dark_Knight" +# INPUT = "dbp-dbr:How_It_Should_Have_Ended" +# INPUT = "The Dark Knight is a 2008 superhero film directed by Christopher Nolan, from a screenplay co-written with his brother Jonathan. Based on the DC Comics superhero Batman, it is the sequel to Batman Begins (2005), and the second installment in The Dark Knight trilogy. The plot follows the vigilante Batman, police lieutenant James Gordon, and district attorney Harvey Dent, who form an alliance to dismantle organized crime in Gotham City. Their efforts are derailed by the Joker, an anarchistic mastermind who seeks to test how far Batman will go to save the city from chaos. The ensemble cast includes Christian Bale, Michael Caine, Heath Ledger, Gary Oldman, Aaron Eckhart, Maggie Gyllenhaal, and Morgan Freeman.Warner Bros. Pictures prioritized a sequel following the successful reinvention of the Batman film series with Batman Begins. Christopher and Batman Begins co-writer David S. Goyer developed the story elements, making Dent the central protagonist caught up in the battle between Batman and the Joker. In writing the screenplay, the Nolans were influenced by 1980s Batman comics and crime drama films, and sought to continue Batman Begins' heightened sense of realism. From April to November 2007, filming took place with a $185 million budget in Chicago and Hong Kong, and on sets in England. The Dark Knight was the first major motion picture to be filmed with high-resolution IMAX cameras. Christopher avoided using computer-generated imagery unless necessary, insisting on practical stunts such as flipping an 18-wheel truck and blowing up a factory.The Dark Knight was marketed with an innovative interactive viral campaign that initially focused on countering criticism of Ledger's casting by those who believed he was a poor choice to portray the Joker. Ledger died from an accidental prescription drug overdose in January 2008, leading to widespread interest from the press and public regarding his performance. When it was released in July, The Dark Knight received acclaim for its mature tone and themes, visual style, and performances—particularly that of Ledger, who received many posthumous awards including Academy, BAFTA, and Golden Globe awards for Best Supporting Actor, making The Dark Knight the first comic-book film to receive major industry awards. It broke several box-office records and became the highest-grossing 2008 film, the fourth-highest-grossing film to that time, and the highest-grossing superhero film of the time.Since its release, The Dark Knight has been assessed as one of the greatest superhero films ever, one of the best movies of the 2000s, and one of the best films ever made. It is considered the \"blueprint\" for many modern superhero films, particularly for its rejection of a typical comic-book movie style in favor of a crime film that features comic-book characters. Many filmmakers sought to repeat its success by emulating its gritty, realistic tone to varying degrees of success. The Dark Knight has been analyzed for its themes of terrorism and the limitations of morality and ethics. The United States Library of Congress selected it for preservation in the National Film Registry in 2020. A sequel, The Dark Knight Rises, concluded The Dark Knight trilogy in 2012." +INPUT = "The Dark Knight is a 2008 superhero film directed by Christopher Nolan," +# INPUT = " Nolan," +# 32: " " +TOKENANO = BPE.Classes.TokeNanoCore(VOCABULARY, SPECIAL_TOKEN_LIST) +print(f"input: {INPUT} \ninput lenght: {len(INPUT)}") +encoded = TOKENANO.encode(INPUT) +print(f"encode: {encoded} \nencode lenght: {len(encoded)}") +decoded = TOKENANO.decode(encoded) +print(f"decode: {decoded} \ndecode lenght: {len(decoded)}") \ No newline at end of file From c2f9344c82122d0ff7def2ad117b629e64b1cb54 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Sat, 4 Oct 2025 18:58:20 +0200 Subject: [PATCH 2/3] little test file --- Project_Model/Tests/trainer_files/train_encode_decode.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 Project_Model/Tests/trainer_files/train_encode_decode.txt diff --git a/Project_Model/Tests/trainer_files/train_encode_decode.txt b/Project_Model/Tests/trainer_files/train_encode_decode.txt new file mode 100644 index 0000000..70e027c --- /dev/null +++ b/Project_Model/Tests/trainer_files/train_encode_decode.txt @@ -0,0 +1 @@ +The Dark Knight is a 2008 superhero film directed by Christopher Nolan, \ No newline at end of file From bbadd4c521fdc2ef4b867d12b597c4703af336d5 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Sat, 4 Oct 2025 19:00:05 +0200 Subject: [PATCH 3/3] update cleaning pipeline with a new method to filter also by number of films, also updated the signature of the pipeline --- Scripts/DataCleaning/filter.py | 4 ++++ Scripts/DataCleaning/pipeline.py | 23 ++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py index 50d6ead..317ea6b 100644 --- a/Scripts/DataCleaning/filter.py +++ b/Scripts/DataCleaning/filter.py @@ -73,6 +73,10 @@ class PipelineApplier(): return RDF + def reduce_movie_list(self, starting_offset:int , ending_offset:int): + end = min(len(self.MOVIE_FILTER), ending_offset) + self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy() + def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame: # dataset has SubjectURI RelationshipURI ObjectURI # want to drop the '' in them diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py index eb5b2f7..153f127 100644 --- a/Scripts/DataCleaning/pipeline.py +++ b/Scripts/DataCleaning/pipeline.py @@ -10,13 +10,19 @@ from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_co import pandas as pd class Pipeline(): - def __init__(self): + def __init__(self, + mask_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_mask.csv", + bpe_corpus_path:str = "./Assets/Dataset/Tmp/corpus.txt", + text_to_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_text.csv", + completation_rdf_task_dataset_path:str = "./Assets/Dataset/Tmp/rdf_completation.csv", + + ): self.sql_endpoint = SqlEndpoint() # classes to manage taskes' datasets - self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv") - self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt") - self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv") - self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv") + self.task_rdf_mask = RDF_mask_task_dataset(mask_task_dataset_path) + self.task_bpe_corpus = BPE_corpus(bpe_corpus_path) + self.task_rdf_text = RDF_text_task_dataset(text_to_rdf_task_dataset_path) + self.task_rdf_completation = RDF_completation_task_dataset(completation_rdf_task_dataset_path) # prepare the filter # the filter applier needs to know the frequence of Movies and Relationship among all the Dataset @@ -113,6 +119,9 @@ class Pipeline(): movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025] self.sql_endpoint.movie_ids = movie_list + def reduce_movie_list(self, starting_offset:int , ending_offset:int): + self.filter_applier.reduce_movie_list(starting_offset,ending_offset) + # there are a lot of settings to manage @@ -121,11 +130,11 @@ class Pipeline(): # in the use_toy_dataset , to change the toy dataset # in _get_cleaned_movie_rows: to change how the pipeline behave -pipeline = Pipeline() +#pipeline = Pipeline() # pipeline.use_toy_dataset() # pipeline.execute_task_bpe_corpus() # pipeline.execute_task_rdf_mask() # pipeline.execute_tasks_rdf_text() # pipeline.execute_task_rdf_completation() -pipeline.execute_all_task() \ No newline at end of file +# pipeline.execute_all_task() \ No newline at end of file