Added EOS token

2025-10-07 22:47:59 +02:00
parent a04f4c7cb7
commit ee12f53f12
3 changed files with 7 additions and 5 deletions
--- a/Scripts/DataCleaning/pipeline/cleaner.py
+++ b/Scripts/DataCleaning/pipeline/cleaner.py
@@ -57,8 +57,8 @@ class PipelineApplier():
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add special tokens for: start of triple, end of triple, start of abstract, and end of sentence (EOS) on both Triple and Abstract
-        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] 
+        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]+SpecialToken.END_OF_SENTENCE.value
-        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
+        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return RDF[["MovieID","Triple","Abstract"]]
--- a/Scripts/DataCleaning/pipeline/pipeline.py
+++ b/Scripts/DataCleaning/pipeline/pipeline.py
@@ -53,7 +53,7 @@ class Pipeline():
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
        self._movie_filter.frequency_filter(50,3000)
-        self._relationship_filter.frequency_filter(20, 2395627) # from 2718 to 3069 
+        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069 
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
    def other_filter(self):
@@ -131,7 +131,8 @@ class Pipeline():
        # Django Unchained  : 138952
        # Spirited Away     : 144137
        # Knives Out        : 148025
-        movie_list = [106465,106466,106467,106468,106469,106470,106471,106472,106473]#[117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
    def generate_csv_debug_file(self, debug_path:str):
@@ -144,7 +145,7 @@ class Pipeline():
 pipe = Pipeline()
-# pipe.use_toy_dataset()
+#pipe.use_toy_dataset()
 pipe.other_filter()
 pipe.execute_all_task()
 # pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
--- a/Scripts/Libs/CleaningPipeline/special_token.py
+++ b/Scripts/Libs/CleaningPipeline/special_token.py
@@ -9,6 +9,7 @@ class SpecialToken(str, Enum):
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    END_OF_SENTENCE = "<EOS>"
    CORPUS_END = "<END>"
    ## Tasks' Token