Added EOS token
This commit is contained in:
parent
a04f4c7cb7
commit
ee12f53f12
@ -57,8 +57,8 @@ class PipelineApplier():
|
|||||||
# MovieID and Abstract map one-to-one (1 <-> 1): each movie has exactly one abstract
|
# MovieID and Abstract map one-to-one (1 <-> 1): each movie has exactly one abstract
|
||||||
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
|
RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
|
||||||
# add special token for: start of triple, end of triple and start of abstract
|
# add special tokens for: start of triple list, end of triple, start of abstract, and end of sentence (EOS) after both the triples and the abstract
|
||||||
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
|
RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]+SpecialToken.END_OF_SENTENCE.value
|
||||||
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
|
RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
|
||||||
return RDF[["MovieID","Triple","Abstract"]]
|
return RDF[["MovieID","Triple","Abstract"]]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -53,7 +53,7 @@ class Pipeline():
|
|||||||
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
|
self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")
|
||||||
|
|
||||||
self._movie_filter.frequency_filter(50,3000)
|
self._movie_filter.frequency_filter(50,3000)
|
||||||
self._relationship_filter.frequency_filter(20, 2395627) # from 2718 to 3069
|
self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
|
||||||
self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
|
self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)
|
||||||
|
|
||||||
def other_filter(self):
|
def other_filter(self):
|
||||||
@ -131,7 +131,8 @@ class Pipeline():
|
|||||||
# Django Unchained : 138952
|
# Django Unchained : 138952
|
||||||
# Spirited Away : 144137
|
# Spirited Away : 144137
|
||||||
# Knives Out : 148025
|
# Knives Out : 148025
|
||||||
movie_list = [106465,106466,106467,106468,106469,106470,106471,106472,106473]#[117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
|
# [106465,106466,106467,106468,106469,106470,106471,106472,106473]
|
||||||
|
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
|
||||||
self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
|
self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})
|
||||||
|
|
||||||
def generate_csv_debug_file(self, debug_path:str):
|
def generate_csv_debug_file(self, debug_path:str):
|
||||||
@ -144,7 +145,7 @@ class Pipeline():
|
|||||||
|
|
||||||
|
|
||||||
pipe = Pipeline()
|
pipe = Pipeline()
|
||||||
# pipe.use_toy_dataset()
|
#pipe.use_toy_dataset()
|
||||||
pipe.other_filter()
|
pipe.other_filter()
|
||||||
pipe.execute_all_task()
|
pipe.execute_all_task()
|
||||||
# pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
# pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||||
@ -9,6 +9,7 @@ class SpecialToken(str, Enum):
|
|||||||
RELATIONSHIP = "<PRED>"
|
RELATIONSHIP = "<PRED>"
|
||||||
OBJECT = "<OBJ>"
|
OBJECT = "<OBJ>"
|
||||||
ABSTRACT = "<ABS>"
|
ABSTRACT = "<ABS>"
|
||||||
|
END_OF_SENTENCE = "<EOS>"
|
||||||
CORPUS_END = "<END>"
|
CORPUS_END = "<END>"
|
||||||
|
|
||||||
## Tasks' Token
|
## Tasks' Token
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user