Added regex to replace newlines with ", " and delete asterisks (*) in ObjectURI

This commit is contained in:
GassiGiuseppe 2025-09-30 15:00:07 +02:00
parent 007f1e9554
commit 64e355e80c
2 changed files with 8 additions and 0 deletions

View File

@ -182,3 +182,9 @@ class PipelineApplier():
# as input two dataframe, one with 2 column
return None
def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
    """Normalise the text in the ``ObjectURI`` column.

    The column is cast to the pandas ``"string"`` dtype, every run of
    line-break characters is collapsed into a single ``", "`` separator,
    and all asterisks are removed.

    Args:
        RDF: DataFrame that contains an ``ObjectURI`` column.

    Returns:
        The same DataFrame object; the ``ObjectURI`` column is overwritten
        in place.
    """
    cleaned = RDF["ObjectURI"].astype("string")
    # Match any run of CR/LF characters as a whole: the previous pattern
    # (r"\r?\n+") matched each "\r\n" pair separately, so consecutive
    # Windows line breaks produced ", , " and lone "\r" survived.
    cleaned = cleaned.str.replace(r"[\r\n]+", ", ", regex=True)
    # Literal replacement: no regex needed to drop asterisks.
    cleaned = cleaned.str.replace("*", "", regex=False)
    RDF["ObjectURI"] = cleaned
    return RDF

View File

@ -92,6 +92,8 @@ class Pipeline():
# other filter
#
RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
# regex on ObjectURI
RDF = self.filter_applier.regex_on_objects(RDF)
if RDF.empty:
continue
RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE