# Origin: patch to Scripts/DataCleaning/filter.py, where this is a method of
# class PipelineApplier (hence the unused ``self`` parameter).
# The same patch wires it into class Pipeline in
# Scripts/DataCleaning/pipeline.py:
#     # regex on ObjectURI
#     RDF = self.filter_applier.regex_on_objects(RDF)
# The call sits right before the ``if RDF.empty: continue`` check, and before
# ``rdf_add_special_token`` (which the pipeline notes must run after the
# frequency filter).
def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
    """Clean up the text stored in the ``ObjectURI`` column of *RDF*.

    Two normalizations are applied to every value:

    * each run of line-break characters (``\\n``, ``\\r\\n``, ``\\r``) is
      replaced by a single ``", "`` separator;
    * every asterisk is removed.

    Args:
        RDF: DataFrame holding an ``ObjectURI`` column.

    Returns:
        The same DataFrame object; its ``ObjectURI`` column is overwritten
        in place with the cleaned values, cast to the pandas ``string``
        dtype (missing values stay missing).

    Raises:
        KeyError: if *RDF* has no ``ObjectURI`` column.
    """
    objects = RDF["ObjectURI"].astype("string")

    # Match the whole run of CR/LF characters at once. The former pattern
    # ``\r?\n+`` matched each CRLF pair separately, so "\r\n\r\n" produced
    # ", , " while "\n\n" produced a single ", "; it also left lone "\r".
    objects = objects.str.replace(r"[\r\n]+", ", ", regex=True)

    # A literal replacement is enough to drop asterisks; no regex required.
    objects = objects.str.replace("*", "", regex=False)

    RDF["ObjectURI"] = objects
    return RDF