From 64e355e80c7d6c84747448194098a7305eb9cf43 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Tue, 30 Sep 2025 15:00:07 +0200
Subject: [PATCH] Added regex to delete new lines and * from ObjectURI

---
 Scripts/DataCleaning/filter.py   | 16 ++++++++++++++++
 Scripts/DataCleaning/pipeline.py |  2 ++
 2 files changed, 18 insertions(+)

diff --git a/Scripts/DataCleaning/filter.py b/Scripts/DataCleaning/filter.py
index 50d6ead..c555e3d 100644
--- a/Scripts/DataCleaning/filter.py
+++ b/Scripts/DataCleaning/filter.py
@@ -182,3 +182,19 @@ class PipelineApplier():
         # as input two dataframe, one with 2 column
         return None
 
+    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
+        """Flatten multi-line ``ObjectURI`` values and strip asterisks.
+
+        Every run of line breaks (CRLF, CR or LF) becomes a single ", " and
+        every "*" is removed. The column is cast to pandas' ``string`` dtype
+        first. ``RDF`` is updated in place and is also returned.
+        """
+        RDF["ObjectURI"] = (
+            RDF["ObjectURI"]
+            .astype("string")
+            # Match CRLF as one unit so a run of Windows breaks yields one ", ".
+            .str.replace(r"(?:\r\n|\r|\n)+", ", ", regex=True)
+            # "*" is matched literally, so no regex is needed.
+            .str.replace("*", "", regex=False)
+        )
+        return RDF
diff --git a/Scripts/DataCleaning/pipeline.py b/Scripts/DataCleaning/pipeline.py
index eb5b2f7..48a0af3 100644
--- a/Scripts/DataCleaning/pipeline.py
+++ b/Scripts/DataCleaning/pipeline.py
@@ -92,6 +92,8 @@ class Pipeline():
 
             # other filter
             # RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
+            # regex on ObjectURI
+            RDF = self.filter_applier.regex_on_objects(RDF)
             if RDF.empty:
                 continue
             RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE