From a04f4c7cb704cd6ec57154061673b8998fad4ebb Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Tue, 7 Oct 2025 15:49:25 +0200 Subject: [PATCH] changes to shorten the dataset --- Scripts/DataCleaning/pipeline/pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Scripts/DataCleaning/pipeline/pipeline.py b/Scripts/DataCleaning/pipeline/pipeline.py index d523897..d350497 100644 --- a/Scripts/DataCleaning/pipeline/pipeline.py +++ b/Scripts/DataCleaning/pipeline/pipeline.py @@ -60,7 +60,8 @@ class Pipeline(): self._movie_filter.relation_filter("purl:dc/terms/subject",5,100) self._movie_filter.filter_by_director() self._movie_filter.filter_by_english_movies() - self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget + self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget + self._movie_filter.relation_filter("dbp-dbp:released",1,100) # to cut to 2000 :( def _get_cleaned_movie_rows(self): movie_ids = self._movie_filter.get_movie_id()