2025-09-29 15:21:26 +02:00
|
|
|
import re
|
|
|
|
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
2025-10-05 14:54:32 +02:00
|
|
|
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
2025-09-29 15:21:26 +02:00
|
|
|
# tasks dataset builder
|
|
|
|
|
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
|
|
|
|
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
|
|
|
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
|
|
|
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
2025-10-04 21:33:09 +02:00
|
|
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
2025-09-29 15:21:26 +02:00
|
|
|
|
|
|
|
|
import pandas as pd
|
|
|
|
|
|
|
|
|
|
class Pipeline():
    """Drive the data-cleaning pipeline and write the per-task datasets.

    The pipeline reads per-movie RDF frames from ``SqlEndpoint``, runs them
    through the filters configured on a ``PipelineApplier`` and hands the
    result to one or more task-specific writers.

    Settings live in three places:
      * ``__init__``: output file paths, frequency filter limits and the
        banned relationship URIs;
      * ``use_toy_dataset``: the movie ids of the toy dataset;
      * ``_get_cleaned_movie_rows``: the filters applied and their order.
    """

    def __init__(self):
        """Create the SQL endpoint, the task writers and the filters."""
        self.sql_endpoint = SqlEndpoint()

        # Classes that manage the tasks' datasets.
        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # Prepare the filters.
        # The filter applier needs to know the frequency of movies and
        # relationships across the whole dataset.
        self.filter_applier = PipelineApplier()
        movie_count = self.sql_endpoint.get_movies_id_count()
        relationship_count = self.sql_endpoint.get_relationship_count()
        self.filter_applier.generate_frequency_movie_filter(movie_count, 50, 3000)
        # Original author's note on the next call: "from 2718 to 3069"
        # (NOTE(review): meaning of the note is not visible here - confirm).
        self.filter_applier.generate_frequency_relationship_filter(relationship_count, 50, 2395627)

        # Relationship URIs to delete from the dataset.
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate", "w3:2000/01/rdf-schema#label", "dbp-dbo:abstract",
            "dbp-dbo:wikiPageID", "dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs", "dbp-dbp:image", "dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
            "dbp-dbp:id", "dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
            "dbp-dbo:soundRecording",
        ]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

    def execute_task_bpe_corpus(self):
        """Write the ``Triple``/``Abstract`` columns of every cleaned movie to the BPE corpus.

        All task writers are closed when the method exits, even on error.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self.filter_applier.rebuild_by_movie(rdf)
                rdf = rdf[["Triple", "Abstract"]]
                self.task_bpe_corpus.write_from_df(rdf)
        finally:
            # Close the writers even when a filter or a write fails, so the
            # output files are not left open.
            self._end_file_handler()

    def execute_task_rdf_mask(self):
        """Write every cleaned movie frame to the RDF mask task dataset.

        All task writers are closed when the method exits, even on error.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(rdf)
        finally:
            self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write every cleaned movie, rebuilt by movie, to the RDF text task dataset.

        All task writers are closed when the method exits, even on error.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self.filter_applier.rebuild_by_movie(rdf)
                self.task_rdf_text.write(rdf)
        finally:
            self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write the ``MovieID``/``Triple`` columns of every cleaned movie to the completion dataset.

        All task writers are closed when the method exits, even on error.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf["Triple"] = self.filter_applier.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])
        finally:
            self._end_file_handler()

    def execute_all_task(self):
        """Feed all four task writers from a single pass over the cleaned movies.

        All task writers are closed when the method exits, even on error.
        """
        try:
            for rdf in self._get_cleaned_movie_rows():
                # The mask task receives the frame before the "Triple"
                # column is added below.
                self.task_rdf_mask.write(rdf)

                rdf["Triple"] = self.filter_applier.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])

                rdf = self.filter_applier.group_by_movie_from_triple(rdf[["MovieID", "Triple", "Abstract"]])

                self.task_rdf_text.write(rdf)
                self.task_bpe_corpus.write_from_df(rdf[["Triple", "Abstract"]])
        finally:
            self._end_file_handler()

    def _end_file_handler(self):
        """Close the four task writers created in ``__init__``."""
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def _get_cleaned_movie_rows(self):
        """Yield the filtered frames produced by the SQL endpoint.

        Each frame returned by
        ``sql_endpoint.get_abbreviated_dataset_by_movie_id()`` goes through
        the filters below, in this order. Frames that end up empty are
        skipped. Frames are presumably pandas DataFrames (``.empty`` is
        read) - confirm against ``SqlEndpoint``.
        """
        for rdf in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            rdf = self.filter_applier.drop_na_from_dataset(rdf)
            rdf = self.filter_applier.filter_by_frequency_movie_id(rdf)
            rdf = self.filter_applier.filter_by_frequency_relationship(rdf)
            # Other filters.
            rdf = self.filter_applier.delete_relationship_by_list_filter(rdf)
            # Regex on ObjectURI.
            rdf = self.filter_applier.regex_on_objects(rdf)
            if rdf.empty:
                continue
            # WARNING: this must be done after the frequency filters.
            rdf = self.filter_applier.rdf_add_special_token(rdf)
            yield rdf

    def use_toy_dataset(self):
        """Restrict the SQL endpoint to a fixed list of ten movie ids."""
        # Chosen movies:
        #   The Dark Knight  : 117248
        #   Inception        : 147074
        #   The Avengers     : 113621
        #   Cast Away        : 1123
        #   The Departed     : 117586
        #   American Psycho  : 90177
        #   Avatar           : 71587
        #   Django Unchained : 138952
        #   Spirited Away    : 144137
        #   Knives Out       : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

    def generate_csv_debug_file(self, debug_path:str):
        """Write every cleaned movie frame to a debug CSV at ``debug_path``.

        The debug writer is closed when the method exits, even on error.
        The task writers created in ``__init__`` are not closed here.
        """
        debug_csv = Debug_csv(debug_path)
        try:
            for rdf in self._get_cleaned_movie_rows():
                debug_csv.write(rdf)
        finally:
            debug_csv.close()
|
2025-09-29 15:21:26 +02:00
|
|
|
|
|
|
|
|
|
2025-09-29 16:03:49 +02:00
|
|
|
# there are a lot of settings to manage
|
|
|
|
|
# you only need to change settings:
|
|
|
|
|
# in the init for file paths, frequency filter limits, banned relationshipURI
|
|
|
|
|
# in use_toy_dataset, to change the toy dataset
|
|
|
|
|
# in _get_cleaned_movie_rows: to change how the pipeline behaves
|
2025-09-29 15:21:26 +02:00
|
|
|
|
2025-09-29 16:03:49 +02:00
|
|
|
# Build the pipeline; this opens the task output files and prepares the filters.
pipeline = Pipeline()

# Restrict the run to the toy dataset.
pipeline.use_toy_dataset()

# Available task runs (uncomment the one you need):
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
# pipeline.execute_all_task()

# Dump the cleaned rows to a CSV for inspection.
pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|