NanoSocrates/Scripts/DataCleaning/legacy/pipeline.py

import re
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
from Scripts.DataCleaning.legacy.filter import PipelineApplier
# tasks dataset builder
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv

import pandas as pd

class Pipeline():
    """Legacy cleaning pipeline that feeds the task-specific dataset writers.

    Chunks of rows are pulled from the SQL endpoint, passed through the
    filters configured in ``__init__`` and handed to one or more output
    writers (RDF mask, BPE corpus, RDF/text, RDF completion).

    Every ``execute_*`` method closes all four task writers when it ends,
    whether it succeeded or raised.
    NOTE(review): presumably a closed writer cannot be written to again, so
    one instance should run a single ``execute_*`` call — confirm against the
    writer classes.
    """

    def __init__(self):
        """Create the SQL endpoint, the four task writers and the row filters."""
        self.sql_endpoint = SqlEndpoint()

        # One writer per task dataset, each bound to its output path.
        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # The frequency filters are generated from the movie and relationship
        # counts computed by the SQL endpoint over the dataset.
        self.filter_applier = PipelineApplier()
        movie_count = self.sql_endpoint.get_movies_id_count()
        rel_count = self.sql_endpoint.get_relationship_count()
        # NOTE(review): the two numbers look like (min, max) frequency bounds —
        # confirm against PipelineApplier.
        self.filter_applier.generate_frequency_movie_filter(movie_count, 50, 3000)
        # Original author's note on the next call: "from 2718 to 3069".
        self.filter_applier.generate_frequency_relationship_filter(rel_count, 50, 2395627)

        # Relationship URIs that are removed from the dataset altogether.
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate", "w3:2000/01/rdf-schema#label", "dbp-dbo:abstract",
            "dbp-dbo:wikiPageID", "dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs", "dbp-dbp:image", "dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
            "dbp-dbp:id", "dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
            "dbp-dbo:soundRecording",
        ]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

    def execute_task_bpe_corpus(self):
        """Write the ``Triple`` and ``Abstract`` columns of every rebuilt chunk to the BPE corpus."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self.filter_applier.rebuild_by_movie(rdf)
                self.task_bpe_corpus.write_from_df(rdf[["Triple", "Abstract"]])
        finally:
            # Close the writers even when a chunk fails, so no handle leaks.
            self._end_file_handler()

    def execute_task_rdf_mask(self):
        """Write every cleaned chunk, as is, to the RDF mask dataset."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(rdf)
        finally:
            self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Rebuild every cleaned chunk by movie and write it to the RDF/text dataset."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf = self.filter_applier.rebuild_by_movie(rdf)
                self.task_rdf_text.write(rdf)
        finally:
            self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Add a ``Triple`` column to every chunk and write ``MovieID``/``Triple`` to the completion dataset."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                rdf["Triple"] = self.filter_applier.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])
        finally:
            self._end_file_handler()

    def execute_all_task(self):
        """Feed all four task writers in a single pass over the cleaned chunks."""
        try:
            for rdf in self._get_cleaned_movie_rows():
                # The mask task receives the chunk before the Triple column is added.
                self.task_rdf_mask.write(rdf)

                rdf["Triple"] = self.filter_applier.build_triple(rdf)
                self.task_rdf_completation.write(rdf[["MovieID", "Triple"]])

                # The text and corpus tasks both consume the per-movie grouping.
                grouped = self.filter_applier.group_by_movie_from_triple(
                    rdf[["MovieID", "Triple", "Abstract"]]
                )
                self.task_rdf_text.write(grouped)
                self.task_bpe_corpus.write_from_df(grouped[["Triple", "Abstract"]])
        finally:
            self._end_file_handler()

    def _end_file_handler(self):
        """Close the four task writers."""
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def _get_cleaned_movie_rows(self):
        """Yield each chunk from the SQL endpoint after filtering.

        Chunks left empty by the filters are skipped. The order of the
        filtering steps is significant; see the comment on the last step.
        """
        for rdf in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            rdf = self.filter_applier.drop_na_from_dataset(rdf)
            rdf = self.filter_applier.filter_by_frequency_movie_id(rdf)
            rdf = self.filter_applier.filter_by_frequency_relationship(rdf)
            # Drop the banned relationship URIs.
            rdf = self.filter_applier.delete_relationship_by_list_filter(rdf)
            # Regex clean-up of the object values.
            rdf = self.filter_applier.regex_on_objects(rdf)
            if rdf.empty:
                continue
            # WARNING: special tokens must be added only AFTER the frequency filters.
            rdf = self.filter_applier.rdf_add_special_token(rdf)
            yield rdf

    def use_toy_dataset(self):
        """Restrict the SQL endpoint to a fixed list of ten movie ids."""
        # Chosen movies:
        # The Dark Knight   : 117248
        # Inception         : 147074
        # The Avengers      : 113621
        # Cast Away         : 1123
        # The Departed      : 117586
        # American Psycho   : 90177
        # Avatar            : 71587
        # Django Unchained  : 138952
        # Spirited Away     : 144137
        # Knives Out        : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

    def generate_csv_debug_file(self, debug_path: str):
        """Write every cleaned chunk to a debug CSV at ``debug_path``.

        Only the debug writer is closed here; the four task writers created
        in ``__init__`` are left untouched.
        """
        debug_csv = Debug_csv(debug_path)
        try:
            for rdf in self._get_cleaned_movie_rows():
                debug_csv.write(rdf)
        finally:
            # Close the debug writer even when a chunk fails.
            debug_csv.close()


# There are a lot of settings to manage, but you only need to change them in three places:
# - in __init__: file paths, frequency filter limits, banned relationship URIs
# - in use_toy_dataset: the movies that make up the toy dataset
# - in _get_cleaned_movie_rows: how the pipeline behaves

# Script entry: build the pipeline, restrict it to the toy dataset and dump
# the cleaned chunks to a debug CSV.
DEBUG_CSV_PATH = "Assets/Dataset/Tmp/debug.csv"

pipeline = Pipeline()
pipeline.use_toy_dataset()

# Alternative runs; enable the one you need instead of the debug dump:
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
# pipeline.execute_all_task()
pipeline.generate_csv_debug_file(DEBUG_CSV_PATH)
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`import re`
			`from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint`
Created legacy folder for old pipeline this pipeline still works but is slower then the new, some ot its method can be used later 2025-10-05 14:54:32 +02:00			`from Scripts.DataCleaning.legacy.filter import PipelineApplier`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`# tasks dataset builder`
			`from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset`
			`from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus`
			`from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset`
			`from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset`
new utility to generate a csv debug file of the output of the pipeline 2025-10-04 21:33:09 +02:00			`from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00
			`import pandas as pd`

			`class Pipeline():`
Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`def __init__(self):`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`self.sql_endpoint = SqlEndpoint()`
			`# classes to manage taskes' datasets`
Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")`
			`self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")`
			`self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")`

			`# prepare the filter`
Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`# the filter applier needs to know the frequence of Movies and Relationship among all the Dataset`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`self.filter_applier = PipelineApplier()`
			`MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()`
			`REL_COUNT = self.sql_endpoint.get_relationship_count()`
			`self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)`
Created legacy folder for old pipeline this pipeline still works but is slower then the new, some ot its method can be used later 2025-10-05 14:54:32 +02:00			`self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069`
Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`# prepare the filter on the relationshipURI you want to delete:`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`relationship_uri_banned_list = [`
			`"dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",`
			`"dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",`
			`"w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",`
Created legacy folder for old pipeline this pipeline still works but is slower then the new, some ot its method can be used later 2025-10-05 14:54:32 +02:00			`"dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",`
			`"dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",`
			`"dbp-dbo:soundRecording"`
			`]`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)`


			`def execute_task_bpe_corpus(self):`
			`for RDF in self._get_cleaned_movie_rows():`
			`RDF = self.filter_applier.rebuild_by_movie(RDF)`
			`RDF = RDF[["Triple","Abstract"]]`
			`self.task_bpe_corpus.write_from_df(RDF)`
			`self._end_file_handler()`


			`def execute_task_rdf_mask(self):`
			`for RDF in self._get_cleaned_movie_rows():`
			`self.task_rdf_mask.write(RDF)`
			`self._end_file_handler()`

Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`def execute_tasks_rdf_text(self):`
			`for RDF in self._get_cleaned_movie_rows():`
			`RDF = self.filter_applier.rebuild_by_movie(RDF)`
			`self.task_rdf_text.write(RDF)`
			`self._end_file_handler()`

Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`def execute_task_rdf_completation(self):`
			`for RDF in self._get_cleaned_movie_rows():`
			`RDF["Triple"] = self.filter_applier.build_triple(RDF)`
			`self.task_rdf_completation.write(RDF[["MovieID","Triple"]])`
			`self._end_file_handler()`


			`def execute_all_task(self):`
			`for RDF in self._get_cleaned_movie_rows():`
			`self.task_rdf_mask.write(RDF)`

			`RDF["Triple"] = self.filter_applier.build_triple(RDF)`
			`self.task_rdf_completation.write(RDF[["MovieID","Triple"]])`

			`RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])`

			`self.task_rdf_text.write(RDF)`
			`self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])`

			`self._end_file_handler()`


Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`def _end_file_handler(self):`
			`self.task_bpe_corpus.close()`
			`self.task_rdf_mask.close()`
			`self.task_rdf_text.close()`
			`self.task_rdf_completation.close()`


			`def _get_cleaned_movie_rows(self):`
			`for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():`
			`RDF = self.filter_applier.drop_na_from_dataset(RDF)`
			`RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)`
			`RDF = self.filter_applier.filter_by_frequency_relationship(RDF)`
			`# other filter`
			`#`
			`RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)`
Added regex to delete new lines and * from ObjectURI 2025-09-30 15:00:07 +02:00			`# regex on ObjectURI`
			`RDF = self.filter_applier.regex_on_objects(RDF)`
Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`if RDF.empty:`
			`continue`
			`RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE`
			`yield RDF`


			`def use_toy_dataset(self):`
			`# CHOOSEN MOVIE:`
			`# The Dark Knight : 117248`
			`# Inception : 147074`
			`# The Avengers : 113621`
			`# Cast Away : 1123`
			`# The Departed : 117586`
			`# American Psycho : 90177`
			`# Avatar : 71587`
			`# Django Unchained : 138952`
			`# Spirited Away : 144137`
			`# Knives Out : 148025`
			`movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]`
			`self.sql_endpoint.movie_ids = movie_list`

new utility to generate a csv debug file of the output of the pipeline 2025-10-04 21:33:09 +02:00			`def generate_csv_debug_file(self, debug_path:str):`
			`debug_csv = Debug_csv(debug_path)`

			`for RDF in self._get_cleaned_movie_rows():`
			`debug_csv.write(RDF)`

			`debug_csv.close()`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00

Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`# there are a lot of settings to manage`
			`# you only need to change settings:`
			`# in the init for file paths, frequency filter limit, banned reletionshipURI`
			`# in the use_toy_dataset , to change the toy dataset`
			`# in _get_cleaned_movie_rows: to change how the pipeline behave`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00
Added Toy Dataset entry point into the Pipeline class Before it was forced into the sql_endpoint, now all the pipeline can be managed in the Pipeline class 2025-09-29 16:03:49 +02:00			`pipeline = Pipeline()`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00
new utility to generate a csv debug file of the output of the pipeline 2025-10-04 21:33:09 +02:00			`pipeline.use_toy_dataset()`
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`# pipeline.execute_task_bpe_corpus()`
			`# pipeline.execute_task_rdf_mask()`
			`# pipeline.execute_tasks_rdf_text()`
			`# pipeline.execute_task_rdf_completation()`
new utility to generate a csv debug file of the output of the pipeline 2025-10-04 21:33:09 +02:00			`# pipeline.execute_all_task()`
			`pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")`