NanoSocrates/Scripts/DataCleaning/filter.py

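# This module provides the filters the cleaning pipeline applies to RDF rows
# (dropping unwanted relationships and movies by different rules) and the
# helpers that decorate RDF triples with special tokens.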
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier:
    """Filters and special-token helpers applied to RDF rows by the cleaning pipeline.

    The RDF DataFrames handled here carry (a subset of) the columns
    "MovieID", "SubjectURI", "RelationshipURI", "ObjectURI" and "Abstract".

    Three filters are stored on the instance and must be generated before the
    corresponding filtering method is useful:

    * ``relationship_filter_list`` -> generate_list_relationship_filter()
    * ``MOVIE_FILTER``             -> generate_frequency_movie_filter()
    * ``REL_FILTER``               -> generate_frequency_relationship_filter()
    """

    def __init__(self):
        # Rows of the movie count table kept by generate_frequency_movie_filter().
        self.MOVIE_FILTER = pd.DataFrame()
        # Rows of the relationship count table kept by
        # generate_frequency_relationship_filter().
        self.REL_FILTER = pd.DataFrame()
        # Starts empty so that delete_relationship_by_list_filter() removes
        # nothing (instead of raising AttributeError) when no list was stored.
        self.relationship_filter_list: set[str] = set()

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _keep_count_in_range(COUNT: pd.DataFrame, min_treshold: int, max_treshold: int) -> pd.DataFrame:
        """Return the rows of COUNT whose "Count" lies in [min_treshold, max_treshold)."""
        counts = COUNT["Count"]
        return COUNT[(counts >= min_treshold) & (counts < max_treshold)]

    @staticmethod
    def _wrap_triple(joined: pd.Series) -> pd.Series:
        """Surround every joined triple with the START_TRIPLE / END_TRIPLE tokens."""
        return SpecialToken.START_TRIPLE.value + joined + SpecialToken.END_TRIPLE.value

    # ------------------------------------------------------------------
    # relationship filters
    # ------------------------------------------------------------------
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Return the rows of RDF whose "RelationshipURI" differs from ``uri``."""
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store the RelationshipURI values to remove, as a set."""
        self.relationship_filter_list: set[str] = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove the rows whose "RelationshipURI" is in the stored filter.

        Call generate_list_relationship_filter() first; with no stored filter
        the set is empty and every row is kept.
        """
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]

    # ------------------------------------------------------------------
    # frequency filters
    # ------------------------------------------------------------------
    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Build the movie filter used by filter_by_frequency_movie_id().

        You MUST call this before filtering the dataset by movie frequency,
        since this method creates that filter.

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int): lowest "Count" kept (inclusive).
            max_treshold (int): upper bound on "Count" (exclusive).
        """
        # the whole row is stored (not only "MovieID"), so "Count" stays available
        self.MOVIE_FILTER = self._keep_count_in_range(MOVIE_COUNT, min_treshold, max_treshold)

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Build the relationship filter used by filter_by_frequency_relationship().

        Args:
            REL_COUNT (pd.DataFrame): must contain "Count"; "RelationshipURI" is
                read later by filter_by_frequency_relationship().
            min_treshold (int): lowest "Count" kept (inclusive).
            max_treshold (int): upper bound on "Count" (exclusive).
        """
        self.REL_FILTER = self._keep_count_in_range(REL_COUNT, min_treshold, max_treshold)

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only the rows whose "MovieID" appears in MOVIE_FILTER."""
        return RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only the rows whose "RelationshipURI" appears in REL_FILTER."""
        return RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

    def reduce_movie_list(self, starting_offset: int, ending_offset: int):
        """Restrict MOVIE_FILTER to the positional window [starting_offset, ending_offset).

        An ending_offset beyond the end of the filter is clamped by the
        positional slice itself.
        """
        self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:ending_offset].copy()

    # ------------------------------------------------------------------
    # cleaning / special tokens
    # ------------------------------------------------------------------
    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """Prefix each element of the RDF triple with its special token.

        SUBJECT goes in front of "SubjectURI", OBJECT in front of "ObjectURI"
        and RELATIONSHIP in front of "RelationshipURI". No other special token
        is added. Check Scripts/Libs/CleaningPipeline/special_token.py for the
        up-to-date special tokens.

        Args:
            RDF (pd.DataFrame): must contain "SubjectURI", "RelationshipURI", "ObjectURI".
        Returns:
            pd.DataFrame: a copy of RDF with the three columns prefixed; the
            other columns are left as they are.
        """
        # A previous filter may have handed us a slice of a bigger frame;
        # working on a copy avoids SettingWithCopyWarning and leaves the
        # caller's data untouched.
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop the rows with an empty or missing element of the triple.

        Empty strings are first turned into NaN (in every column of the
        frame), then the rows where "SubjectURI", "RelationshipURI" or
        "ObjectURI" is NaN are removed.
        """
        without_empty = RDF.replace('', np.nan)
        return without_empty.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    # ------------------------------------------------------------------
    # triple building / grouping
    # ------------------------------------------------------------------
    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Join each row into a triple, then collapse the rows into one per movie.

        The input frame is not modified: the "Triple" column is built on a copy.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        joined = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # assign() returns a new frame, so the caller's RDF (possibly a slice
        # of a bigger frame) does not gain a "Triple" column as a side effect
        with_triple = RDF.assign(Triple=self._wrap_triple(joined))
        return self.group_by_movie_from_triple(with_triple)

    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Concatenate the triples of each movie into a single row.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"], where "Triple" starts
            with START_TRIPLE_LIST and "Abstract" starts with ABSTRACT.
        """
        # MovieID and Abstract are expected to be 1 <-> 1, so grouping on both
        # yields one row per movie while keeping the abstract as a column
        grouped = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
        grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"]
        return grouped[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """Join the RDF triple of each row into one string wrapped by START/END tokens.

        Side effect: the result is also written into RDF as the column "Triple".

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: RDF["Triple"] (just this column)
        """
        joined = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        RDF["Triple"] = PipelineApplier._wrap_triple(joined)
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """Join an incomplete RDF triple, putting the MASK token in place of the missing part.

        Helper for the third task: "Predicting a masked component within an RDF triple".
        Every column among "SubjectURI", "RelationshipURI", "ObjectURI" that is
        absent from RDF is replaced by the special token MASK; the joined string
        is then wrapped by the START/END triple tokens.

        Side effect: the result is also written into RDF as the column "Triple".

        Args:
            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: RDF["Triple"] (just this column, NOT A DATAFRAME)
        """
        # A full column of MASK tokens aligned on RDF's index: used as the
        # fallback of DataFrame.get() for whichever column is absent.
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)

        joined = (
            RDF.get("SubjectURI", MISSING)
            + RDF.get("RelationshipURI", MISSING)
            + RDF.get("ObjectURI", MISSING)
        )
        RDF["Triple"] = PipelineApplier._wrap_triple(joined)
        return RDF["Triple"]

    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        # currently not used
        """Placeholder helper for the third task (masked component prediction).

        Intended to receive the incomplete RDF and the missing component as
        two frames and apply the special tokens; it is not implemented yet.

        Args:
            RDF_incomplete (pd.DataFrame): the RDF without the masked component.
            MISSING (pd.DataFrame): the masked component only.

        Returns:
            None: always, despite the annotation, until the method is implemented.
        """
        return None
Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`# This file deletes in the pipeline the unwanted relationship by different rules`
			`import pandas as pd`
			`import sqlite3`
			`import numpy as np`

			`from Scripts.Libs.CleaningPipeline.special_token import SpecialToken`
			`from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint`


			`class PipelineApplier():`

			`def __init__(self):`

			`self.MOVIE_FILTER = pd.DataFrame()`
			`self.REL_FILTER = pd.DataFrame()`


			`def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:`
			`return RDF[RDF["RelationshipURI"]!= uri]`

			`def generate_list_relationship_filter(self, filter_list: list[str]) -> None:`
			`"""Store RelationshipURI filters as a set """`
			`self.relationship_filter_list: set[str] = set(filter_list)`

			`def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`"""Remove rows whose RelationshipURI is in the stored filter. Generate it first callig the generate_list_relationship_filter"""`
			`return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]`


			`def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):`
			`"""`
			`You MUST call this before filter the dataset by movie frequence [filter_by_frequence_movie_id()],`
			`since this method creates such filter`
			`Args:`
			`MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]`
			`min_treshold (int):`
			`max_treshold (int):`
			`"""`
			`MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_treshold]`
			`MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_treshold]`
			`self.MOVIE_FILTER = MOVIE_COUNT #["MovieID"]`

			`def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame ,min_treshold: int, max_treshold: int):`
			`REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_treshold]`
			`REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_treshold]`
			`self.REL_FILTER = REL_COUNT #["RelationshipURI"]`

			`def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]`
			`return RDF`

			`def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]`
			`return RDF`

			`def rdf_add_special_token(self, RDF: pd.DataFrame):`
			`"""`
			`Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.`
			`Check Scrits/Libs/CleaningPipeline/special_token.py for the up-to-date special token.`
			`It only adds the special token of the three element of the RDF, no other special token.`
			`Args:`
			`RDF (pd.DataFrame):`
			`Returns:`
			`pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]`
			`"""`
			`# if the filter runned before sliced the RDF and created a View, here the problem is resolved`
			`# for more context: SettingWithCopyWarning`
			`RDF = RDF.copy()`
			`# at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token`
			`RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]`
			`RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]`
			`RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]`
			`return RDF`


update cleaning pipeline with a new method to filter also by number of films, also updated the signature of the pipeline 2025-10-04 19:00:05 +02:00			`def reduce_movie_list(self, starting_offset:int , ending_offset:int):`
			`end = min(len(self.MOVIE_FILTER), ending_offset)`
			`self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()`

Added file to execute the complete cleaning pipeline 2025-09-29 15:21:26 +02:00			`def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`# dataset has SubjectURI RelationshipURI ObjectURI`
			`# want to drop the '' in them`
			`# Replace empty strings with NaN`
			`RDF = RDF.replace('', np.nan)`
			`# Drop rows where any of the key columns are NaN`
			`RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])`
			`return RDF`

			`def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`"""_summary_`

			`Args:`
			`RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]`

			`Returns:`
			`pd.DataFrame: ["MovieID","Triple","Abstract"]`
			`"""`
			`# to execute this method you have to have itereted by movie_id`
			`# because as design we want at the end one row for each movie`
			`# MovieID and abstract can be given as input for a more generic method`
			`# movie_id = RDF["MovieID"].iloc(0)`
			`# abstract = RDF["Abstract"].iloc(0)`
			`# first let's combine each row creating column triple as join of rdf`
			`RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]`
			`# special token`
			`RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value`
			`# combine rows into one`
			`# MovieID and Abstract are unique for each other 1 <-> 1`
			`RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()`
			`# add special token for: start of triple, end of triple and start of abstract`
			`RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]`
			`RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]`
			`return RDF[["MovieID","Triple","Abstract"]]`

			`def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`"""`
			`Args:`
			`RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]`

			`Returns:`
			`pd.DataFrame: ["MovieID","Triple","Abstract"]`
			`"""`
			`# combine rows into one`
			`# MovieID and Abstract are unique for each other 1 <-> 1`
			`RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()`
			`# add special token for: start of triple, end of triple and start of abstract`
			`RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]`
			`RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]`
			`return RDF[["MovieID","Triple","Abstract"]]`


			`@staticmethod`
			`def build_triple(RDF: pd.DataFrame):`
			`"""`
			`Obtains joined RDF triple in one element, togheter with START and END special token`
			`Args:`
			`RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]`
			`Returns:`
			`pd.DataFrame: RDF["Triple"] (just this column)`
			`"""`
			`# let's combine each row creating column triple as join of rdf`
			`RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]`
			`# special token`
			`RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value`
			`return RDF["Triple"]`

			`@staticmethod`
			`def build_incomplete_triple(RDF: pd.DataFrame):`
			`"""`
			`Method helper used for the third task: "Predicting a masked component within an RDF triple".`
			`Obtains joined RDF triple in one element, togheter with START and END special token.`
			`The MISSING element will be replaced by the special token <MASK>`
			`Args:`
			`RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]`
			`Returns:`
			`RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME)`
			`"""`
			`# let's create a new column "Triple" with the joined RDF`

			`# the following creates a column of MASK token of the lenght of the dataframe,`
			`# it is not needed since we expect to have a dataframe of just one column, but its more robust (AND SLOW)`
			`MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)`

			`RDF["Triple"] = (`
			`RDF.get("SubjectURI", MISSING) +`
			`RDF.get("RelationshipURI", MISSING) +`
			`RDF.get("ObjectURI", MISSING))`
			`# special token`
			`RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value`
			`return RDF["Triple"]`

			`@staticmethod`
			`def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:`
			`# currently not used`
			`"""`
			`Method helper used for the third task: "Predicting a masked component within an RDF triple".`
			`Given two Dataframe, the first containing the incompleted RDF and the other only the missing componment,`
			`this methods applies the special token`
			`Args:`
			`RDF (pd.DataFrame): _description_`

			`Returns:`
			`pd.DataFrame: _description_`
			`"""`
			`# take an example dataframe as ["SubjectURI",""]`
			`# as input two dataframe, one with 2 column`
			`return None`