# This file deletes in the pipeline the unwanted relationship by different rules
import pandas as pd
import sqlite3
import numpy as np
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier:
    """Cleaning-pipeline step that removes unwanted RDF rows and rebuilds
    one training row per movie.

    Typical usage: call a ``generate_*`` method to build a filter, then the
    matching ``filter_*`` / ``delete_*`` method to apply it, and finally one
    of the rebuild/grouping helpers to collapse triples per movie.
    """

    def __init__(self):
        # Frequency filters: populated by generate_frequency_movie_filter /
        # generate_frequency_relationship_filter before the corresponding
        # filter_by_frequency_* method is called.
        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()
        # Initialized empty so delete_relationship_by_list_filter is a no-op
        # instead of raising AttributeError when no filter was generated.
        self.relationship_filter_list: set[str] = set()

    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Drop every row whose RelationshipURI equals ``uri``."""
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store RelationshipURI filters as a set."""
        self.relationship_filter_list = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove rows whose RelationshipURI is in the stored filter.

        Generate the filter first by calling generate_list_relationship_filter.
        """
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Build the movie-frequency filter.

        You MUST call this before filtering the dataset by movie frequency
        [filter_by_frequency_movie_id()], since this method creates that
        filter. Rows are kept when Count is in [min_treshold, max_treshold).

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int): inclusive lower bound on Count
            max_treshold (int): exclusive upper bound on Count
        """
        in_range = (MOVIE_COUNT["Count"] >= min_treshold) & (MOVIE_COUNT["Count"] < max_treshold)
        self.MOVIE_FILTER = MOVIE_COUNT[in_range]  # ["MovieID"]

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Build the relationship-frequency filter.

        Same contract as generate_frequency_movie_filter, but for
        filter_by_frequency_relationship(); keeps Count in
        [min_treshold, max_treshold).

        Args:
            REL_COUNT (pd.DataFrame): ["RelationshipURI","Count"]
            min_treshold (int): inclusive lower bound on Count
            max_treshold (int): exclusive upper bound on Count
        """
        in_range = (REL_COUNT["Count"] >= min_treshold) & (REL_COUNT["Count"] < max_treshold)
        self.REL_FILTER = REL_COUNT[in_range]  # ["RelationshipURI"]

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose MovieID survived generate_frequency_movie_filter."""
        return RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose RelationshipURI survived generate_frequency_relationship_filter."""
        return RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """Add the RDF special token to each element of the tuple.

        i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the
        up-to-date special tokens. It only adds the special token of the
        three elements of the RDF, no other special token.

        Args:
            RDF (pd.DataFrame):

        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # If a filter that ran before sliced the RDF and created a view, the
        # copy resolves it (for more context: SettingWithCopyWarning).
        RDF = RDF.copy()
        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def reduce_movie_list(self, starting_offset: int, ending_offset: int):
        """Keep only the MOVIE_FILTER slice [starting_offset, ending_offset),
        clamping the end to the filter's current length."""
        end = min(len(self.MOVIE_FILTER), ending_offset)
        self.MOVIE_FILTER = self.MOVIE_FILTER.iloc[starting_offset:end].copy()

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows where SubjectURI, RelationshipURI or ObjectURI is empty or NaN."""
        # Empty strings count as missing: replace them with NaN first.
        RDF = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Collapse the given triples into one row per movie.

        To execute this method you have to have iterated by movie_id, because
        as designed we want at the end one row for each movie. MovieID and
        Abstract are unique for each other (1 <-> 1).

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # Work on a copy so the caller's frame is untouched and no
        # SettingWithCopyWarning fires when RDF is a view from an earlier filter.
        RDF = RDF.copy()
        # build_triple adds the tokenized "Triple" column to RDF in place.
        self.build_triple(RDF)
        # Grouping and list/abstract tokens are shared with the standalone helper.
        return self.group_by_movie_from_triple(RDF)

    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Join all "Triple" strings of each movie into a single row.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # combine rows into one; MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add special tokens for: start of triple list and start of abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """Obtain the joined RDF triple in one element, together with the
        START and END special tokens.

        Note: the "Triple" column is added to ``RDF`` in place.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: RDF["Triple"] (just this column)
        """
        # combine each row creating column triple as join of rdf
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """Helper for the third task: "Predicting a masked component within an RDF triple".

        Obtains the joined RDF triple in one element, together with the START
        and END special tokens. The MISSING element is replaced by the MASK
        special token. The "Triple" column is added to ``RDF`` in place.

        Args:
            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME)
        """
        # Column of MASK tokens of the length of the dataframe, aligned with
        # its index; not strictly needed when exactly one column is missing,
        # but more robust (AND SLOW).
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)
        RDF["Triple"] = (
            RDF.get("SubjectURI", MISSING)
            + RDF.get("RelationshipURI", MISSING)
            + RDF.get("ObjectURI", MISSING))
        # special token
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        # currently not used
        """Helper for the third task: "Predicting a masked component within an RDF triple".

        Given two DataFrames, the first containing the incomplete RDF and the
        other only the missing component, this method applies the special
        token.

        NOTE(review): stub — not implemented yet, always returns None.

        Args:
            RDF_incomplete (pd.DataFrame): incomplete RDF (e.g. 2 of the triple columns)
            MISSING (pd.DataFrame): the missing component

        Returns:
            pd.DataFrame: _description_
        """
        return None