# This module applies the pipeline's cleaning rules: it removes unwanted/invalid
# relationships and formats RDF triples with the special tokens.
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier():

    def __init__(self):
        pass

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Adds the RDF special tokens to each element of the triple,
        i.e. SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
        It only adds the special tokens for the three elements of the RDF triple, no other special tokens.

        Args:
            RDF (pd.DataFrame):

        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # if a filter run earlier sliced the RDF and created a view, copying here resolves the problem
        # for more context: SettingWithCopyWarning
        RDF = RDF.copy()
        # prepend the corresponding special token to SubjectURI, RelationshipURI and ObjectURI
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # treat empty strings as missing values
        RDF = RDF.replace('', np.nan)
        # drop rows where any of the key columns are NaN
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # this method assumes the data has already been iterated by MovieID,
        # because by design we want one row per movie at the end;
        # MovieID and Abstract could be given as input for a more generic method.
        # First, combine each row into a "Triple" column by joining the RDF elements
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # wrap each triple with its START/END special tokens
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # combine the rows of each movie into one;
        # MovieID and Abstract map 1 <-> 1, so grouping on both keeps one row per movie
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add the special tokens for: start of the triple list, end of sentence, and start of abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + SpecialToken.END_OF_SENTENCE.value
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Joins each RDF triple into a single element, together with the START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: RDF["Triple"] (just this column)
        """
        # combine each row into a "Triple" column by joining the RDF elements
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # wrap each triple with its START/END special tokens
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks
        return RDF
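

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original pipeline): shows
# how the cleaning steps above might be chained on a small, made-up RDF frame.
# The movie rows below are invented for demonstration; the column names follow
# the docstrings of the methods above.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    demo = pd.DataFrame({
        "MovieID": [1, 1, 1],
        "SubjectURI": ["Alien", "Alien", "Alien"],
        "RelationshipURI": ["director", "starring", "runtime"],
        "ObjectURI": ["Ridley Scott", "Sigourney Weaver", ""],
        "Abstract": ["Alien is a 1979 science fiction film."] * 3,
    })

    applier = PipelineApplier()
    demo = applier.drop_na_from_dataset(demo)   # drops the row with the empty ObjectURI
    demo = applier.regex_on_objects(demo)       # normalises newlines and asterisks in objects
    demo = applier.rdf_add_special_token(demo)  # prepends the SUBJ/REL/OBJ special tokens
    print(applier.rebuild_by_movie(demo))       # one row per movie with the joined triples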