# Pipeline step that removes unwanted relationships from the dataset according to different rules
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint

class PipelineApplier:

    def __init__(self):
        # filled by generate_frequency_movie_filter() and
        # generate_frequency_relationship_filter() respectively
        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()

    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Remove every row whose RelationshipURI equals the given URI."""
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store the RelationshipURI filters as a set."""
        self.relationship_filter_list: set[str] = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove rows whose RelationshipURI is in the stored filter.
        Generate the filter first by calling generate_list_relationship_filter()."""
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]

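    # Illustrative sketch (not part of the pipeline): how the two list-filter
    # methods above are meant to be chained. The URI below is a made-up placeholder.
    #
    #   applier = PipelineApplier()
    #   applier.generate_list_relationship_filter(["http://example.org/prop/wikiPageWikiLink"])
    #   RDF = applier.delete_relationship_by_list_filter(RDF)
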
    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency
        [filter_by_frequency_movie_id()], since this method creates that filter.

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID", "Count"]
            min_threshold (int): keep movies with Count >= min_threshold
            max_threshold (int): keep movies with Count < max_threshold
        """
        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] >= min_threshold]
        MOVIE_COUNT = MOVIE_COUNT[MOVIE_COUNT["Count"] < max_threshold]
        self.MOVIE_FILTER = MOVIE_COUNT  # only the "MovieID" column is read downstream

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filtering the dataset by relationship frequency
        [filter_by_frequency_relationship()], since this method creates that filter.

        Args:
            REL_COUNT (pd.DataFrame): ["RelationshipURI", "Count"]
            min_threshold (int): keep relationships with Count >= min_threshold
            max_threshold (int): keep relationships with Count < max_threshold
        """
        REL_COUNT = REL_COUNT[REL_COUNT["Count"] >= min_threshold]
        REL_COUNT = REL_COUNT[REL_COUNT["Count"] < max_threshold]
        self.REL_FILTER = REL_COUNT  # only the "RelationshipURI" column is read downstream

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose MovieID survived generate_frequency_movie_filter()."""
        RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]
        return RDF

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose RelationshipURI survived generate_frequency_relationship_filter()."""
        RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]
        return RDF

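    # Illustrative sketch (assumed column layouts, made-up thresholds): the frequency
    # filters are two-step by design, counts are generated first and applied afterwards.
    #
    #   MOVIE_COUNT columns: ["MovieID", "Count"]
    #   applier.generate_frequency_movie_filter(MOVIE_COUNT, min_threshold=5, max_threshold=500)
    #   RDF = applier.filter_by_frequency_movie_id(RDF)
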
    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Adds the RDF special token to each element of the tuple, i.e. SUBJ to SubjectURI,
        OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
        It only adds the special tokens of the three elements of the RDF, no other special token.

        Args:
            RDF (pd.DataFrame): ["MovieID", "SubjectURI", "RelationshipURI", "ObjectURI", "Abstract"]

        Returns:
            pd.DataFrame: ["MovieID", "SubjectURI", "RelationshipURI", "ObjectURI", "Abstract"]
        """
        # If a filter that ran earlier sliced the RDF and created a view, copying
        # resolves the problem here. For more context, see pandas' SettingWithCopyWarning.
        RDF = RDF.copy()
        # prepend their special token to SubjectURI, RelationshipURI and ObjectURI
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

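    # Example of the expected effect. The literal token strings depend on
    # special_token.py, so "<SUBJ>" here is only a placeholder:
    #
    #   before: RDF["SubjectURI"] == "Inception"
    #   after:  RDF["SubjectURI"] == "<SUBJ>Inception"
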
    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows where SubjectURI, RelationshipURI or ObjectURI is empty or NaN."""
        # replace empty strings with NaN so dropna can catch them
        RDF = RDF.replace('', np.nan)
        # drop rows where any of the key columns are NaN
        RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])
        return RDF

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Collapse the RDF rows of each movie into a single row with all its triples joined.

        Args:
            RDF (pd.DataFrame): ["MovieID", "SubjectURI", "RelationshipURI", "ObjectURI", "Abstract"]

        Returns:
            pd.DataFrame: ["MovieID", "Triple", "Abstract"]
        """
        # To execute this method you must have iterated by MovieID,
        # because by design we want one row per movie at the end.
        # MovieID and Abstract could be given as input for a more generic method:
        # movie_id = RDF["MovieID"].iloc[0]
        # abstract = RDF["Abstract"].iloc[0]

        # first, combine each row into a "Triple" column as the join of the RDF elements
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # wrap each triple in its start/end special tokens
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # combine the rows of each movie into one
        # (MovieID and Abstract are in a 1 <-> 1 relationship)
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add the special tokens for the start of the triple list and the start of the abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

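    # Illustrative sketch of the rebuild, with placeholder token strings
    # ("<TL>", "<T>", "</T>", "<ABS>" stand for the SpecialToken values):
    #
    #   input rows:  (1, "A", "rel1", "B", "abs"), (1, "A", "rel2", "C", "abs")
    #   output row:  (1, "<TL><T>Arel1B</T><T>Arel2C</T>", "<ABS>abs")
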
    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Group pre-built triples by movie, yielding one row per MovieID.

        Args:
            RDF (pd.DataFrame): ["MovieID", "Triple", "Abstract"]

        Returns:
            pd.DataFrame: ["MovieID", "Triple", "Abstract"]
        """
        # combine the rows of each movie into one
        # (MovieID and Abstract are in a 1 <-> 1 relationship)
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add the special tokens for the start of the triple list and the start of the abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Joins each RDF triple into one element, together with the START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: RDF["Triple"] (just this column)
        """
        # combine each row into a "Triple" column as the join of the RDF elements
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # wrap each triple in its start/end special tokens
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Joins each RDF triple into one element, together with the START and END special tokens.
        The MISSING element is replaced by the special token <MASK>.

        Args:
            RDF (pd.DataFrame): 2 of the following: ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            RDF["Triple"]: pd.Series (just this column, NOT A DATAFRAME)
        """
        # Create a column of MASK tokens with the length of the dataframe. It is not
        # strictly needed, since we expect exactly one column to be missing, but it
        # is more robust (and slow).
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)

        # join the RDF elements, substituting the missing column with the MASK tokens
        RDF["Triple"] = (
            RDF.get("SubjectURI", MISSING) +
            RDF.get("RelationshipURI", MISSING) +
            RDF.get("ObjectURI", MISSING))
        # wrap each triple in its start/end special tokens
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        return RDF["Triple"]

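    # Illustrative sketch for the mask task ("<T>", "</T>", "<MASK>" stand for the
    # SpecialToken values): dropping "RelationshipURI" before the call yields, per row,
    # "<T>" + SubjectURI + "<MASK>" + ObjectURI + "</T>".
    #
    #   triples = PipelineApplier.build_incomplete_triple(RDF[["SubjectURI", "ObjectURI"]].copy())
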
    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        # currently not used
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Given two DataFrames, the first containing the incomplete RDF and the other only the
        missing component, this method applies the special tokens.

        Args:
            RDF_incomplete (pd.DataFrame): the incomplete RDF
            MISSING (pd.DataFrame): the missing component

        Returns:
            pd.DataFrame: not implemented yet
        """
        # take an example dataframe as ["SubjectURI", ""]
        # as input, two dataframes, one with 2 columns
        return None

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Normalize ObjectURI text: newlines become ", " and asterisks are removed."""
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks

        return RDF
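

# Minimal smoke-test sketch (not part of the pipeline itself); it assumes the
# SpecialToken members used above exist and runs on made-up toy data.
if __name__ == "__main__":
    applier = PipelineApplier()
    toy = pd.DataFrame({
        "MovieID": [1, 1],
        "SubjectURI": ["A", "A"],
        "RelationshipURI": ["rel1", "rel2"],
        "ObjectURI": ["B", ""],
        "Abstract": ["toy abstract", "toy abstract"],
    })
    toy = applier.drop_na_from_dataset(toy)   # drops the row with the empty ObjectURI
    toy = applier.rdf_add_special_token(toy)
    print(applier.rebuild_by_movie(toy))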