new faster pipeline
This commit is contained in:
86
Scripts/DataCleaning/pipeline/cleaner.py
Normal file
86
Scripts/DataCleaning/pipeline/cleaner.py
Normal file
@@ -0,0 +1,86 @@
|
||||
# This file removes unwanted relationships in the pipeline according to different rules
|
||||
import pandas as pd
|
||||
import sqlite3
|
||||
import numpy as np
|
||||
|
||||
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||
|
||||
|
||||
class PipelineApplier:
    """Collection of DataFrame transformations used by the cleaning pipeline.

    Every method takes an RDF DataFrame and returns a new object; none of
    them writes into the DataFrame it receives, so they can safely be
    applied to slices/views produced by earlier filters.
    """

    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Adds the RDF special token to each element of the tuple, i.e. SUBJ to SubjectURI,
        OBJ to ObjectURI, REL to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special token.
        It only adds the special token of the three elements of the RDF, no other special token.

        Args:
            RDF (pd.DataFrame): must contain "SubjectURI", "RelationshipURI" and "ObjectURI".

        Returns:
            pd.DataFrame: a copy of RDF with the three columns prefixed; all other
            columns are carried over unchanged.
        """
        # assign() works on a copy: if an earlier filter sliced the RDF and
        # produced a view, the caller's frame is left alone
        # (for more context: SettingWithCopyWarning).
        return RDF.assign(
            SubjectURI=SpecialToken.SUBJECT.value + RDF["SubjectURI"],
            ObjectURI=SpecialToken.OBJECT.value + RDF["ObjectURI"],
            RelationshipURI=SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"],
        )

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Drops the rows whose subject, relationship or object is missing or empty.

        Args:
            RDF (pd.DataFrame): must contain "SubjectURI", "RelationshipURI" and "ObjectURI".

        Returns:
            pd.DataFrame: new DataFrame without the incomplete rows. Empty strings
            are turned into NaN in every column, not only in the three key ones.
        """
        # Empty strings count as missing values.
        without_blanks = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        return without_blanks.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Collapses the RDF rows into one row per movie, joining all its triples.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # To execute this method you have to have iterated by movie_id,
        # because by design we want one row for each movie at the end.
        # MovieID and Abstract could be given as input for a more generic method.

        # First build the per-row triple (with its start/end tokens) on a new
        # frame, so the caller's DataFrame does not gain a "Triple" column.
        with_triple = RDF.assign(Triple=PipelineApplier.build_triple(RDF))

        # Combine rows into one.
        # MovieID and Abstract are unique for each other 1 <-> 1
        # NOTE(review): groupby drops NaN keys by default, so rows whose
        # MovieID or Abstract is NaN do not appear in the result - confirm
        # this is intended.
        merged = (
            with_triple.groupby(["MovieID", "Abstract"])["Triple"]
            .apply("".join)
            .reset_index()
        )

        # Add the special tokens for the start of the triple list and the
        # start of the abstract.
        merged["Triple"] = SpecialToken.START_TRIPLE_LIST.value + merged["Triple"]
        merged["Abstract"] = SpecialToken.ABSTRACT.value + merged["Abstract"]
        return merged[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame) -> pd.Series:
        """
        Obtains the joined RDF triple in one element, together with the START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (just this column), aligned with RDF's index.
            RDF itself is not modified.
        """
        # Combine each row, creating the triple as the join of the rdf elements.
        joined = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        # Wrap it with the special tokens.
        wrapped = SpecialToken.START_TRIPLE.value + joined + SpecialToken.END_TRIPLE.value
        return wrapped.rename("Triple")

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the ObjectURI column: line breaks become ", " and asterisks are removed.

        Args:
            RDF (pd.DataFrame): must contain "ObjectURI".

        Returns:
            pd.DataFrame: new DataFrame whose "ObjectURI" column is of "string" dtype
            and cleaned; the input DataFrame is not modified.
        """
        cleaned = (
            RDF["ObjectURI"].astype("string")
            # A whole run of line breaks (LF or CRLF) -> a single ", ".
            # The group is repeated as a unit so "\r\n\r\n" yields one
            # separator, exactly like "\n\n" does.
            .str.replace(r"(?:\r?\n)+", ", ", regex=True)
            # Delete all asterisks (literal match, no regex needed).
            .str.replace("*", "", regex=False)
        )
        return RDF.assign(ObjectURI=cleaned)
|
||||
Reference in New Issue
Block a user