NanoSocrates/Scripts/DataCleaning/pipeline/cleaner.py

# Pipeline helpers for the cleaning stage: prefix RDF elements with their special tokens,
# drop incomplete triples, clean object literals and rebuild one row per movie.
import pandas as pd
import sqlite3
import numpy as np

from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint


class PipelineApplier:
    """Stateless DataFrame transformations applied to RDF triples in the cleaning pipeline.

    Every method receives a DataFrame of triples and returns a new object;
    the frame passed in by the caller is never modified.
    """

    def __init__(self):
        pass

    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Prefixes each element of the triple with its special token, i.e. SUBJECT to SubjectURI,
        OBJECT to ObjectURI, RELATIONSHIP to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
        Only these three tokens are added, no other special token.
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.DataFrame: a copy of RDF (same columns) with the three columns prefixed
        """
        # An upstream filter may have handed us a slice (view) of a bigger frame;
        # working on a copy avoids SettingWithCopyWarning and leaves the caller's data intact.
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Removes the rows whose triple is incomplete.
        Empty strings are first turned into NaN (in every column, not only the key ones),
        then rows missing any of SubjectURI / RelationshipURI / ObjectURI are dropped.
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.DataFrame: new frame containing only the complete triples
        """
        RDF = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Collapses the triples into one row per movie.
        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # By design we want one row per movie at the end.
        # Build the per-row triples on a narrow new frame, so the caller's RDF
        # does not receive a stray "Triple" column as a side effect.
        per_row = RDF[["MovieID", "Abstract"]].assign(Triple=self.build_triple(RDF))
        # MovieID and Abstract are 1 <-> 1, so grouping on both keeps Abstract in the result.
        # NOTE: groupby's default dropna=True excludes rows whose MovieID or Abstract is NaN.
        per_movie = (
            per_row.groupby(["MovieID", "Abstract"])["Triple"]
            .apply("".join)
            .reset_index()
        )
        # Wrap the concatenated triples and the abstract with their list-level tokens.
        per_movie["Triple"] = (
            SpecialToken.START_TRIPLE_LIST.value
            + per_movie["Triple"]
            + SpecialToken.END_OF_SENTENCE.value
        )
        per_movie["Abstract"] = (
            SpecialToken.ABSTRACT.value
            + per_movie["Abstract"]
            + SpecialToken.END_OF_SENTENCE.value
        )
        return per_movie[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame) -> pd.Series:
        """
        Joins subject, relationship and object of each row into a single string,
        wrapped by the START_TRIPLE and END_TRIPLE special tokens.
        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: one joined triple per row, named "Triple", aligned to RDF's index
        """
        joined = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        wrapped = SpecialToken.START_TRIPLE.value + joined + SpecialToken.END_TRIPLE.value
        # Computed as a standalone Series: RDF itself is left untouched.
        return wrapped.rename("Triple")

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Cleans the ObjectURI column: each run of newlines becomes ", " and every asterisk is removed.
        Args:
            RDF (pd.DataFrame): at least ["ObjectURI"]
        Returns:
            pd.DataFrame: new frame with the same columns, ObjectURI cleaned (pandas "string" dtype)
        """
        cleaned = (
            RDF["ObjectURI"].astype("string")
            .str.replace(r"\r?\n+", ", ", regex=True)   # newlines -> ", "
            .str.replace(r"\*", "", regex=True)         # delete all asterisks
        )
        # assign() returns a new frame, so a sliced/view input is never written to.
        return RDF.assign(ObjectURI=cleaned)
new faster pipeline 2025-10-05 14:57:45 +02:00			`# This file deletes in the pipeline the unwanted relationship by different rules`
			`import pandas as pd`
			`import sqlite3`
			`import numpy as np`

			`from Scripts.Libs.CleaningPipeline.special_token import SpecialToken`
			`from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint`


			`class PipelineApplier():`

			`def __init__(self):`
			`pass`

			`def rdf_add_special_token(self, RDF: pd.DataFrame):`
			`"""`
			`Adds RDF special token to each element of the tuple. i.e: SUBJ to SubjectURI, OBJ to ObjectURI, REL to RelationshipURI.`
			`Check Scrits/Libs/CleaningPipeline/special_token.py for the up-to-date special token.`
			`It only adds the special token of the three element of the RDF, no other special token.`
			`Args:`
			`RDF (pd.DataFrame):`
			`Returns:`
			`pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]`
			`"""`
			`# if the filter runned before sliced the RDF and created a View, here the problem is resolved`
			`# for more context: SettingWithCopyWarning`
			`RDF = RDF.copy()`
			`# at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token`
			`RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]`
			`RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]`
			`RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]`
			`return RDF`


			`def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`RDF = RDF.replace('', np.nan)`
			`# Drop rows where any of the key columns are NaN`
			`RDF = RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])`
			`return RDF`

			`def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`"""`
			`Args:`
			`RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]`

			`Returns:`
			`pd.DataFrame: ["MovieID","Triple","Abstract"]`
			`"""`
			`# to execute this method you have to have itereted by movie_id`
			`# because as design we want at the end one row for each movie`
			`# MovieID and abstract can be given as input for a more generic method`
			`# first let's combine each row creating column triple as join of rdf`
			`RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]`
			`# special token`
			`RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value`
			`# combine rows into one`
			`# MovieID and Abstract are unique for each other 1 <-> 1`
			`RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()`
			`# add special token for: start of triple, end of triple and start of abstract`
Added EOS token 2025-10-07 22:47:59 +02:00			`RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]+SpecialToken.END_OF_SENTENCE.value`
			`RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value`
new faster pipeline 2025-10-05 14:57:45 +02:00			`return RDF[["MovieID","Triple","Abstract"]]`


			`@staticmethod`
			`def build_triple(RDF: pd.DataFrame):`
			`"""`
			`Obtains joined RDF triple in one element, togheter with START and END special token`
			`Args:`
			`RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]`
			`Returns:`
			`pd.DataFrame: RDF["Triple"] (just this column)`
			`"""`
			`# let's combine each row creating column triple as join of rdf`
			`RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]`
			`# special token`
			`RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value`
			`return RDF["Triple"]`


			`def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:`
			`RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")`
			`.str.replace(r"\r?\n+", ", ", regex=True) # newlines -> ", "`
			`.str.replace(r"\*", "", regex=True)) # delete all asterisks`

			`return RDF`