Added Toy Dataset entry point into the Pipeline class

Previously the toy dataset was forced into sql_endpoint; now the whole pipeline can be managed from the Pipeline class.
2025-09-29 16:03:49 +02:00
parent bd72ad3571
commit 8167c9d435
2 changed files with 57 additions and 33 deletions
--- a/Scripts/Libs/CleaningPipeline/sql_endpoint.py
+++ b/Scripts/Libs/CleaningPipeline/sql_endpoint.py
@@ -18,8 +18,8 @@ class SqlEndpoint():
        # self.conn = self.sql_engine.connect().execution_options(stream_results=True)
        # it seems that sqlite doenst support streamer cursor
        # PRAGMA exeutes better in writing not reading
-        self.chunk_size_row = chunk_size_row
-        pass
+        self.chunk_size_row = chunk_size_row                    # not used now, since each chunk is a movie
+        self.movie_ids = movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]

    def get_RDF(self) -> pd.DataFrame :
        
@@ -79,7 +79,7 @@ class SqlEndpoint():
            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
        """        
        # chunk by movieId, abstract is the same and some intersting logic are appliable
-        movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
+        # movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
        # CHOOSEN MOVIE:
        # The Dark Knight   : 117248
        # Inception         : 147074
@@ -91,8 +91,8 @@ class SqlEndpoint():
        # Django Unchained  : 138952
        # Spirited Away     : 144137
        # Knives Out        : 148025
-        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
-        movie_ids = movie_list
+        # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
+        # movie_ids = movie_list

        QUERY = """
                SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
@@ -104,7 +104,7 @@ class SqlEndpoint():
                WHERE MovieID = (?);
                """        

-        for movie_id in movie_ids:
+        for movie_id in self.movie_ids:
            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))

    def get_movies_id_count(self) -> pd.DataFrame: