Added Toy Dataset entry point into the Pipeline class

Previously it was hard-coded in the sql_endpoint;
now the whole pipeline can be managed from the Pipeline class.
This commit is contained in:
GassiGiuseppe
2025-09-29 16:03:49 +02:00
parent bd72ad3571
commit 8167c9d435
2 changed files with 57 additions and 33 deletions

View File

@@ -18,8 +18,8 @@ class SqlEndpoint():
# self.conn = self.sql_engine.connect().execution_options(stream_results=True)
# it seems that sqlite doesn't support streaming cursors
# PRAGMA executes better when writing, not reading
self.chunk_size_row = chunk_size_row
pass
self.chunk_size_row = chunk_size_row # not used now, since each chunk is a movie
self.movie_ids = movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
def get_RDF(self) -> pd.DataFrame :
@@ -79,7 +79,7 @@ class SqlEndpoint():
Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
"""
# chunk by MovieID: the abstract is the same and some interesting logic is applicable
movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
# movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]
# CHOSEN MOVIES:
# The Dark Knight : 117248
# Inception : 147074
@@ -91,8 +91,8 @@ class SqlEndpoint():
# Django Unchained : 138952
# Spirited Away : 144137
# Knives Out : 148025
movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
movie_ids = movie_list
# movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
# movie_ids = movie_list
QUERY = """
SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
@@ -104,7 +104,7 @@ class SqlEndpoint():
WHERE MovieID = (?);
"""
for movie_id in movie_ids:
for movie_id in self.movie_ids:
yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))
def get_movies_id_count(self) -> pd.DataFrame: