dev.etl #5

Merged
gape_01 merged 9 commits from dev.etl into dev 2025-09-30 11:28:57 +02:00
14 changed files with 2176 additions and 1 deletions

24
.vscode/settings.json vendored Normal file
View File

@ -0,0 +1,24 @@
{
// Always treat the project root as the working dir for Jupyter
"jupyter.notebookFileRoot": "${workspaceFolder}",
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
"python.terminal.executeInFileDir": false,
// Start new integrated terminals at the project root
"terminal.integrated.cwd": "${workspaceFolder}",
// Ensure Python can import from the project root no matter which file you run
// (the root itself is put on sys.path, so `from src...` works). Linux shown here; add osx/windows if needed.
"terminal.integrated.env.linux": {
"PYTHONPATH": "${workspaceFolder}"
},
// Make pytest run from the root without needing a pytest.ini
"python.testing.pytestEnabled": true,
"python.testing.cwd": "${workspaceFolder}",
"python.testing.pytestArgs": ["src/test"],
// Help Pylance resolve imports like `from src...` without red squiggles
"python.analysis.extraPaths": ["${workspaceFolder}"]
}

View File

@ -0,0 +1,21 @@
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
import pandas as pd
class BPE_corpus():
    """Incrementally write the plain-text corpus file.

    The output file is opened on construction and stays open until
    ``close()`` is called; ``close()`` appends the corpus-end special token
    before releasing the file.
    """

    def __init__(self, output_path: str):
        """Open ``output_path`` for writing, truncating any existing file.

        Args:
            output_path: Destination path of the corpus text file.
        """
        # Explicit UTF-8: relying on the platform default encoding makes the
        # output (and whether non-ASCII text can be written at all)
        # machine-dependent.
        self.output_handler = open(output_path, "w", encoding="utf-8")

    def close(self):
        """Append the corpus-end token and close the output file."""
        try:
            # add corpus end before closing
            self.output_handler.write(SpecialToken.CORPUS_END.value)
        finally:
            # Release the handle even if the final write fails.
            self.output_handler.close()

    def write_from_str(self, output: str):
        """Append ``output`` to the corpus; an empty string is ignored."""
        if output == '':
            return
        self.output_handler.write(output)

    def write_from_df(self, df: pd.DataFrame):
        """Append every cell of ``df``, flattened to one string, to the corpus.

        Args:
            df: Frame whose cells are concatenated by
                ``get_raw_from_dataframe`` before being written.
        """
        self.write_from_str(get_raw_from_dataframe(df))

View File

@ -0,0 +1,26 @@
import pandas as pd
class RDF_completation_task_dataset():
    """
    Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
    Each RDF is saved as a string.
    CSV Composition: ["MovieID","RDF"]
    """

    def __init__(self, output_path: str):
        """Open ``output_path`` (truncating it) and write the CSV header row.

        Args:
            output_path: Destination path of the CSV file.
        """
        # Explicit UTF-8 keeps the file independent of the platform default
        # encoding; newline="" is what pandas asks for when to_csv() is given
        # an already-open file object (avoids doubled line endings).
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header
        header = ["MovieID", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Close the underlying CSV file."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Append the rows of ``RDF`` to the CSV (no index, no header).

        Args:
            RDF (pd.DataFrame): ["MovieID","RDF"]
        """
        RDF.to_csv(self.output, index=False, header=False)

View File

@ -0,0 +1,58 @@
import pandas as pd
# do not worry about circular dependencies, this class will never call something else
from Scripts.DataCleaning.filter import PipelineApplier
class RDF_mask_task_dataset():
    """
    Write the CSV for the third task, which is "Predicting a masked component within an RDF triple".
    For each RDF there will be 3 rows, each one with a different component missing.
    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
    """

    def __init__(self, output_path: str):
        """Open ``output_path`` (truncating it) and write the CSV header row.

        Args:
            output_path: Destination path of the CSV file.
        """
        # these methods are only used by this class, but they belong to a lower level
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple
        # Explicit UTF-8 keeps the file independent of the platform default
        # encoding; newline="" is what pandas asks for when to_csv() is given
        # an already-open file object (avoids doubled line endings).
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header
        header = ["MovieID", "IncompleteRDF", "Missing", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Close the underlying CSV file."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Append the masked-triple rows derived from ``RDF`` to the CSV.

        The rows are written in three consecutive groups: subject masked,
        then relationship masked, then object masked.

        Args:
            RDF (pd.DataFrame): at least
                ["MovieID","SubjectURI","RelationshipURI","ObjectURI"]
        """
        rdf_complete = self._build_triple(RDF)

        masked_frames = []
        for component in ("SubjectURI", "RelationshipURI", "ObjectURI"):
            # Dropping the column makes the helper fill its slot with the mask token.
            incomplete = self._build_incomplete_triple(RDF.drop(columns=[component]))
            masked_frames.append(pd.DataFrame({
                "MovieID": RDF["MovieID"],
                "IncompleteRDF": incomplete,
                "Missing": RDF[component],
                "RDF": rdf_complete,
            }))

        output_df = pd.concat(masked_frames, ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)

View File

@ -0,0 +1,26 @@
import pandas as pd
class RDF_text_task_dataset():
    """
    Write the CSV for the first two tasks, which are "Generating structured RDF triples from natural language text" and the reverse.
    In the CSV the RDFs are saved together as a single string.
    CSV Composition: ["MovieID","RDFs","Abstract"]
    """

    def __init__(self, output_path: str):
        """Open ``output_path`` (truncating it) and write the CSV header row.

        Args:
            output_path: Destination path of the CSV file.
        """
        # Explicit UTF-8 keeps the file independent of the platform default
        # encoding; newline="" is what pandas asks for when to_csv() is given
        # an already-open file object (avoids doubled line endings).
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # then the first row as header
        header = ["MovieID", "RDFs", "Abstract"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Close the underlying CSV file."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Append the rows of ``RDF`` to the CSV (no index, no header).

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)

View File

@ -0,0 +1,184 @@
# This file removes unwanted relationships from the pipeline according to different rules
import pandas as pd
import sqlite3
import numpy as np
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
class PipelineApplier():
    """Filtering and formatting steps applied to RDF DataFrames.

    The frequency filters and the relationship ban list are stateful:
    build them with the ``generate_*`` methods before calling the matching
    ``filter_by_*`` / ``delete_*`` method.
    """

    def __init__(self):
        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()
        # Start with an empty ban list so delete_relationship_by_list_filter()
        # removes nothing (instead of raising AttributeError) until
        # generate_list_relationship_filter() has been called.
        self.relationship_filter_list: set[str] = set()

    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Return the rows of ``RDF`` whose RelationshipURI differs from ``uri``."""
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store the RelationshipURI values to ban as a set."""
        self.relationship_filter_list = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove rows whose RelationshipURI is in the stored ban list.

        Generate the list first by calling generate_list_relationship_filter();
        with no list generated, nothing is removed.
        """
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency
        [filter_by_frequency_movie_id()], since this method creates that filter.
        Rows are kept when min_treshold <= Count < max_treshold.

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int): inclusive lower bound on Count
            max_treshold (int): exclusive upper bound on Count
        """
        in_range = (MOVIE_COUNT["Count"] >= min_treshold) & (MOVIE_COUNT["Count"] < max_treshold)
        self.MOVIE_FILTER = MOVIE_COUNT[in_range]  # ["MovieID"]

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """
        Create the filter used by filter_by_frequency_relationship().
        Rows are kept when min_treshold <= Count < max_treshold.

        Args:
            REL_COUNT (pd.DataFrame): ["RelationshipURI","Count"]
            min_treshold (int): inclusive lower bound on Count
            max_treshold (int): exclusive upper bound on Count
        """
        in_range = (REL_COUNT["Count"] >= min_treshold) & (REL_COUNT["Count"] < max_treshold)
        self.REL_FILTER = REL_COUNT[in_range]  # ["RelationshipURI"]

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only the rows whose MovieID is in the generated movie filter."""
        return RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only the rows whose RelationshipURI is in the generated relationship filter."""
        return RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """
        Prefix each element of the triple with its special token, i.e. SUBJECT to
        SubjectURI, OBJECT to ObjectURI, RELATIONSHIP to RelationshipURI.
        Check Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date special tokens.
        Only these three tokens are added, no other special token.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI","RelationshipURI","ObjectURI"]
        Returns:
            pd.DataFrame: a copy of RDF with the three columns prefixed
        """
        # If an earlier filter sliced the frame, work on a copy so the
        # assignments below do not trigger SettingWithCopyWarning.
        RDF = RDF.copy()
        # at the beginning of SubjectURI RelationshipURI ObjectURI, add their special token
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows with an empty or missing SubjectURI, RelationshipURI or ObjectURI."""
        # Replace empty strings with NaN so dropna() can catch them
        RDF = RDF.replace('', np.nan)
        # Drop rows where any of the key columns are NaN
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Join every row into a triple string, then collapse the rows of each movie
        into a single row. The input frame is not modified.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # By design we want one row per movie at the end, so build the triples
        # on a new frame and reuse the shared grouping step.
        with_triple = RDF.assign(Triple=PipelineApplier.build_triple(RDF))
        return self.group_by_movie_from_triple(with_triple)

    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Collapse the rows of each movie into one row whose Triple is the
        concatenation of all its triples.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # combine rows into one
        # MovieID and Abstract are unique for each other 1 <-> 1
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # add the special tokens marking the start of the triple list and of the abstract
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Join subject, relationship and object into one string per row, wrapped
        in the START and END triple tokens. The input frame is not modified.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: the joined triples, named "Triple"
        """
        joined = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        wrapped = SpecialToken.START_TRIPLE.value + joined + SpecialToken.END_TRIPLE.value
        return wrapped.rename("Triple")

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper for the third task: "Predicting a masked component within an RDF triple".
        Joins the triple into one string per row, wrapped in the START and END
        triple tokens; every component column missing from RDF is replaced by
        the MASK token. The input frame is not modified.

        Args:
            RDF (pd.DataFrame): 2 of the following ["SubjectURI", "RelationshipURI", "ObjectURI"]
        Returns:
            pd.Series: the joined triples, named "Triple" (NOT A DATAFRAME)
        """
        # A column of MASK tokens aligned with the frame, used as a stand-in
        # for whichever component column is absent.
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)
        joined = (
            RDF.get("SubjectURI", MISSING) +
            RDF.get("RelationshipURI", MISSING) +
            RDF.get("ObjectURI", MISSING))
        wrapped = SpecialToken.START_TRIPLE.value + joined + SpecialToken.END_TRIPLE.value
        return wrapped.rename("Triple")

    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        """
        Placeholder helper for the third task: "Predicting a masked component
        within an RDF triple". Currently not used and not implemented: it
        ignores its arguments and returns None.

        Args:
            RDF_incomplete (pd.DataFrame): frame with the incomplete RDF
            MISSING (pd.DataFrame): frame with only the missing component
        Returns:
            None
        """
        return None

View File

@ -101,7 +101,6 @@ def tree_like(file: str, csv_uri_header:str, out: str):
FILE = open(file, "r", encoding="utf-8") FILE = open(file, "r", encoding="utf-8")
# TODO: Change here so it takes single URI from a CSV file
# It is needed the header-name # It is needed the header-name
for row in csv.DictReader(FILE): for row in csv.DictReader(FILE):

View File

@ -0,0 +1,131 @@
import re
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
from Scripts.DataCleaning.filter import PipelineApplier
# tasks dataset builder
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
import pandas as pd
class Pipeline():
    """Read the dataset movie by movie, clean it and write the task datasets.

    All four output files are opened in ``__init__`` and closed at the end of
    every ``execute_*`` method, so a Pipeline instance supports a single
    ``execute_*`` call.
    """

    def __init__(self):
        self.sql_endpoint = SqlEndpoint()
        # classes to manage the tasks' datasets
        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # prepare the filter
        # the filter applier needs to know the frequency of Movies and Relationships over the whole dataset
        self.filter_applier = PipelineApplier()
        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
        REL_COUNT = self.sql_endpoint.get_relationship_count()
        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT, 50, 3000)
        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627)
        # prepare the filter on the RelationshipURIs you want to delete:
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate", "w3:2000/01/rdf-schema#label", "dbp-dbo:abstract",
            "dbp-dbo:wikiPageID", "dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs", "dbp-dbp:image", "dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type"]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

    def execute_task_bpe_corpus(self):
        """Write only the BPE corpus (triples and abstract of every movie)."""
        # try/finally: the output files are closed even if a movie fails.
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self.filter_applier.rebuild_by_movie(RDF)
                RDF = RDF[["Triple", "Abstract"]]
                self.task_bpe_corpus.write_from_df(RDF)
        finally:
            self._end_file_handler()

    def execute_task_rdf_mask(self):
        """Write only the masked-component task dataset."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(RDF)
        finally:
            self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write only the RDF <-> text tasks dataset (one row per movie)."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF = self.filter_applier.rebuild_by_movie(RDF)
                self.task_rdf_text.write(RDF)
        finally:
            self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write only the triple-completion task dataset."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                RDF["Triple"] = self.filter_applier.build_triple(RDF)
                self.task_rdf_completation.write(RDF[["MovieID", "Triple"]])
        finally:
            self._end_file_handler()

    def execute_all_task(self):
        """Write all four outputs in a single pass over the dataset."""
        try:
            for RDF in self._get_cleaned_movie_rows():
                self.task_rdf_mask.write(RDF)

                RDF["Triple"] = self.filter_applier.build_triple(RDF)
                self.task_rdf_completation.write(RDF[["MovieID", "Triple"]])

                RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID", "Triple", "Abstract"]])
                self.task_rdf_text.write(RDF)
                self.task_bpe_corpus.write_from_df(RDF[["Triple", "Abstract"]])
        finally:
            self._end_file_handler()

    def _end_file_handler(self):
        """Close every task output file."""
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned DataFrame per movie, skipping movies left with no rows."""
        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():

            RDF = self.filter_applier.drop_na_from_dataset(RDF)
            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)

            # other filter
            #
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
            if RDF.empty:
                continue
            # WARNING: this must run AFTER the frequency filters, because it
            # rewrites the URI columns those filters match on.
            RDF = self.filter_applier.rdf_add_special_token(RDF)
            yield RDF

    def use_toy_dataset(self):
        """Restrict the run to a fixed list of ten movie ids."""
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list
# There are a lot of settings to manage.
# You only need to change the settings:
#   - in Pipeline.__init__: file paths, frequency filter limits, banned RelationshipURIs
#   - in use_toy_dataset: the movies that make up the toy dataset
#   - in _get_cleaned_movie_rows: how the pipeline behaves
# NOTE(review): these statements are not under an `if __name__ == "__main__":`
# guard, so the pipeline also runs whenever this module is imported.
pipeline = Pipeline()
# Uncomment to restrict the run to the toy dataset:
# pipeline.use_toy_dataset()
# Uncomment exactly one of the following to produce a single task output
# instead of all of them:
# pipeline.execute_task_bpe_corpus()
# pipeline.execute_task_rdf_mask()
# pipeline.execute_tasks_rdf_text()
# pipeline.execute_task_rdf_completation()
pipeline.execute_all_task()

View File

@ -0,0 +1,21 @@
from enum import Enum
class SpecialToken(str, Enum):
    """Special marker strings inserted into the generated datasets and corpus."""
    # Base-class order matters: (Enum, str) -> throws an error

    # Prefix of the concatenated triple list of one movie
    START_TRIPLE_LIST = "<SOTL>"
    # Opening / closing markers wrapped around a single joined triple
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    # Prefixes for the three components of a triple
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    # Prefix of the movie abstract
    ABSTRACT = "<ABS>"
    # Written once at the very end of the corpus file
    CORPUS_END = "<END>"
    ## Tasks' Token
    # NOTE(review): presumably one marker per training task — confirm against the code that consumes them
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    # Placeholder for the missing component of an incomplete triple
    MASK = "<MASK>"
    #BPE Training:

View File

@ -0,0 +1,144 @@
#######################################################
# This file stand as endpoint to interact with DB #
#######################################################
# import sqlite3
import pandas as pd
from sqlalchemy import create_engine
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
class SqlEndpoint():
    """Read access to the SQLite dataset, returning pandas DataFrames."""

    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
        """Create the SQL engine and load the list of movie ids.

        Args:
            DB_PATH: Path of the SQLite database file.
            chunk_size_row: Rows per chunk for the ``get_chunked_*`` methods.
        """
        # sqlite:/// (3 slashes) -> relative path, sqlite://// (4 slashes) -> absolute path
        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
        # Only used by the get_chunked_* methods; the per-movie reader yields
        # one whole movie at a time instead.
        self.chunk_size_row = chunk_size_row
        # Movies iterated by get_abbreviated_dataset_by_movie_id(); may be
        # replaced with any iterable of ids to restrict the run.
        self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]

    def get_RDF(self) -> pd.DataFrame :
        """
        Gets every RDF of the dataset with its (non-parsed) URIs
        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI
        """
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
        FROM RDFs
        INNER JOIN Subjects USING (SubjectID)
        INNER JOIN Relationships USING (RelationshipID)
        INNER JOIN Objects USING (ObjectID);
        """
        # self.CONN (the old sqlite3 connection) no longer exists: use the engine.
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame :
        """
        Gets the abbreviated dataset in chunks of ``chunk_size_row`` rows.
        Because ``chunksize`` is passed to pandas, the result is an iterator
        of DataFrames rather than a single DataFrame.
        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        """
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_chunked_abbreviated_dataset_with_start_token(self)-> pd.DataFrame:
        """
        DEPRECATED! Same as get_chunked_abbreviated_dataset(), but the three
        URI columns are prefixed with their special token by the SQL query.
        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        """
        QUERY = """
        SELECT
            MovieID,
            ? || SubjectURI AS SubjectURI,
            ? || RelationshipURI AS RelationshipURI,
            ? || ObjectURI AS ObjectURI,
            Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        # One value per "?" placeholder, in the order they appear in the query.
        start_tokens = (
            SpecialToken.SUBJECT.value,
            SpecialToken.RELATIONSHIP.value,
            SpecialToken.OBJECT.value,
        )
        return pd.read_sql_query(
            QUERY, self.sql_engine, params=start_tokens, chunksize=self.chunk_size_row)

    def get_abbreviated_dataset_by_movie_id(self):# -> iter[pd.DataFrame]:
        """
        Gets one DataFrame per movie (with all its rows in the dataset),
        iterating over ``self.movie_ids``.
        The retrieved RDFs are already abbreviated by the sql parser
        Yields:
            Pandas.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
        """
        # chunk by MovieID: the abstract is the same for every row of a chunk
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?);
        """
        for movie_id in self.movie_ids:
            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))

    def get_movies_id_count(self) -> pd.DataFrame:
        """
        Gets the count of each Movie in the Dataset
        Returns:
            Pandas.DataFrame: [MovieID, Count]
        """
        QUERY = """
        SELECT MovieID, COUNT(*) AS Count
        FROM RDFs
        GROUP BY MovieID;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_relationship_count(self) -> pd.DataFrame:
        """
        Gets the count of each Relationship in the Dataset
        Returns:
            Pandas.DataFrame: [RelationshipURI, Count]
        """
        QUERY = """
        SELECT RelationshipURI, COUNT(*) AS Count
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        GROUP BY RelationshipURI;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)
if __name__ == "__main__":
    # Manual smoke test: dump the per-movie frames of the default database.
    endpoint = SqlEndpoint()
    movie_frames = endpoint.get_abbreviated_dataset_by_movie_id()
    for movie_frame in movie_frames:
        print(movie_frame)
    # endpoint.get_RDF()
    print("done")

View File

@ -0,0 +1,9 @@
import pandas as pd
def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
    """Flatten ``DF`` into one string, row by row and left to right.

    Every cell is converted with ``str`` and the pieces are concatenated with
    no separator between cells or rows.

    Args:
        DF: The frame to flatten.
    Returns:
        The concatenated cell text; an empty string for a frame without rows.
    """
    # A single join over all rows avoids rebuilding the growing string with
    # += once per row.
    return "".join(
        "".join(map(str, row))
        for row in DF.itertuples(index=False, name=None)
    )

View File

@ -0,0 +1,897 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "3zbCui3XtIGozHXTVAGRp",
"type": "rectangle",
"x": 316.5,
"y": 123,
"width": 436.5,
"height": 145.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a0",
"roundness": {
"type": 3
},
"seed": 1698427950,
"version": 35,
"versionNonce": 601575602,
"isDeleted": false,
"boundElements": [
{
"id": "wD66RDbG05HfvRhAtMb0J",
"type": "text"
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow"
}
],
"updated": 1758818588814,
"link": null,
"locked": false
},
{
"id": "wD66RDbG05HfvRhAtMb0J",
"type": "text",
"x": 480.98004150390625,
"y": 183.25,
"width": 107.5399169921875,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a1",
"roundness": null,
"seed": 910769774,
"version": 31,
"versionNonce": 1120989938,
"isDeleted": false,
"boundElements": null,
"updated": 1758818416720,
"link": null,
"locked": false,
"text": "dataset.db",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "3zbCui3XtIGozHXTVAGRp",
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "87-MeaiZGT1wln0nggYPZ",
"type": "rectangle",
"x": 339.5,
"y": 309.5,
"width": 392,
"height": 156,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a2",
"roundness": {
"type": 3
},
"seed": 655550318,
"version": 77,
"versionNonce": 1103939826,
"isDeleted": false,
"boundElements": null,
"updated": 1758818339000,
"link": null,
"locked": false
},
{
"id": "EjUxEhZqEBzwvlw0VE9eJ",
"type": "rectangle",
"x": 355.5,
"y": 327,
"width": 162,
"height": 125.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3",
"roundness": {
"type": 3
},
"seed": 1739846638,
"version": 64,
"versionNonce": 1594290034,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "ogRkV0neHrhEKTE6zlggl"
}
],
"updated": 1758818391415,
"link": null,
"locked": false
},
{
"id": "ogRkV0neHrhEKTE6zlggl",
"type": "text",
"x": 378.7100524902344,
"y": 377.25,
"width": 115.57989501953125,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3V",
"roundness": null,
"seed": 2037675630,
"version": 12,
"versionNonce": 1286472046,
"isDeleted": false,
"boundElements": null,
"updated": 1758818399222,
"link": null,
"locked": false,
"text": "RDF_String",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "EjUxEhZqEBzwvlw0VE9eJ",
"originalText": "RDF_String",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "hoIRMNiMJZl4YDo-hovWy",
"type": "rectangle",
"x": 542.5,
"y": 327,
"width": 173,
"height": 125.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a4",
"roundness": {
"type": 3
},
"seed": 1189796530,
"version": 99,
"versionNonce": 1071057006,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "rsapATFAT5YSBCXzLupgZ"
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow"
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow"
}
],
"updated": 1758818593647,
"link": null,
"locked": false
},
{
"id": "rsapATFAT5YSBCXzLupgZ",
"type": "text",
"x": 585.6800384521484,
"y": 377.25,
"width": 86.63992309570312,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a5",
"roundness": null,
"seed": 829619694,
"version": 12,
"versionNonce": 713902318,
"isDeleted": false,
"boundElements": null,
"updated": 1758818405150,
"link": null,
"locked": false,
"text": "Abstract",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "hoIRMNiMJZl4YDo-hovWy",
"originalText": "Abstract",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "jSx8ApfhtRs_nk37VvDMb",
"type": "rectangle",
"x": 316.5,
"y": 511,
"width": 436.5,
"height": 145.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a6",
"roundness": {
"type": 3
},
"seed": 492582894,
"version": 132,
"versionNonce": 893797614,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "6E23g-rgowNqHsBxX-LuM"
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow"
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow"
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow"
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow"
}
],
"updated": 1758818593647,
"link": null,
"locked": false
},
{
"id": "6E23g-rgowNqHsBxX-LuM",
"type": "text",
"x": 499.9100341796875,
"y": 571.25,
"width": 69.679931640625,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a7",
"roundness": null,
"seed": 267696178,
"version": 132,
"versionNonce": 1668243186,
"isDeleted": false,
"boundElements": null,
"updated": 1758818543211,
"link": null,
"locked": false,
"text": "Pandas",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "jSx8ApfhtRs_nk37VvDMb",
"originalText": "Pandas",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "ohj18N4AOTDz5lJNcV9gi",
"type": "rectangle",
"x": 261,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a8",
"roundness": {
"type": 3
},
"seed": 1446207150,
"version": 279,
"versionNonce": 317375026,
"isDeleted": false,
"boundElements": [
{
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
"type": "text"
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
"type": "text",
"x": 297.0800323486328,
"y": 796.5,
"width": 84.83993530273438,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a9",
"roundness": null,
"seed": 435116270,
"version": 199,
"versionNonce": 1282911218,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "train.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "ohj18N4AOTDz5lJNcV9gi",
"originalText": "train.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "A4Y54Y26fe257U_QU9lxX",
"type": "rectangle",
"x": 464,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aA",
"roundness": {
"type": 3
},
"seed": 186148850,
"version": 232,
"versionNonce": 997119858,
"isDeleted": false,
"boundElements": [
{
"id": "v4TvUlDEjH7EvPDmtbOn2",
"type": "text"
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "v4TvUlDEjH7EvPDmtbOn2",
"type": "text",
"x": 476.3500442504883,
"y": 796.5,
"width": 132.29991149902344,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": null,
"seed": 1131059634,
"version": 171,
"versionNonce": 239540530,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "validation.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "A4Y54Y26fe257U_QU9lxX",
"originalText": "validation.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "mPaYpJ9Xn7tlJPmKPqJKJ",
"type": "rectangle",
"x": 674.5,
"y": 765.5,
"width": 157,
"height": 87,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aC",
"roundness": {
"type": 3
},
"seed": 1049323314,
"version": 235,
"versionNonce": 330560690,
"isDeleted": false,
"boundElements": [
{
"type": "text",
"id": "kg9nm2rpud6cax5aNPSnu"
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow"
}
],
"updated": 1758818570993,
"link": null,
"locked": false
},
{
"id": "kg9nm2rpud6cax5aNPSnu",
"type": "text",
"x": 711.4300231933594,
"y": 796.5,
"width": 83.13995361328125,
"height": 25,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aD",
"roundness": null,
"seed": 522572142,
"version": 193,
"versionNonce": 1920372338,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"text": "test.txt",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "center",
"verticalAlign": "middle",
"containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
"originalText": "test.txt",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "hyFKqXwet_F79QM71atgI",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 195.25,
"height": 99,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aG",
"roundness": null,
"seed": 873266098,
"version": 71,
"versionNonce": 541154738,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
49.5
],
[
-195.25,
49.5
],
[
-195.25,
99
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "ohj18N4AOTDz5lJNcV9gi",
"fixedPoint": [
0.4993630573248406,
-0.05747126436781609
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "x_DP1FcQ7jraGz0gBuDi3",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 218.25,
"height": 99,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 1210817582,
"version": 77,
"versionNonce": 1483392370,
"isDeleted": false,
"boundElements": null,
"updated": 1758818580594,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
49.5
],
[
218.25,
49.5
],
[
218.25,
99
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
"fixedPoint": [
0.4993630573248406,
-0.05747126436781609
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "1IGbCps2EHnzKgJUWM5nq",
"type": "arrow",
"x": 534.65,
"y": 661.5,
"width": 0.5719232650604908,
"height": 99.07394122590165,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aK",
"roundness": null,
"seed": 1205316658,
"version": 96,
"versionNonce": 1748050674,
"isDeleted": false,
"boundElements": null,
"updated": 1758818570993,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
-0.5719232650604908,
99.07394122590165
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"fixedPoint": [
0.49977090492554405,
1.034364261168385
],
"focus": 0,
"gap": 0
},
"endBinding": {
"elementId": "A4Y54Y26fe257U_QU9lxX",
"fixedPoint": [
0.44635717665566554,
-0.056621365219521276
],
"focus": 0,
"gap": 0
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": true,
"fixedSegments": null,
"startIsSpecial": null,
"endIsSpecial": null
},
{
"id": "gus_rxauKJ6T2L_F59PfN",
"type": "arrow",
"x": 539,
"y": 271.5,
"width": 0,
"height": 33.5,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 763990258,
"version": 17,
"versionNonce": 1028811378,
"isDeleted": false,
"boundElements": null,
"updated": 1758818588814,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
33.5
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "3zbCui3XtIGozHXTVAGRp",
"focus": -0.019473081328751418,
"gap": 3
},
"endBinding": {
"elementId": "hoIRMNiMJZl4YDo-hovWy",
"focus": -1.0404624277456647,
"gap": 30.7545797799829
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
},
{
"id": "Wk1bJbbtC31FqObEL5xWt",
"type": "arrow",
"x": 536.5,
"y": 468.5,
"width": 0,
"height": 39,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1489771054,
"version": 33,
"versionNonce": 1828178606,
"isDeleted": false,
"boundElements": null,
"updated": 1758818593647,
"link": null,
"locked": false,
"points": [
[
0,
0
],
[
0,
39
]
],
"lastCommittedPoint": null,
"startBinding": {
"elementId": "hoIRMNiMJZl4YDo-hovWy",
"focus": 1.0693641618497107,
"gap": 27.157190169432425
},
"endBinding": {
"elementId": "jSx8ApfhtRs_nk37VvDMb",
"focus": 0.008018327605956525,
"gap": 3.5
},
"startArrowhead": null,
"endArrowhead": "triangle",
"elbowed": false
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}

View File

@ -0,0 +1,634 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"id": "JNB9z-PeqZ4s8KDfWaoXe",
"type": "rectangle",
"x": 106,
"y": 27,
"width": 653,
"height": 263,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a2",
"roundness": {
"type": 3
},
"seed": 710740889,
"version": 326,
"versionNonce": 1107631703,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false
},
{
"id": "e13wNTgUpn2flMpmMttqx",
"type": "text",
"x": 200.5943407656526,
"y": 44.07937975075269,
"width": 307.2781467269385,
"height": 23.3097531902191,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a3",
"roundness": null,
"seed": 1012740663,
"version": 444,
"versionNonce": 589551257,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false,
"text": "Libs/CleaningPipeline/sql_endpoint",
"fontSize": 18.64780255217528,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Libs/CleaningPipeline/sql_endpoint",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "CgxCElJkKBtIHv-5WQrbo",
"type": "text",
"x": 195,
"y": 80.44259472749451,
"width": 403.64997665852184,
"height": 186.4780255217528,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a4",
"roundness": null,
"seed": 1261951799,
"version": 507,
"versionNonce": 1922906999,
"isDeleted": false,
"boundElements": null,
"updated": 1759156408059,
"link": null,
"locked": false,
"text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
"fontSize": 18.64780255217528,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
"autoResize": true,
"lineHeight": 1.25
},
{
"type": "line",
"version": 4979,
"versionNonce": 1473849177,
"isDeleted": false,
"id": "sYReMTdYblr-oJtYYJALU",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -67.14432426259049,
"y": 87.19293561900287,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.09201683999922,
"height": 99.49948667804088,
"seed": 1263944119,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
0.2542098813493443,
75.20117273657175
],
[
0.011896425679918422,
83.76249969444815
],
[
3.970409367559332,
87.46174320643391
],
[
17.75573317066317,
90.59250103325854
],
[
41.05683533152865,
91.56737225214069
],
[
63.319497586673116,
90.01084754868091
],
[
75.14781395923075,
86.28844687220405
],
[
76.81603792670788,
83.15042405259751
],
[
77.05033394391478,
76.25776215104557
],
[
76.86643881413028,
6.3089586511537865
],
[
76.45188016352971,
-0.2999144698665015
],
[
71.50179495549581,
-3.9936571317850627
],
[
61.077971898861186,
-6.132877429442784
],
[
37.32348754161154,
-7.932114425900202
],
[
18.278415656797975,
-6.859225353587373
],
[
3.2995959613238286,
-3.2201165291205287
],
[
-0.04168289608444441,
-0.045185660461322996
],
[
0,
0
]
],
"index": "a6",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2684,
"versionNonce": 952947769,
"isDeleted": false,
"id": "0S6dEWQVqKUVkP6Z5IX1l",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -66.6203948243155,
"y": 144.31921927673278,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 817033943,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a7",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2770,
"versionNonce": 477619481,
"isDeleted": false,
"id": "szGLND7J0nVOvRkNXX9AS",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -67.65225214681931,
"y": 115.35516394150972,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 1704755191,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a8",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 5767,
"versionNonce": 2119031289,
"isDeleted": false,
"id": "O3t2uGktJlDd1_OX_bpV4",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -68.71020112890136,
"y": 80.06066699332126,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 76.59753601865496,
"height": 15.49127539284798,
"seed": 471296279,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [
"bxuMGTzXLn7H-uBCptINx"
],
"index": "a9",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1177,
"versionNonce": 525480665,
"isDeleted": false,
"id": "_SzKlOBOvJgBg7FX0JTTM",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -32.218214023678854,
"y": 104.53733467322485,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1368927799,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aA",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1465,
"versionNonce": 1410887609,
"isDeleted": false,
"id": "oJMl2Kxa3SPaiAY0kxo7A",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -31.867072239745255,
"y": 130.75394896028996,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1627606871,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aB",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1348,
"versionNonce": 314839193,
"isDeleted": false,
"id": "fB6pJBSMA-pRHrpgYKaLL",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 6.239590202363168,
"x": -31.218214023678854,
"y": 159.52267553159635,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1420643447,
"groupIds": [
"9YkNe1yqnfZy9Z1JX2xr4",
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "aC",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false
},
{
"type": "text",
"version": 846,
"versionNonce": 1091081593,
"isDeleted": false,
"id": "9gZ3Yy1MeP9kEOTLODqLG",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": -76.81018163712321,
"y": 181.11281713043917,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 95.63072204589844,
"height": 23.595161071904883,
"seed": 2019206551,
"groupIds": [
"BDBCTrrhjbJynRAyuf3xJ"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"fontSize": 17.4778970902999,
"fontFamily": 1,
"text": "dataset.db",
"baseline": 16.595161071904883,
"textAlign": "center",
"verticalAlign": "top",
"index": "aD",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1759158252997,
"link": null,
"locked": false,
"containerId": null,
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.350000000000001
},
{
"id": "3eOw20xMhpB5jf_RMG24P",
"type": "text",
"x": 1131.3333333333335,
"y": 31.333333333333428,
"width": 508.3333333333333,
"height": 550,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aE",
"roundness": null,
"seed": 1535658041,
"version": 821,
"versionNonce": 1630266809,
"isDeleted": false,
"boundElements": null,
"updated": 1759157181677,
"link": null,
"locked": false,
"text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
"autoResize": false,
"lineHeight": 1.25
},
{
"id": "Fbl1gpb5r7QrdRauGUWm2",
"type": "text",
"x": 158.23809523809535,
"y": 502.52380952380935,
"width": 484.2857142857143,
"height": 500,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "transparent",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aF",
"roundness": null,
"seed": 2066618807,
"version": 552,
"versionNonce": 1269344823,
"isDeleted": false,
"boundElements": null,
"updated": 1759158199532,
"link": null,
"locked": false,
"text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
"fontSize": 20,
"fontFamily": 5,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
"autoResize": false,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}

View File

@ -15,3 +15,4 @@ tzdata==2025.2
urllib3==2.5.0 urllib3==2.5.0
wheel==0.45.1 wheel==0.45.1
Wikipedia-API==0.8.1 Wikipedia-API==0.8.1
SQLAlchemy