30 Commits

Author SHA1 Message Date
GassiGiuseppe
9440a562f2 Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl 2025-09-25 18:33:51 +02:00
Christian Risi
5eda131aac Fixed creation query to be unique even with movieID in RDFs 2025-09-25 17:58:09 +02:00
GassiGiuseppe
57884eaf2e CSV support added to path_splitter_tree
Also fixed a minor bug so that leaf nodes are also printed
2025-09-25 17:57:46 +02:00
Christian Risi
4548a683c2 Fixed DB 2025-09-25 17:57:45 +02:00
GassiGiuseppe
3eec49ffa5 WIP: added test file: clean_relationship.jupyter
to create a first cleaning pipeline
2025-09-25 16:28:24 +02:00
Christian Risi
0bc7f4b227 Fixed Typos 2025-09-25 12:37:52 +02:00
Christian Risi
f28952b0a2 Added todo 2025-09-25 12:00:26 +02:00
Christian Risi
0b626a8e09 Modified query to take all data 2025-09-25 11:53:12 +02:00
Christian Risi
b254098532 Added views to count for subjects and objects 2025-09-25 11:40:44 +02:00
Christian Risi
ee88ffe4cf Added View to filter over relationship counts 2025-09-25 11:32:03 +02:00
Christian Risi
70b4bd8645 Added Complex query 2025-09-25 11:31:34 +02:00
Christian Risi
6316d2bfc4 Added queries to take data from SQL for dataset 2025-09-25 11:27:19 +02:00
Christian Risi
87ca748f45 Updated DB to reflect new changes 2025-09-24 19:29:57 +02:00
Christian Risi
4315d70109 Merged abbreviation_datawarehouse into datawarehouse 2025-09-24 19:29:43 +02:00
Christian Risi
9a5d633b5e Fixed Typos 2025-09-24 19:29:07 +02:00
Christian Risi
a6760cd52d Updated SQL Queries to support parsing in DB 2025-09-24 19:28:55 +02:00
GassiGiuseppe
a7eb92227d Moved all db queries file in their own folder 2025-09-24 16:44:55 +02:00
GassiGiuseppe
9f221e31cd Merge branch 'dev.etl' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev.etl 2025-09-24 16:32:52 +02:00
GassiGiuseppe
47197194d5 WIP abbreviation_datawarehouse to create an abbreviation system 2025-09-24 16:32:09 +02:00
Christian Risi
0cdbf6f624 Added query to retrieve a dirty dataset from SQLite DB 2025-09-24 16:15:47 +02:00
Christian Risi
3e30489f86 Updated Queries for DB 2025-09-24 14:44:53 +02:00
Christian Risi
8a22e453e4 Fixed csv 2025-09-24 14:44:25 +02:00
Christian Risi
7feb4eb857 Fixed URI generation 2025-09-24 14:44:07 +02:00
Christian Risi
70af19d356 Removed unused imports and added trailing slashes 2025-09-24 14:04:48 +02:00
Christian Risi
a4b44ab2ee Fixed Typos 2025-09-24 14:04:27 +02:00
Christian Risi
74b6b609dd Fixed typos 2025-09-24 13:59:19 +02:00
Christian Risi
59796c37cb Added script to take dbpedia uris 2025-09-24 13:49:29 +02:00
Christian Risi
f696f5950b Added uri-abbreviations 2025-09-24 13:48:53 +02:00
Christian Risi
605b496da7 Added barebone UML diagram for a Cleaning Pipeline 2025-09-23 19:49:01 +02:00
Christian Risi
7d693964dd Added new directories to tree structure 2025-09-23 19:47:56 +02:00
13 changed files with 1684 additions and 152 deletions

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:c1fcb1ad61a69145145c45c639ab42b36ffc63caa0ef9832eb81491197883ff4
size 8086

Binary file not shown.

View File

@@ -0,0 +1,30 @@
-- To pass to Pandas
SELECT *
FROM RDFs
INNER JOIN Subjects USING (SubjectID)
INNER JOIN Relationships USING (RelationshipID)
INNER JOIN Objects USING (ObjectID);
-- To pass to Pandas for abstracts
SELECT *
FROM RDFs
INNER JOIN WikipediaAbstracts USING (MovieID);
-- To pass to Pandas for abbreviations
SELECT *
FROM Abbreviations;
-- More complex to have clean dataset
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
FROM RDFs
INNER JOIN SubjectsCountInRDFs USING (SubjectID)
INNER JOIN RelationshipsCountInRDFs USING(RelationshipID)
INNER JOIN ObjectsCountInRDFs USING (ObjectID)
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
-- insert WHERE here
-- WHERE SubjectID = 134626
GROUP BY MovieID;
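A minimal sketch (not part of this changeset) of how the "To pass to Pandas" queries above might be consumed from Python, assuming the sqlite3/pandas setup used in the clean_relationship notebook and the dataset.db path used by db_feeding.py:

import sqlite3
import pandas as pd

# Assumed DB path; db_feeding.py uses ./Assets/Dataset/DatawareHouse/dataset.db
CONN = sqlite3.connect("./Assets/Dataset/DatawareHouse/dataset.db")

# Joined RDF rows, as in the first query above
RDF_QUERY = (
    "SELECT * FROM RDFs "
    "INNER JOIN Subjects USING (SubjectID) "
    "INNER JOIN Relationships USING (RelationshipID) "
    "INNER JOIN Objects USING (ObjectID);"
)
rdf_df = pd.read_sql_query(RDF_QUERY, CONN)

# Abstracts keyed by MovieID, as in the second query above
abstracts_df = pd.read_sql_query(
    "SELECT * FROM RDFs INNER JOIN WikipediaAbstracts USING (MovieID);", CONN
)

CONN.close()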

View File

@@ -0,0 +1,174 @@
CREATE TABLE IF NOT EXISTS Movies (
MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
MovieURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS WikiPageIDs (
MovieID INTEGER PRIMARY KEY,
PageID INTEGER UNIQUE NOT NULL,
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
MovieID INTEGER PRIMARY KEY,
Abstract TEXT NOT NULL,
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);
CREATE TABLE IF NOT EXISTS Origins (
OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
OriginName TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Subjects (
SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
SubjectURI TEXT UNIQUE NOT NULL,
OriginID BIGINT NOT NULL,
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);
CREATE TABLE IF NOT EXISTS Relationships (
RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
RelationshipURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Objects (
ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
ObjectURI TEXT UNIQUE NOT NULL,
OriginID BIGINT NOT NULL,
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);
CREATE TABLE IF NOT EXISTS RDFs (
RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
MovieID INTEGER NOT NULL,
SubjectID INTEGER NOT NULL,
RelationshipID INTEGER NOT NULL,
ObjectID INTEGER NOT NULL,
UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
CREATE TABLE IF NOT EXISTS Abbreviations (
AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
URI TEXT UNIQUE NOT NULL,
Abbreviation TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
SubjectID INTEGER NOT NULL,
AbbreviationID INTEGER NOT NULL,
PRIMARY KEY(SubjectID, AbbreviationID),
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);
CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
RelationshipID INTEGER NOT NULL,
AbbreviationID INTEGER NOT NULL,
PRIMARY KEY(RelationshipID, AbbreviationID),
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);
CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
ObjectID INTEGER NOT NULL,
AbbreviationID INTEGER NOT NULL,
PRIMARY KEY(ObjectID, AbbreviationID),
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
);
CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
-- Views
-- Subjects
CREATE VIEW IF NOT EXISTS ParsedSubjects
AS
SELECT
SubjectID,
CASE WHEN Abbreviation IS NULL
THEN SubjectURI
ELSE Abbreviation || ':' || replace(SubjectURI, URI, '') END
AS SubjectURI
FROM Subjects
LEFT JOIN Subjects_Abbreviations USING (SubjectID)
LEFT JOIN Abbreviations USING (AbbreviationID);
-- Relationships
CREATE VIEW IF NOT EXISTS ParsedRelationships
AS
SELECT
RelationshipID,
CASE WHEN Abbreviation IS NULL
THEN RelationshipURI
ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '') END
AS RelationshipURI
FROM Relationships
LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
LEFT JOIN Abbreviations USING (AbbreviationID);
-- Objects
CREATE VIEW IF NOT EXISTS ParsedObjects
AS
SELECT
ObjectID,
CASE WHEN Abbreviation IS NULL
THEN ObjectURI
ELSE Abbreviation || ':' || replace(ObjectURI, URI, '') END
AS ObjectURI
FROM Objects
LEFT JOIN Objects_Abbreviations USING (ObjectID)
LEFT JOIN Abbreviations USING (AbbreviationID);
-- Subject Count
CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs
AS
SELECT SubjectID, count(SubjectID) as Sub_Count
FROM RDFs
GROUP BY SubjectID;
-- Relationship Count
CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs
AS
SELECT RelationshipID, count(RelationshipID) as Rel_Count
FROM RDFs
GROUP BY RelationshipID;
-- Object Count
CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs
AS
SELECT ObjectID, count(ObjectID) as Obj_Count
FROM RDFs
GROUP BY ObjectID;
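For reference, a small Python sketch (not part of this changeset) of what the Parsed* views above compute: when a prefix/abbreviation pair is joined in, the stored prefix is stripped from the URI and replaced by the abbreviation plus a colon. The example row is shaped like the output of get_dbpedia_uris.py further down in this changeset.

def parse_uri(full_uri: str, prefix: str | None, abbreviation: str | None) -> str:
    # CASE WHEN Abbreviation IS NULL THEN <URI> ...
    if abbreviation is None or prefix is None:
        return full_uri
    # ... ELSE Abbreviation || ':' || replace(<URI>, URI, '')
    return abbreviation + ":" + full_uri.replace(prefix, "")

print(parse_uri("http://dbpedia.org/resource/Nights_of_Cabiria",
                "http://dbpedia.org/resource/", "dbp-dbr"))
# -> dbp-dbr:Nights_of_Cabiria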

View File

@@ -33,3 +33,23 @@ SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
-- Prefixes
INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
-- Please be sure the parameter is a URI before running this query,
-- and keep at least the domain and the first path part in the pattern
SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
-- Query to retrieve data
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
FROM RDFs
INNER JOIN ParsedSubjects USING (SubjectID)
INNER JOIN ParsedRelationships USING (RelationshipID)
INNER JOIN ParsedObjects USING (ObjectID)
INNER JOIN WikipediaAbstracts USING (MovieID)
-- insert WHERE here
GROUP BY MovieID;
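A hedged sketch (not repo code) of how the "URI LIKE ?" lookup above is meant to be called, following the comment: the parameter should be a URI prefix that keeps at least the scheme, the domain and the first path part, with a trailing '%' wildcard.

import sqlite3

def abbreviation_id_for(curs: sqlite3.Cursor, uri: str) -> int | None:
    sections = uri.split("/")               # e.g. ['http:', '', 'dbpedia.org', 'resource', 'Cabiria']
    pattern = "/".join(sections[:4]) + "%"  # 'http://dbpedia.org/resource%'
    curs.execute("SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;", [pattern])
    row = curs.fetchone()
    return row[0] if row else None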

View File

@@ -0,0 +1,186 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "b9081b7c",
"metadata": {},
"outputs": [],
"source": [
"# This file deletes in the pipeline the unwanted relationship by different rules\n",
"import pandas as pd\n",
"import sqlite3\n",
"import numpy as np\n",
"\n",
"\n",
"CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
"\n",
"def get_RDF() -> pd.DataFrame:\n",
" \"\"\"\n",
" QUERY = \"SELECT * FROM RDFs \" \\\n",
" \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
" \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
" \"INNER JOIN Objects USING (ObjectID);\"\n",
" RDF = pd.read_sql_query(QUERY, CONN)\n",
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
" RDF = RDF.dropna()\n",
" \"\"\"\n",
" Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
" Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
" Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
" RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
"\n",
" # drop '' values \n",
" Subjects = Subjects.replace('', np.nan)# .dropna()\n",
" Relationships = Relationships.replace('', np.nan)# .dropna()\n",
" Objects = Objects.replace('', np.nan)# .dropna()\n",
"\n",
" # join RDF with its components\n",
" RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
" RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
" RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
" return RDF\n",
"\n",
"\n",
"#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
"\n",
"def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
" return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
"\n",
"\n",
"\n",
"RDF = get_RDF()\n",
"# RDF = RDF.dropna()\n",
"# print(RDF)\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "644690bb",
"metadata": {},
"outputs": [],
"source": [
"def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
" counts = RDF[\"RelationshipURI\"].value_counts() \n",
" RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
" RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
" # counts is a series as key: relationship, value: count\n",
" # counts = counts[counts > count_treshold]\n",
" # relationships = counts.index\n",
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
" return RDF\n",
"\n",
"RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
"# print(new_RDF)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34525be6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" SubjectURI \\\n",
"0 http://dbpedia.org/resource/Nights_of_Cabiria \n",
"1 http://dbpedia.org/resource/California_Science... \n",
"2 http://dbpedia.org/resource/China_Captain \n",
"3 http://dbpedia.org/resource/Caravan_of_Courage... \n",
"4 http://dbpedia.org/resource/WHIH_Newsfront \n",
"... ... \n",
"12725500 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
"12725501 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
"12725502 http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
"12725503 http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
"12725504 http://dbpedia.org/resource/I_Won't_Play \n",
"\n",
" RelationshipURI \\\n",
"0 http://www.w3.org/2002/07/owl#differentFrom \n",
"1 http://www.w3.org/2002/07/owl#differentFrom \n",
"2 http://www.w3.org/2002/07/owl#differentFrom \n",
"3 http://www.w3.org/2002/07/owl#differentFrom \n",
"4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
"... ... \n",
"12725500 http://dbpedia.org/ontology/producer \n",
"12725501 http://dbpedia.org/ontology/producer \n",
"12725502 http://dbpedia.org/ontology/producer \n",
"12725503 http://dbpedia.org/ontology/producer \n",
"12725504 http://dbpedia.org/ontology/producer \n",
"\n",
" ObjectURI MovieID \\\n",
"0 http://dbpedia.org/resource/Cabiria 26 \n",
"1 http://dbpedia.org/resource/California_Academy... 185 \n",
"2 http://dbpedia.org/resource/Captain_China 614 \n",
"3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n",
"4 http://dbpedia.org/resource/Captain_America:_C... 594 \n",
"... ... ... \n",
"12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n",
"12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n",
"12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n",
"12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n",
"12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n",
"\n",
" RelationshipFreq MovieFreq \n",
"0 2132 216 \n",
"1 2132 264 \n",
"2 2132 66 \n",
"3 2132 131 \n",
"4 1653 133 \n",
"... ... ... \n",
"12725500 80077 95 \n",
"12725501 80077 95 \n",
"12725502 80077 41 \n",
"12725503 80077 98 \n",
"12725504 80077 91 \n",
"\n",
"[12725505 rows x 6 columns]\n"
]
}
],
"source": [
"def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
" counts = RDF[\"MovieID\"].value_counts() \n",
" RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
" RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
" RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
" # counts is a series as key: relationship, value: count\n",
" # counts = counts[counts > count_treshold]\n",
" # relationships = counts.index\n",
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
" return RDF\n",
"\n",
"RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
"print(RDF)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "deep_learning",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@@ -0,0 +1,77 @@
import argparse
import sys
class ProgramArgs:
def __init__(self, file: str, output: str, treshold: int):
self.file = file
self.output = output
self.treshold = treshold
def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "-i", required=True, type=str)
PARSER.add_argument("--output-file", "-o", required=True, type=str)
PARSER.add_argument("--treshold", "-t", type=int, default=1)
parsed_args, _ = PARSER.parse_known_args(args)
# print(parsed_args.input_file)
return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore
def print_dbpedia(file: str, out: str):
FILE = open(file, "r", encoding="utf-8")
OUT = open(out, mode="w", encoding="utf-8")
DOMAIN_PART = "dbpedia"
already_parsed : set[str] = set()
for row in FILE:
sections = row.split("/")
sections = list(filter(lambda item: item != "", sections))
# print(sections)
if len(sections) < 3:
continue
URI = "/".join(sections[1:3])
URI = "//".join([sections[0], URI])
if URI in already_parsed:
continue
DOMAIN = sections[1]
SUBDOMAINS = DOMAIN.split(".")
TYPE = sections[2][0]
if DOMAIN_PART not in SUBDOMAINS:
continue
already_parsed.add(URI)
SUB_ID = SUBDOMAINS[0]
if len(SUB_ID) > 3:
SUB_ID = SUB_ID[:3]
OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")
FILE.close()
OUT.close()
if __name__ == "__main__":
ARGS = get_args(sys.argv)
# ARGS = get_debug_args()
print_dbpedia(ARGS.file, ARGS.output)
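A worked example (not part of the commit) tracing print_dbpedia above on a DBpedia URI that appears elsewhere in this changeset:

row = "http://dbpedia.org/resource/Nights_of_Cabiria"
sections = [s for s in row.split("/") if s != ""]        # ['http:', 'dbpedia.org', 'resource', 'Nights_of_Cabiria']
uri = "//".join([sections[0], "/".join(sections[1:3])])  # 'http://dbpedia.org/resource'
sub_id = sections[1].split(".")[0][:3]                   # 'dbpedia' -> 'dbp'
type_char = sections[2][0]                               # 'r'
print(f'"{uri}/", "{sub_id}-db{type_char}"')             # "http://dbpedia.org/resource/", "dbp-dbr"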

View File

@@ -6,8 +6,16 @@ from typing import Self
class ProgramArgs:
def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
"""
Args:
file (str):
csv_uri_header (str): The name of the CSV column from which the program reads the URIs
output (str):
treshold (int):
"""
self.file = file
self.csv_uri_header = csv_uri_header
self.output = output
self.treshold = treshold
@@ -33,11 +41,15 @@ class Node:
KEY = child[0]
if not self.children.get(KEY):
# if the key has no node yet, we are traversing this branch for the first time:
# create another node for the key
self.children[KEY] = Node(KEY, 0)
# take the node for the key
CHILD = self.children[KEY]
self.quantity += 1
# if the child list has only one element (KEY itself), no further nodes will be created
if len(child) == 1:
return
@@ -53,27 +65,32 @@ def get_args(args: list[str]) -> ProgramArgs:
PARSER = argparse.ArgumentParser()
PARSER.add_argument("--input-file", "-i", required=True, type=str)
PARSER.add_argument("--header-name", "-c", required=True, type=str) # c stands for column
PARSER.add_argument("--output-file", "-o", required=True, type=str)
PARSER.add_argument("--treshold", "-t", type=int, default=1)
parsed_args, _ = PARSER.parse_known_args(args)
# print(parsed_args.input_file)
return ProgramArgs(parsed_args.input_file, parsed_args.header_name, parsed_args.output_file, parsed_args.treshold) # type: ignore
def get_debug_args() -> ProgramArgs:
# -i ./Assets/Dataset/1-hop/movies.csv -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
FILE = "./Assets/Dataset/1-hop/movies.csv"
CSV_HEADER = "subject"
OUTPUT = "./Assets/Dataset/Tmp/prova.csv"
TRESHOLD = 1
return ProgramArgs(
FILE,
CSV_HEADER,
OUTPUT,
TRESHOLD
)
def tree_like(file: str, csv_uri_header: str, out: str):
INDENTATION = " "
@@ -84,9 +101,12 @@ def tree_like(file: str, out: str):
FILE = open(file, "r", encoding="utf-8")
# TODO: Change here so it takes a single URI from a CSV file
# The header-name is needed for this
for row in csv.DictReader(FILE):
uri_element = row[csv_uri_header]
sections = uri_element.split("/")
sections = list(filter(lambda item: item != "", sections))
# print(sections)
@@ -115,7 +135,9 @@ def tree_like(file: str, out: str):
INDENT: str = INDENTATION * DEPTH
# Leaf nodes have quantity 0, so for them to appear the threshold has to be 0
# if NODE.quantity < ARGS.treshold:
if ARGS.treshold > NODE.quantity:
continue
OUT.write(f"{INDENT}- {NODE}\n")
@@ -133,7 +155,8 @@ def tree_like(file: str, out: str):
OUT.close()
if __name__ == "__main__":
ARGS = get_args(sys.argv)
# ARGS = get_debug_args()
tree_like(ARGS.file, ARGS.csv_uri_header, ARGS.output)
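A simplified sketch (not the repo's Node class, just the same idea) of the path-splitting tree that the script above builds: each URI is split on '/', every segment becomes a node, and quantity counts how many URIs passed through that node. Leaf nodes keep quantity 0, which is why the threshold has to be 0 for them to be printed.

class Node:
    def __init__(self, key: str):
        self.key = key
        self.quantity = 0
        self.children: dict[str, "Node"] = {}

    def add(self, segments: list[str]) -> None:
        if not segments:
            return
        key = segments[0]
        child = self.children.setdefault(key, Node(key))
        self.quantity += 1       # one more URI passed through this node
        child.add(segments[1:])  # the last segment becomes a leaf with quantity 0

root = Node("root")
for uri in ["http://dbpedia.org/resource/Cabiria", "http://dbpedia.org/resource/Captain_China"]:
    root.add([s for s in uri.split("/") if s != ""])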

View File

@@ -1,65 +0,0 @@
CREATE TABLE IF NOT EXISTS Movies (
MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
MovieURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS WikiPageIDs (
MovieID INTEGER PRIMARY KEY,
PageID INTEGER UNIQUE NOT NULL,
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
MovieID INTEGER PRIMARY KEY,
Abstract TEXT NOT NULL,
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);
CREATE TABLE IF NOT EXISTS Origins (
OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
OriginName TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Subjects (
SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
SubjectURI TEXT UNIQUE NOT NULL,
OriginID BIGINT NOT NULL,
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);
CREATE TABLE IF NOT EXISTS Relationships (
RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
RelationshipURI TEXT UNIQUE NOT NULL
);
CREATE TABLE IF NOT EXISTS Objects (
ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
ObjectURI TEXT UNIQUE NOT NULL,
OriginID BIGINT NOT NULL,
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);
CREATE TABLE IF NOT EXISTS RDFs (
RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
MovieID INTEGER NOT NULL,
SubjectID INTEGER NOT NULL,
RelationshipID INTEGER NOT NULL,
ObjectID INTEGER NOT NULL,
UNIQUE(SubjectID, RelationshipID, ObjectID),
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);

View File

@@ -8,7 +8,7 @@ import csv
#####################################################################
# sometimes you may need to build a new db file, here is a little snippet for you
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
@@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
@@ -30,7 +33,8 @@ CURS = CONN.cursor()
# MARK: SQL Definitions
# Insert MovieURI
def insertOrigin(curs: sqlite3.Cursor) -> bool:
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
try:
@@ -38,24 +42,26 @@ def insertOrigin(curs : sqlite3.Cursor ) -> bool:
return True
except sqlite3.IntegrityError:
return False
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
curs.execute(QUERY, [originName])
originId = curs.fetchone()
if not originId:
return None
# in this case the real id is the first element of the tuple
return originId[0]
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
try:
curs.execute(QUERY, [movieUri])
return True
except sqlite3.IntegrityError:
return False
@@ -64,12 +70,12 @@ def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool:
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
curs.execute(QUERY, [movieUri])
movieId = curs.fetchone()
if not movieId:
return None
# in this case the real id is the first element of the tuple
return movieId[0]
@@ -77,105 +83,164 @@ def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
try:
curs.execute(QUERY, [movieId, pageId])
return True
except sqlite3.IntegrityError:
return False
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
curs.execute(QUERY, [pageId])
movieId = curs.fetchone()
if not movieId:
return None
# in this case the real id is the first element of the tuple
return movieId[0]
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
try:
curs.execute(QUERY, [movieId, abstract])
return True
except sqlite3.IntegrityError:
return False
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
try:
curs.execute(QUERY, [subjectURI, originID])
return True
except sqlite3.IntegrityError:
return False
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
try:
curs.execute(QUERY, [relationshipURI])
return True
except sqlite3.IntegrityError:
return False
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
try:
curs.execute(QUERY, [objectURI, originID])
return True
except sqlite3.IntegrityError:
return False
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
curs.execute(QUERY, [subjectURI])
subjectId = curs.fetchone()
if not subjectId:
return None
# in this case the real id is the first element of the tuple
return subjectId[0]
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
curs.execute(QUERY, [relationshipURI])
relationshipId = curs.fetchone()
if not relationshipId:
return None
# in this case the real id is the first element of the tuple
return relationshipId[0]
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
curs.execute(QUERY, [objectURI])
objectId = curs.fetchone()
if not objectId:
return None
# in this case the real id is the first element of the tuple
return objectId[0]
def insertRDF(
curs: sqlite3.Cursor,
movieId: int,
subjectId: int,
relationshipId: int,
objectId: int,
) -> bool:
QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
try:
curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_abbreviation(uri, abbreviation) -> bool:
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
try:
CURS.execute(QUERY, [uri, abbreviation])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
try:
CURS.execute(QUERY, [object_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
try:
CURS.execute(QUERY, [relationship_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
QUERY = (
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
)
try:
CURS.execute(QUERY, [subject_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def select_abbreviation_id(uri) -> int | None:
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
CURS.execute(QUERY, [uri])
abbreviation_id = CURS.fetchone()
if not abbreviation_id:
return None
# in this case the real id is the first element of the tuple
return abbreviation_id[0]
# MARK: Parsing
def parseMovies():
@@ -203,12 +268,11 @@ def parseWikiPageId():
def parseAbstract():
CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
for row in CSV_READER:
WIKI_PAGE_ID = int(row["subject"])
ABSTRACT = row["text"]
MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
if MOVIE_ID is None:
print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
continue
@@ -216,10 +280,24 @@ def parseAbstract():
insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
def parseAbbreviations():
URI_CSV = csv.DictReader(URI_ABBR_CSV_HANDLER)
for row in URI_CSV:
URI = row["uri"]
ABBREVIATION = row["abbreviation"]
insert_abbreviation(URI, ABBREVIATION)
def parseRDF_Reverse():
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
if REVERSE_ORIGIN_ID is None:
return
total = 0
for row in REVERSE_CSV_READER:
@@ -227,7 +305,7 @@ def parseRDF_Reverse():
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
insertRelationship(CURS, RELATIONSHIP)
insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
@@ -236,7 +314,6 @@ def parseRDF_Reverse():
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
MOVIE_ID = selectMovieId(CURS, OBJECT)
skip = False
# guard
@@ -259,17 +336,19 @@ def parseRDF_Reverse():
if skip:
continue
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
total += 1
print(total)
def parseRDF_Dataset():
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
if DATASET_ORIGIN_ID is None:
return
total = 0
rdf_idx = 0
@@ -284,7 +363,7 @@ def parseRDF_Dataset():
if rdf_idx % 100000 == 0:
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
insertRelationship(CURS, RELATIONSHIP)
insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
@@ -293,7 +372,6 @@ def parseRDF_Dataset():
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
MOVIE_ID = selectMovieId(CURS, SUBJECT)
skip = False
# guard
@@ -316,24 +394,203 @@ def parseRDF_Dataset():
if skip:
continue
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
total += 1
print(total)
def parseAbbr_Reverse():
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
if REVERSE_ORIGIN_ID is None:
return
total = 0
for row in REVERSE_CSV_READER:
SUBJECT = row["subject"]
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
OBJECT_ID = selectObjectId(CURS, OBJECT)
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
SUB_SECTIONS = SUBJECT.split("/")
REL_SECTIONS = RELATIONSHIP.split("/")
OBJ_SECTIONS = OBJECT.split("/")
SUB_ABBR_ID = None
REL_ABBR_ID = None
OBJ_ABBR_ID = None
skip = False
# guard
if SUBJECT_ID is None:
print(f"No SubjectId for {SUBJECT}")
skip = True
if OBJECT_ID is None:
print(f"No ObjectId for {OBJECT}")
skip = True
if RELATIONSHIP_ID is None:
print(f"No RelationshipId for {RELATIONSHIP}")
skip = True
if skip:
continue
if len(SUB_SECTIONS) > 4:
index = min(len(SUB_SECTIONS), 7)
while index > 3:
PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
SUB_ABBR_ID = select_abbreviation_id(PATH)
if SUB_ABBR_ID is not None:
if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
total += 1
index = 0
index -= 1
if len(REL_SECTIONS) > 4:
index = min(len(REL_SECTIONS), 7)
while index > 2:
PATH = "/".join(REL_SECTIONS[0:index]) + "%"
REL_ABBR_ID = select_abbreviation_id(PATH)
if REL_ABBR_ID is not None:
if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
total += 1
index = 0
index -= 1
if len(OBJ_SECTIONS) > 4:
index = min(len(OBJ_SECTIONS), 7)
while index > 3:
PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
OBJ_ABBR_ID = select_abbreviation_id(PATH)
if OBJ_ABBR_ID is not None:
if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
total += 1
index = 0
index -= 1
print(total)
def parseAbbr_Dataset():
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
if DATASET_ORIGIN_ID is None:
return
total = 0
rdf_idx = 0
for row in DATASET_CSV_READER:
SUBJECT = row["subject"]
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
rdf_idx += 1
if rdf_idx % 100000 == 0:
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
OBJECT_ID = selectObjectId(CURS, OBJECT)
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
SUB_SECTIONS = SUBJECT.split("/")
REL_SECTIONS = RELATIONSHIP.split("/")
OBJ_SECTIONS = OBJECT.split("/")
SUB_ABBR_ID = None
REL_ABBR_ID = None
OBJ_ABBR_ID = None
skip = False
# guard
if SUBJECT_ID is None:
print(f"No SubjectId for {SUBJECT}")
skip = True
if OBJECT_ID is None:
print(f"No ObjectId for {OBJECT}")
skip = True
if RELATIONSHIP_ID is None:
print(f"No RelationshipId for {RELATIONSHIP}")
skip = True
if skip:
continue
if len(SUB_SECTIONS) > 4:
index = min(len(SUB_SECTIONS), 7)
while index > 3:
PATH = "/".join(SUB_SECTIONS[0:index]) + "%"
SUB_ABBR_ID = select_abbreviation_id(PATH)
if SUB_ABBR_ID is not None:
if insert_subject_abbreviation(SUBJECT_ID, SUB_ABBR_ID):
total += 1
index = 0
index -= 1
if len(REL_SECTIONS) > 4:
index = min(len(REL_SECTIONS), 7)
while index > 2:
PATH = "/".join(REL_SECTIONS[0:index]) + "%"
REL_ABBR_ID = select_abbreviation_id(PATH)
if REL_ABBR_ID is not None:
if insert_relationship_abbreviation(RELATIONSHIP_ID, REL_ABBR_ID):
total += 1
index = 0
index -= 1
if len(OBJ_SECTIONS) > 4:
index = min(len(OBJ_SECTIONS), 7)
while index > 3:
PATH = "/".join(OBJ_SECTIONS[0:index]) + "%"
OBJ_ABBR_ID = select_abbreviation_id(PATH)
if OBJ_ABBR_ID is not None:
if insert_object_abbreviation(OBJECT_ID, OBJ_ABBR_ID):
total += 1
index = 0
index -= 1
print(total)
# MARK: Actual Code
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseAbbreviations()
# parseRDF_Reverse()
# parseRDF_Dataset()
# parseAbbr_Reverse()
parseAbbr_Dataset()
CONN.commit()
CONN.close()
MOVIES_CSV_HANDLER.close()
@@ -341,35 +598,36 @@ PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
URI_ABBR_CSV_HANDLER.close()
"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""
"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""

View File

@@ -0,0 +1,826 @@
{
"type": "excalidraw",
"version": 2,
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
"elements": [
{
"type": "line",
"version": 4622,
"versionNonce": 1623045672,
"isDeleted": false,
"id": "twu_PiAvEuQ4l1YYtZLET",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 289.8504963515835,
"y": 91.87474806402287,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.09201683999922,
"height": 99.49948667804088,
"seed": 1975340120,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
0.2542098813493443,
75.20117273657175
],
[
0.011896425679918422,
83.76249969444815
],
[
3.970409367559332,
87.46174320643391
],
[
17.75573317066317,
90.59250103325854
],
[
41.05683533152865,
91.56737225214069
],
[
63.319497586673116,
90.01084754868091
],
[
75.14781395923075,
86.28844687220405
],
[
76.81603792670788,
83.15042405259751
],
[
77.05033394391478,
76.25776215104557
],
[
76.86643881413028,
6.3089586511537865
],
[
76.45188016352971,
-0.2999144698665015
],
[
71.50179495549581,
-3.9936571317850627
],
[
61.077971898861186,
-6.132877429442784
],
[
37.32348754161154,
-7.932114425900202
],
[
18.278415656797975,
-6.859225353587373
],
[
3.2995959613238286,
-3.2201165291205287
],
[
-0.04168289608444441,
-0.045185660461322996
],
[
0,
0
]
],
"index": "a1",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2327,
"versionNonce": 1593094440,
"isDeleted": false,
"id": "hmJk4dH9VpOsfkrCTkhvh",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 290.3744257898585,
"y": 149.00103172175278,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 637665624,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a2",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "line",
"version": 2413,
"versionNonce": 311708712,
"isDeleted": false,
"id": "X1ldVIXm4DfBal5N2Pwn9",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 289.3425684673547,
"y": 120.03697638652972,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 77.17198221193564,
"height": 8.562348957853036,
"seed": 904402520,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "round",
"boundElementIds": [],
"startBinding": null,
"endBinding": null,
"lastCommittedPoint": null,
"startArrowhead": null,
"endArrowhead": null,
"points": [
[
0,
0
],
[
2.033150371639873,
3.413095389435587
],
[
10.801287372573954,
6.276651055277943
],
[
22.468666942209353,
8.010803051612635
],
[
40.747074201802775,
8.168828515515864
],
[
62.077348233027564,
7.0647721921469495
],
[
74.53446931782398,
3.04824021069218
],
[
77.17198221193564,
-0.3935204423371723
]
],
"index": "a3",
"frameId": null,
"roundness": {
"type": 2
},
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 5410,
"versionNonce": 92833576,
"isDeleted": false,
"id": "CFhp5ZxSVwHYzGUj4hEn1",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 288.28461948527263,
"y": 84.74247943834126,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 76.59753601865496,
"height": 15.49127539284798,
"seed": 1782811480,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [
"bxuMGTzXLn7H-uBCptINx"
],
"index": "a4",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 820,
"versionNonce": 608002600,
"isDeleted": false,
"id": "B43R7rWwK2_vdiRHBSSPk",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 324.77660659049513,
"y": 109.21914711824485,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 1298686040,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "a5",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 1108,
"versionNonce": 1839127848,
"isDeleted": false,
"id": "CkKMb9wkJfVk04T217zSs",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 325.12774837442873,
"y": 135.43576140530996,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 2133497176,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "a6",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "ellipse",
"version": 991,
"versionNonce": 588838952,
"isDeleted": false,
"id": "SHJdKeQPkfpvzSoNH--3o",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 6.239590202363168,
"x": 325.77660659049513,
"y": 164.20448797661635,
"strokeColor": "#000000",
"backgroundColor": "#228be6",
"width": 11.226103154161754,
"height": 12.183758484455605,
"seed": 81668696,
"groupIds": [
"9PT4BXPfQ6UoCaB-T-h9A",
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"index": "a7",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false
},
{
"type": "text",
"version": 489,
"versionNonce": 2023207720,
"isDeleted": false,
"id": "vUSyMBPup0jZ71CYXKyGb",
"fillStyle": "solid",
"strokeWidth": 1,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"angle": 0,
"x": 280.1846389770508,
"y": 185.79462957545917,
"strokeColor": "#000000",
"backgroundColor": "#a5d8ff",
"width": 95.63072204589844,
"height": 23.595161071904883,
"seed": 425140056,
"groupIds": [
"dp_TZJyYdyPIH1hOkAPlb"
],
"strokeSharpness": "sharp",
"boundElementIds": [],
"fontSize": 17.4778970902999,
"fontFamily": 1,
"text": "dataset.db",
"baseline": 16.595161071904883,
"textAlign": "center",
"verticalAlign": "top",
"index": "a8",
"frameId": null,
"roundness": null,
"boundElements": [],
"updated": 1758646548051,
"link": null,
"locked": false,
"containerId": null,
"originalText": "dataset.db",
"autoResize": true,
"lineHeight": 1.350000000000001
},
{
"id": "R7pU0VP6CFKCAwuvt0xsr",
"type": "text",
"x": 295.5,
"y": 342,
"width": 374,
"height": 225,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "a9",
"roundness": null,
"seed": 705463336,
"version": 1130,
"versionNonce": 72522328,
"isDeleted": false,
"boundElements": null,
"updated": 1758648226024,
"link": null,
"locked": false,
"text": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "G1xIRcJgm34_NMEWQFFlW",
"type": "text",
"x": 1419.5,
"y": 110,
"width": 253,
"height": 75,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aA",
"roundness": null,
"seed": 651981400,
"version": 256,
"versionNonce": 138082856,
"isDeleted": false,
"boundElements": null,
"updated": 1758646570344,
"link": null,
"locked": false,
"text": "class Pipeline\n - actions: [Action]\n ",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Pipeline\n - actions: [Action]\n ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "TBVy3JbJCkbA9kjVEJ8lv",
"type": "text",
"x": 694,
"y": 100,
"width": 495,
"height": 150,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aB",
"roundness": null,
"seed": 680960040,
"version": 560,
"versionNonce": 85012520,
"isDeleted": false,
"boundElements": null,
"updated": 1758649442239,
"link": null,
"locked": false,
"text": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "an7KRTzWpCytKNKgHftKC",
"type": "text",
"x": 1528.5,
"y": 365.5,
"width": 187,
"height": 150,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aC",
"roundness": null,
"seed": 1974317656,
"version": 306,
"versionNonce": 1574962264,
"isDeleted": false,
"boundElements": null,
"updated": 1758648154009,
"link": null,
"locked": false,
"text": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "2pQ5EULirrWs_QZPbClhh",
"type": "text",
"x": 785,
"y": 332.5,
"width": 418,
"height": 375,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aH",
"roundness": null,
"seed": 1402251560,
"version": 742,
"versionNonce": 680432168,
"isDeleted": false,
"boundElements": null,
"updated": 1758649532881,
"link": null,
"locked": false,
"text": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "O0fso8DJqFfwJEzmpUikM",
"type": "text",
"x": 1289,
"y": 195,
"width": 594,
"height": 100,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aI",
"roundness": null,
"seed": 1582329944,
"version": 459,
"versionNonce": 1080077144,
"isDeleted": false,
"boundElements": null,
"updated": 1758647067031,
"link": null,
"locked": false,
"text": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "v0kzO6vlBWOdJCV3yoG69",
"type": "text",
"x": 1379.5,
"y": 718.5,
"width": 286,
"height": 175,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aL",
"roundness": null,
"seed": 1462407976,
"version": 635,
"versionNonce": 1012998696,
"isDeleted": false,
"boundElements": null,
"updated": 1758649495598,
"link": null,
"locked": false,
"text": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "WK34n9xeVxntypCtrlK6p",
"type": "text",
"x": 256.5,
"y": 787.5,
"width": 517,
"height": 175,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aM",
"roundness": null,
"seed": 1166526296,
"version": 318,
"versionNonce": 1042162520,
"isDeleted": false,
"boundElements": null,
"updated": 1758649002604,
"link": null,
"locked": false,
"text": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "NY9jyUFLFFCNPE2sh00SX",
"type": "text",
"x": 1639,
"y": 606.5,
"width": 407,
"height": 200,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aP",
"roundness": null,
"seed": 20345896,
"version": 168,
"versionNonce": 627282472,
"isDeleted": false,
"boundElements": null,
"updated": 1758649426380,
"link": null,
"locked": false,
"text": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
"autoResize": true,
"lineHeight": 1.25
},
{
"id": "SkhaoW-3TTKDZzEii3Lf6",
"type": "text",
"x": 1457.5,
"y": 955.5,
"width": 121,
"height": 50,
"angle": 0,
"strokeColor": "#1e1e1e",
"backgroundColor": "#228be6",
"fillStyle": "solid",
"strokeWidth": 2,
"strokeStyle": "solid",
"roughness": 1,
"opacity": 100,
"groupIds": [],
"frameId": null,
"index": "aQ",
"roundness": null,
"seed": 2071523672,
"version": 37,
"versionNonce": 105260376,
"isDeleted": false,
"boundElements": null,
"updated": 1758648834435,
"link": null,
"locked": false,
"text": "class Dump:\n -",
"fontSize": 20,
"fontFamily": 8,
"textAlign": "left",
"verticalAlign": "top",
"containerId": null,
"originalText": "class Dump:\n -",
"autoResize": true,
"lineHeight": 1.25
}
],
"appState": {
"gridSize": 20,
"gridStep": 5,
"gridModeEnabled": false,
"viewBackgroundColor": "#ffffff"
},
"files": {}
}
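The Excalidraw diagram above sketches the class model for the cleaning pipeline (Pipeline, Action, ActionTypes, Extract, Aggregate, Filter, Map, Dump, Association). As an illustration only, a minimal Python skeleton of a few of those classes, with names and fields taken from the diagram's text boxes and everything else assumed:

from dataclasses import dataclass, field
from enum import Enum, auto
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional


class ActionTypes(Enum):
    Extract = auto()
    Aggregate = auto()
    Filter = auto()
    Map = auto()
    Dump = auto()


@dataclass
class Action:
    name: str
    depends_on: List[str] = field(default_factory=list)
    type: ActionTypes = ActionTypes.Extract

    def execute(self, mem: Dict[str, Any]) -> Optional[List[Dict[str, Any]]]:
        raise NotImplementedError


@dataclass
class Extract(Action):
    type: ActionTypes = ActionTypes.Extract
    db_connection: Optional[Path] = None
    query: str = ""
    query_parameters: List[str] = field(default_factory=list)
    output_mapper: List[str] = field(default_factory=list)


@dataclass
class Filter(Action):
    type: ActionTypes = ActionTypes.Filter
    compare: Callable[[Dict[str, Any]], bool] = lambda row: True
    output_mapper: List[str] = field(default_factory=list)


@dataclass
class Pipeline:
    actions: List[Action] = field(default_factory=list)

Aggregate, Map, Dump and Association would follow the same pattern.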