Compare commits
30 Commits
dev.splitt
...
dev.report
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9440a562f2 | ||
|
|
5eda131aac | ||
|
|
57884eaf2e | ||
|
|
4548a683c2 | ||
|
|
3eec49ffa5 | ||
|
|
0bc7f4b227 | ||
|
|
f28952b0a2 | ||
|
|
0b626a8e09 | ||
|
|
b254098532 | ||
|
|
ee88ffe4cf | ||
|
|
70b4bd8645 | ||
|
|
6316d2bfc4 | ||
|
|
87ca748f45 | ||
|
|
4315d70109 | ||
|
|
9a5d633b5e | ||
|
|
a6760cd52d | ||
|
|
a7eb92227d | ||
|
|
9f221e31cd | ||
|
|
47197194d5 | ||
|
|
0cdbf6f624 | ||
|
|
3e30489f86 | ||
|
|
8a22e453e4 | ||
|
|
7feb4eb857 | ||
|
|
70af19d356 | ||
|
|
a4b44ab2ee | ||
|
|
74b6b609dd | ||
|
|
59796c37cb | ||
|
|
f696f5950b | ||
|
|
605b496da7 | ||
|
|
7d693964dd |
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
LFS
Normal file
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
LFS
Normal file
Binary file not shown.
|
Binary file not shown.
30
Scripts/DataBaseQueries/dataset.sql
Normal file
30
Scripts/DataBaseQueries/dataset.sql
Normal file
@@ -0,0 +1,30 @@
|
||||
-- Pandas feed 1: every RDF row together with the columns of its subject,
-- relationship and object rows (USING keeps a single copy of each join key).
SELECT *
FROM RDFs
    INNER JOIN Subjects      USING (SubjectID)
    INNER JOIN Relationships USING (RelationshipID)
    INNER JOIN Objects       USING (ObjectID);

-- Pandas feed 2: every RDF row together with the Wikipedia abstract of its
-- movie; movies without an abstract row are dropped by the inner join.
SELECT *
FROM RDFs
    INNER JOIN WikipediaAbstracts USING (MovieID);

-- Pandas feed 3: the complete abbreviation table.
SELECT *
FROM Abbreviations;
|
||||
|
||||
-- Clean dataset: one row per movie. All triples of the movie are serialised
-- into a single tagged string (no separator between triples) and returned
-- next to the movie's Wikipedia abstract.
-- The *CountInRDFs views are joined but not selected; presumably they are
-- there so the WHERE clause below can filter on Sub_Count / Rel_Count /
-- Obj_Count (confirm with the caller).
SELECT
    MovieID,
    GROUP_CONCAT(
        '<SOT>' || '<SUB>' || SubjectURI
            || '<REL>' || RelationshipURI
            || '<OBJ>' || ObjectURI
            || '<EOT>',
        ''
    ) AS RDF_String,
    Abstract
FROM RDFs
    INNER JOIN SubjectsCountInRDFs      USING (SubjectID)
    INNER JOIN RelationshipsCountInRDFs USING (RelationshipID)
    INNER JOIN ObjectsCountInRDFs       USING (ObjectID)
    INNER JOIN ParsedSubjects           USING (SubjectID)
    INNER JOIN ParsedRelationships      USING (RelationshipID)
    INNER JOIN ParsedObjects            USING (ObjectID)
    INNER JOIN WikipediaAbstracts       USING (MovieID)
-- insert WHERE here
-- WHERE SubjectID = 134626
GROUP BY MovieID;
|
||||
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
@@ -0,0 +1,174 @@
|
||||
-- Movies: one row per film; MovieURI is the natural key, MovieID the surrogate.
CREATE TABLE IF NOT EXISTS Movies (
    MovieID  INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT    NOT NULL UNIQUE
);

-- WikiPageIDs: at most one page id per movie, and each page id used once.
CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY REFERENCES Movies(MovieID),
    PageID  INTEGER NOT NULL UNIQUE
);

-- WikipediaAbstracts: at most one abstract text per movie.
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID  INTEGER PRIMARY KEY REFERENCES Movies(MovieID),
    Abstract TEXT    NOT NULL
);

-- Origins: lookup table naming where a subject/object row came from.
CREATE TABLE IF NOT EXISTS Origins (
    OriginID   INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT    NOT NULL UNIQUE
);

-- Subjects: distinct subject URIs, each tagged with its origin.
CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID  INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT    NOT NULL UNIQUE,
    OriginID   BIGINT  NOT NULL REFERENCES Origins(OriginID)
);

-- Relationships: distinct relationship (predicate) URIs.
CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID  INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT    NOT NULL UNIQUE
);

-- Objects: distinct object URIs, each tagged with its origin.
CREATE TABLE IF NOT EXISTS Objects (
    ObjectID  INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT    NOT NULL UNIQUE,
    OriginID  BIGINT  NOT NULL REFERENCES Origins(OriginID)
);
|
||||
|
||||
-- RDFs: one row per (movie, subject, relationship, object) triple.
-- The same triple may appear for several movies, but only once per movie.
CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID         INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID        INTEGER NOT NULL REFERENCES Movies(MovieID),
    SubjectID      INTEGER NOT NULL REFERENCES Subjects(SubjectID),
    RelationshipID INTEGER NOT NULL REFERENCES Relationships(RelationshipID),
    ObjectID       INTEGER NOT NULL REFERENCES Objects(ObjectID),
    UNIQUE (MovieID, SubjectID, RelationshipID, ObjectID)
);

-- Single-column lookup indexes, one per foreign key of RDFs.
-- NOTE(review): the UNIQUE constraint above already yields an index whose
-- leading column is MovieID, so idx_rdf_movie_id may be redundant; check
-- with EXPLAIN QUERY PLAN before dropping it.
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id        ON RDFs (MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id      ON RDFs (SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs (RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id       ON RDFs (ObjectID);
|
||||
|
||||
-- Abbreviations: maps a URI prefix to a short label; both sides are unique.
CREATE TABLE IF NOT EXISTS Abbreviations (
    AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
    URI            TEXT    NOT NULL UNIQUE,
    Abbreviation   TEXT    NOT NULL UNIQUE
);

-- Junction tables: which abbreviation applies to which subject,
-- relationship or object. Each pair can be stored only once.
CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
    SubjectID      INTEGER NOT NULL REFERENCES Subjects(SubjectID),
    AbbreviationID INTEGER NOT NULL REFERENCES Abbreviations(AbbreviationID),
    PRIMARY KEY (SubjectID, AbbreviationID)
);

CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
    RelationshipID INTEGER NOT NULL REFERENCES Relationships(RelationshipID),
    AbbreviationID INTEGER NOT NULL REFERENCES Abbreviations(AbbreviationID),
    PRIMARY KEY (RelationshipID, AbbreviationID)
);

CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
    ObjectID       INTEGER NOT NULL REFERENCES Objects(ObjectID),
    AbbreviationID INTEGER NOT NULL REFERENCES Abbreviations(AbbreviationID),
    PRIMARY KEY (ObjectID, AbbreviationID)
);

-- Lookup indexes on both sides of every junction table.
-- NOTE(review): each composite primary key already leads with the entity id,
-- so the *_sub_id / *_rel_id / *_obj_id indexes may be redundant; confirm
-- before removing.
CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id  ON Subjects_Abbreviations (SubjectID);
CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations (AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id  ON Relationships_Abbreviations (RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations (AbbreviationID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id  ON Objects_Abbreviations (ObjectID);
CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations (AbbreviationID);
|
||||
|
||||
-- Views
-- The three Parsed* views expose each URI in shortened form: when an
-- abbreviation is linked to the row, the abbreviation's URI text is removed
-- from the full URI and "<Abbreviation>:" is put in front; rows without a
-- linked abbreviation keep their full URI. The LEFT JOINs keep such rows.

-- Subjects
CREATE VIEW IF NOT EXISTS ParsedSubjects AS
SELECT
    SubjectID,
    CASE
        WHEN Abbreviation IS NOT NULL
            THEN Abbreviation || ':' || replace(SubjectURI, URI, '')
        ELSE SubjectURI
    END AS SubjectURI
FROM Subjects
    LEFT JOIN Subjects_Abbreviations USING (SubjectID)
    LEFT JOIN Abbreviations          USING (AbbreviationID);

-- Relationships
CREATE VIEW IF NOT EXISTS ParsedRelationships AS
SELECT
    RelationshipID,
    CASE
        WHEN Abbreviation IS NOT NULL
            THEN Abbreviation || ':' || replace(RelationshipURI, URI, '')
        ELSE RelationshipURI
    END AS RelationshipURI
FROM Relationships
    LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
    LEFT JOIN Abbreviations               USING (AbbreviationID);

-- Objects
CREATE VIEW IF NOT EXISTS ParsedObjects AS
SELECT
    ObjectID,
    CASE
        WHEN Abbreviation IS NOT NULL
            THEN Abbreviation || ':' || replace(ObjectURI, URI, '')
        ELSE ObjectURI
    END AS ObjectURI
FROM Objects
    LEFT JOIN Objects_Abbreviations USING (ObjectID)
    LEFT JOIN Abbreviations         USING (AbbreviationID);
|
||||
|
||||
|
||||
-- Usage counters: how many RDF rows reference each subject, relationship
-- and object. The grouped columns are declared NOT NULL in RDFs, so
-- COUNT(*) counts exactly the rows that reference the id.

-- Subject Count
CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs AS
SELECT
    SubjectID,
    COUNT(*) AS Sub_Count
FROM RDFs
GROUP BY SubjectID;

-- Relationship Count
CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs AS
SELECT
    RelationshipID,
    COUNT(*) AS Rel_Count
FROM RDFs
GROUP BY RelationshipID;

-- Object Count
CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs AS
SELECT
    ObjectID,
    COUNT(*) AS Obj_Count
FROM RDFs
GROUP BY ObjectID;
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -33,3 +33,23 @@ SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
|
||||
|
||||
|
||||
INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
|
||||
|
||||
-- Prefixes: register an abbreviation, then link it to objects,
-- relationships and subjects through the junction tables.
INSERT INTO Abbreviations (URI, Abbreviation)
VALUES (?, ?);

INSERT INTO Objects_Abbreviations (ObjectID, AbbreviationID)
VALUES (?, ?);

INSERT INTO Relationships_Abbreviations (RelationshipID, AbbreviationID)
VALUES (?, ?);

INSERT INTO Subjects_Abbreviations (SubjectID, AbbreviationID)
VALUES (?, ?);

-- Before running this lookup, make sure the bound value really is a URI,
-- and keep at least the domain plus the first path segment of it.
SELECT AbbreviationID
FROM Abbreviations
WHERE URI LIKE ?;

-- Query to retrieve data: one row per movie, with all of its abbreviated
-- triples concatenated into one tagged string next to the abstract.
SELECT
    MovieID,
    GROUP_CONCAT(
        '<SOT>' || '<SUB>' || SubjectURI
            || '<REL>' || RelationshipURI
            || '<OBJ>' || ObjectURI
            || '<EOT>',
        ''
    ) AS RDF_String,
    Abstract
FROM RDFs
    INNER JOIN ParsedSubjects      USING (SubjectID)
    INNER JOIN ParsedRelationships USING (RelationshipID)
    INNER JOIN ParsedObjects       USING (ObjectID)
    INNER JOIN WikipediaAbstracts  USING (MovieID)
-- insert WHERE here
GROUP BY MovieID;
|
||||
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
@@ -0,0 +1,186 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b9081b7c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This file deletes in the pipeline the unwanted relationship by different rules\n",
|
||||
"import pandas as pd\n",
|
||||
"import sqlite3\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
|
||||
"\n",
|
||||
"def get_RDF() -> pd.DataFrame:\n",
|
||||
" \"\"\"\n",
|
||||
" QUERY = \"SELECT * FROM RDFs \" \\\n",
|
||||
" \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
|
||||
" \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
|
||||
" \"INNER JOIN Objects USING (ObjectID);\"\n",
|
||||
" RDF = pd.read_sql_query(QUERY, CONN)\n",
|
||||
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
|
||||
" RDF = RDF.dropna()\n",
|
||||
" \"\"\"\n",
|
||||
" Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
|
||||
" Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
|
||||
" Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
|
||||
" RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
|
||||
"\n",
|
||||
" # drop '' values \n",
|
||||
" Subjects = Subjects.replace('', np.nan)# .dropna()\n",
|
||||
" Relationships = Relationships.replace('', np.nan)# .dropna()\n",
|
||||
" Objects = Objects.replace('', np.nan)# .dropna()\n",
|
||||
"\n",
|
||||
" # join RDF with its components\n",
|
||||
" RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
|
||||
" RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
|
||||
" RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
|
||||
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
|
||||
" return RDF\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
|
||||
"\n",
|
||||
"def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
|
||||
" return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"RDF = get_RDF()\n",
|
||||
"# RDF = RDF.dropna()\n",
|
||||
"# print(RDF)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "644690bb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
|
||||
" counts = RDF[\"RelationshipURI\"].value_counts() \n",
|
||||
" RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
|
||||
" RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
|
||||
" # counts is a series as key: relationship, value: count\n",
|
||||
" # counts = counts[counts > count_treshold]\n",
|
||||
" # relationships = counts.index\n",
|
||||
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||
" return RDF\n",
|
||||
"\n",
|
||||
"RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
|
||||
"# print(new_RDF)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "34525be6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" SubjectURI \\\n",
|
||||
"0 http://dbpedia.org/resource/Nights_of_Cabiria \n",
|
||||
"1 http://dbpedia.org/resource/California_Science... \n",
|
||||
"2 http://dbpedia.org/resource/China_Captain \n",
|
||||
"3 http://dbpedia.org/resource/Caravan_of_Courage... \n",
|
||||
"4 http://dbpedia.org/resource/WHIH_Newsfront \n",
|
||||
"... ... \n",
|
||||
"12725500 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||
"12725501 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||
"12725502 http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
|
||||
"12725503 http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
|
||||
"12725504 http://dbpedia.org/resource/I_Won't_Play \n",
|
||||
"\n",
|
||||
" RelationshipURI \\\n",
|
||||
"0 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"1 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"2 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"3 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
|
||||
"... ... \n",
|
||||
"12725500 http://dbpedia.org/ontology/producer \n",
|
||||
"12725501 http://dbpedia.org/ontology/producer \n",
|
||||
"12725502 http://dbpedia.org/ontology/producer \n",
|
||||
"12725503 http://dbpedia.org/ontology/producer \n",
|
||||
"12725504 http://dbpedia.org/ontology/producer \n",
|
||||
"\n",
|
||||
" ObjectURI MovieID \\\n",
|
||||
"0 http://dbpedia.org/resource/Cabiria 26 \n",
|
||||
"1 http://dbpedia.org/resource/California_Academy... 185 \n",
|
||||
"2 http://dbpedia.org/resource/Captain_China 614 \n",
|
||||
"3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n",
|
||||
"4 http://dbpedia.org/resource/Captain_America:_C... 594 \n",
|
||||
"... ... ... \n",
|
||||
"12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n",
|
||||
"12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n",
|
||||
"12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n",
|
||||
"12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n",
|
||||
"12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n",
|
||||
"\n",
|
||||
" RelationshipFreq MovieFreq \n",
|
||||
"0 2132 216 \n",
|
||||
"1 2132 264 \n",
|
||||
"2 2132 66 \n",
|
||||
"3 2132 131 \n",
|
||||
"4 1653 133 \n",
|
||||
"... ... ... \n",
|
||||
"12725500 80077 95 \n",
|
||||
"12725501 80077 95 \n",
|
||||
"12725502 80077 41 \n",
|
||||
"12725503 80077 98 \n",
|
||||
"12725504 80077 91 \n",
|
||||
"\n",
|
||||
"[12725505 rows x 6 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
|
||||
" counts = RDF[\"MovieID\"].value_counts() \n",
|
||||
" RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
|
||||
" RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
|
||||
" RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
|
||||
" # counts is a series as key: relationship, value: count\n",
|
||||
" # counts = counts[counts > count_treshold]\n",
|
||||
" # relationships = counts.index\n",
|
||||
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||
" return RDF\n",
|
||||
"\n",
|
||||
"RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
|
||||
"print(RDF)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
@@ -0,0 +1,77 @@
|
||||
import argparse
|
||||
import sys
|
||||
|
||||
|
||||
|
||||
class ProgramArgs:
    """Plain holder for the options this script was started with."""

    def __init__(self, file: str, output: str, treshold: int):
        """Store the three options as public attributes of the same name.

        Args:
            file (str): value exposed as ``self.file``.
            output (str): value exposed as ``self.output``.
            treshold (int): value exposed as ``self.treshold``.
        """
        self.file, self.output, self.treshold = file, output, treshold
|
||||
|
||||
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the command-line tokens into a ProgramArgs instance.

    Args:
        args (list[str]): raw tokens, e.g. ``sys.argv``.

    Returns:
        ProgramArgs: built from ``--input-file/-i`` (required),
        ``--output-file/-o`` (required) and ``--treshold/-t`` (int, default 1).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    # parse_known_args returns (namespace, leftovers); unrecognised tokens are
    # tolerated and discarded instead of aborting the program.
    options = parser.parse_known_args(args)[0]

    return ProgramArgs(options.input_file, options.output_file, options.treshold)
|
||||
|
||||
|
||||
def print_dbpedia(file: str, out: str):
    """Write one abbreviation line per distinct DBpedia URI prefix found in ``file``.

    Each input line is split on "/". The prefix is scheme, host and first
    path segment (e.g. ``http://dbpedia.org/resource``). Prefixes whose host
    has no ``dbpedia`` label are ignored, and each prefix is written once as
    ``"<prefix>/", "<sub>-db<t>"``, where ``<sub>`` is the first host label
    cut to three characters and ``<t>`` is the first character of the path
    segment.

    Args:
        file (str): path of the UTF-8 text file to read, one URI per line.
        out (str): path of the UTF-8 file to (over)write.
    """
    DOMAIN_PART = "dbpedia"

    already_parsed: set[str] = set()

    # Context managers close both files even if a line raises midway.
    with open(file, "r", encoding="utf-8") as source, \
            open(out, mode="w", encoding="utf-8") as sink:

        for row in source:

            # Strip the line terminator first: otherwise a URI that ends right
            # after the first path segment keeps its "\n" inside the prefix and
            # the output record is split across two lines.
            sections = [part for part in row.strip().split("/") if part != ""]

            if len(sections) < 3:
                continue

            scheme, domain, first_segment = sections[0], sections[1], sections[2]
            uri = f"{scheme}//{domain}/{first_segment}"

            if uri in already_parsed:
                continue

            subdomains = domain.split(".")

            if DOMAIN_PART not in subdomains:
                continue

            already_parsed.add(uri)

            sub_id = subdomains[0][:3]
            resource_type = first_segment[0]

            sink.write(f"\"{uri}/\", \"{sub_id}-db{resource_type}\"\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the options from the command line, then
    # generate the abbreviation file from the input file.
    ARGS = get_args(sys.argv)
    print_dbpedia(file=ARGS.file, out=ARGS.output)
|
||||
@@ -6,8 +6,16 @@ from typing import Self
|
||||
|
||||
class ProgramArgs:
    """Holder for the options of the URI tree script."""

    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
        """
        Args:
            file (str): path of the input CSV file
            csv_uri_header (str): The name of the column of the csv file from which the program will get the URIs
            output (str): path of the file the result is written to
            treshold (int): nodes whose quantity is below this value are not written
        """
        self.file = file
        self.csv_uri_header = csv_uri_header
        self.output = output
        self.treshold = treshold
|
||||
|
||||
@@ -33,11 +41,15 @@ class Node:
|
||||
KEY = child[0]
|
||||
|
||||
if not self.children.get(KEY):
|
||||
# if the key has no value, it means we are traversing this branch for the first time
|
||||
# create another node for the key
|
||||
self.children[KEY] = Node(KEY, 0)
|
||||
|
||||
# take the node for the key
|
||||
CHILD = self.children[KEY]
|
||||
self.quantity += 1
|
||||
|
||||
# if the child list to enter has only one element, which is KEY, no more node will be created
|
||||
if len(child) == 1:
|
||||
return
|
||||
|
||||
@@ -53,27 +65,32 @@ def get_args(args: list[str]) -> ProgramArgs:
|
||||
|
||||
PARSER = argparse.ArgumentParser()
|
||||
PARSER.add_argument("--input-file", "-i", required=True, type=str)
|
||||
PARSER.add_argument("--header-name", "-c", required=True, type=str) # c stands for column
|
||||
PARSER.add_argument("--output-file", "-o", required=True, type=str)
|
||||
PARSER.add_argument("--treshold", "-t", type=int, default=1)
|
||||
parsed_args, _ = PARSER.parse_known_args(args)
|
||||
|
||||
# print(parsed_args.input_file)
|
||||
|
||||
return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore
|
||||
return ProgramArgs(parsed_args.input_file, parsed_args.header_name ,parsed_args.output_file, parsed_args.treshold) # type ignore
|
||||
|
||||
|
||||
def get_debug_args() -> ProgramArgs:
    """Return a fixed ProgramArgs for running the script without a command line.

    Mirrors the invocation:
    -i ./Assets/Dataset/1-hop/movies.csv -c subject -o Assets/Dataset/Tmp/prova.csv -t 1

    Returns:
        ProgramArgs: the hard-coded input file, column name, output file and threshold.
    """
    FILE = "./Assets/Dataset/1-hop/movies.csv"
    CSV_HEADER = "subject"
    OUTPUT = "./Assets/Dataset/Tmp/prova.csv"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
        CSV_HEADER,
        OUTPUT,
        TRESHOLD
    )
|
||||
|
||||
|
||||
def tree_like(file: str, out: str):
|
||||
def tree_like(file: str, csv_uri_header:str, out: str):
|
||||
|
||||
INDENTATION = " "
|
||||
|
||||
@@ -84,9 +101,12 @@ def tree_like(file: str, out: str):
|
||||
|
||||
FILE = open(file, "r", encoding="utf-8")
|
||||
|
||||
for row in FILE:
|
||||
# TODO: Change here so it takes single URI from a CSV file
|
||||
# It is needed the header-name
|
||||
for row in csv.DictReader(FILE):
|
||||
|
||||
sections = row.split("/")
|
||||
uri_element = row[csv_uri_header]
|
||||
sections = uri_element.split("/")
|
||||
sections = list(filter(lambda item: item != "", sections))
|
||||
|
||||
# print(sections)
|
||||
@@ -115,7 +135,9 @@ def tree_like(file: str, out: str):
|
||||
|
||||
INDENT: str = INDENTATION * DEPTH
|
||||
|
||||
if NODE.quantity < ARGS.treshold:
|
||||
# Leaf node have quantity 0, so if i want them to appear the threshold have to be 0
|
||||
# if NODE.quantity < ARGS.treshold:
|
||||
if ARGS.treshold > NODE.quantity:
|
||||
continue
|
||||
|
||||
OUT.write(f"{INDENT}- {NODE}\n")
|
||||
@@ -133,7 +155,8 @@ def tree_like(file: str, out: str):
|
||||
OUT.close()
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # ARGS stays a module-level name: tree_like reads ARGS.treshold directly.
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    # tree_like takes (file, csv_uri_header, out); the older two-argument call
    # no longer matches that signature and is dropped.
    tree_like(ARGS.file, ARGS.csv_uri_header, ARGS.output)
|
||||
|
||||
@@ -1,65 +0,0 @@
|
||||
CREATE TABLE IF NOT EXISTS Movies (
|
||||
MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
MovieURI TEXT UNIQUE NOT NULL
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS WikiPageIDs (
|
||||
MovieID INTEGER PRIMARY KEY,
|
||||
PageID INTEGER UNIQUE NOT NULL,
|
||||
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
|
||||
MovieID INTEGER PRIMARY KEY,
|
||||
Abstract TEXT NOT NULL,
|
||||
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS Origins (
|
||||
OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
OriginName TEXT UNIQUE NOT NULL
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS Subjects (
|
||||
SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
SubjectURI TEXT UNIQUE NOT NULL,
|
||||
OriginID BIGINT NOT NULL,
|
||||
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS Relationships (
|
||||
RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
RelationshipURI TEXT UNIQUE NOT NULL
|
||||
);
|
||||
|
||||
|
||||
CREATE TABLE IF NOT EXISTS Objects (
|
||||
ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
ObjectURI TEXT UNIQUE NOT NULL,
|
||||
OriginID BIGINT NOT NULL,
|
||||
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS RDFs (
|
||||
RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
MovieID INTEGER NOT NULL,
|
||||
SubjectID INTEGER NOT NULL,
|
||||
RelationshipID INTEGER NOT NULL,
|
||||
ObjectID INTEGER NOT NULL,
|
||||
UNIQUE(SubjectID, RelationshipID, ObjectID),
|
||||
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
|
||||
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
|
||||
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
|
||||
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
|
||||
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
|
||||
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
|
||||
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
|
||||
|
||||
@@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
|
||||
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
|
||||
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
|
||||
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
|
||||
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
|
||||
|
||||
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
|
||||
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
|
||||
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
|
||||
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
|
||||
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
|
||||
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")
|
||||
|
||||
MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
|
||||
PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
|
||||
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
|
||||
DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
|
||||
REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")
|
||||
|
||||
CONN = sqlite3.connect(DB_NAME)
|
||||
CURS = CONN.cursor()
|
||||
@@ -30,6 +33,7 @@ CURS = CONN.cursor()
|
||||
# MARK: SQL Definitions
|
||||
# Insert MovieURI
|
||||
|
||||
|
||||
def insertOrigin(curs: sqlite3.Cursor) -> bool:
|
||||
|
||||
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
|
||||
@@ -39,6 +43,7 @@ def insertOrigin(curs : sqlite3.Cursor ) -> bool:
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
|
||||
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
|
||||
@@ -51,6 +56,7 @@ def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
||||
# in this case the real id is the first element of the tuple
|
||||
return originId[0]
|
||||
|
||||
|
||||
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
|
||||
|
||||
QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
|
||||
@@ -82,6 +88,7 @@ def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
|
||||
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
|
||||
|
||||
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
|
||||
@@ -94,6 +101,7 @@ def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
|
||||
# in this case the real id is the first element of the tuple
|
||||
return movieId[0]
|
||||
|
||||
|
||||
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
|
||||
QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
|
||||
try:
|
||||
@@ -102,6 +110,7 @@ def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> boo
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
|
||||
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
|
||||
QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
|
||||
try:
|
||||
@@ -110,6 +119,7 @@ def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
|
||||
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
|
||||
QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
|
||||
try:
|
||||
@@ -118,6 +128,7 @@ def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
|
||||
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
|
||||
QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
|
||||
try:
|
||||
@@ -126,6 +137,7 @@ def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
|
||||
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
|
||||
@@ -138,6 +150,7 @@ def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
||||
# in this case the real id is the first element of the tuple
|
||||
return subjectId[0]
|
||||
|
||||
|
||||
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
|
||||
@@ -150,6 +163,7 @@ def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | No
|
||||
# in this case the real id is the first element of the tuple
|
||||
return relationshipId[0]
|
||||
|
||||
|
||||
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
|
||||
@@ -162,12 +176,13 @@ def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
||||
# in this case the real id is the first element of the tuple
|
||||
return objectId[0]
|
||||
|
||||
|
||||
def insertRDF(
|
||||
curs: sqlite3.Cursor,
|
||||
movieId: int,
|
||||
subjectId: int,
|
||||
relationshipId: int,
|
||||
objectId: int
|
||||
objectId: int,
|
||||
) -> bool:
|
||||
QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
|
||||
try:
|
||||
@@ -176,6 +191,56 @@ def insertRDF(
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
# UGLY: correct method to add cursor
|
||||
def insert_abbreviation(uri, abbreviation) -> bool:
|
||||
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
|
||||
try:
|
||||
CURS.execute(QUERY, [uri, abbreviation])
|
||||
return True
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
# UGLY: correct method to add cursor
|
||||
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
|
||||
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
|
||||
try:
|
||||
CURS.execute(QUERY, [object_id, abbreviation_id])
|
||||
return True
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
# UGLY: correct method to add cursor
|
||||
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
|
||||
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
|
||||
try:
|
||||
CURS.execute(QUERY, [relationship_id, abbreviation_id])
|
||||
return True
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
# UGLY: correct method to add cursor
|
||||
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
|
||||
QUERY = (
|
||||
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
|
||||
)
|
||||
try:
|
||||
CURS.execute(QUERY, [subject_id, abbreviation_id])
|
||||
return True
|
||||
except sqlite3.IntegrityError:
|
||||
return False
|
||||
|
||||
# UGLY: correct method to add cursor
|
||||
def select_abbreviation_id(uri) -> int | None:
|
||||
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
|
||||
CURS.execute(QUERY, [uri])
|
||||
abbreviation_id = CURS.fetchone()
|
||||
if not abbreviation_id:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return abbreviation_id[0]
|
||||
|
||||
|
||||
# MARK: Parsing
|
||||
def parseMovies():
|
||||
|
||||
@@ -208,7 +273,6 @@ def parseAbstract():
|
||||
ABSTRACT = row["text"]
|
||||
MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
|
||||
|
||||
|
||||
if MOVIE_ID is None:
|
||||
print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
|
||||
continue
|
||||
@@ -216,10 +280,24 @@ def parseAbstract():
|
||||
insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
|
||||
|
||||
|
||||
def parseAbbreviations():
    """Store every row of the URI-abbreviation CSV in the database.

    Reads URI_ABBR_CSV_HANDLER as a header-keyed CSV and passes each row's
    "uri" and "abbreviation" fields to insert_abbreviation. The insert result
    is not inspected, so rows the database rejects are silently skipped.
    """
    for record in csv.DictReader(URI_ABBR_CSV_HANDLER):
        insert_abbreviation(record["uri"], record["abbreviation"])
|
||||
|
||||
|
||||
def parseRDF_Reverse():
|
||||
|
||||
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
|
||||
REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
|
||||
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
|
||||
|
||||
if REVERSE_ORIGIN_ID is None:
|
||||
return
|
||||
|
||||
total = 0
|
||||
|
||||
for row in REVERSE_CSV_READER:
|
||||
@@ -236,7 +314,6 @@ def parseRDF_Reverse():
|
||||
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
|
||||
MOVIE_ID = selectMovieId(CURS, OBJECT)
|
||||
|
||||
|
||||
skip = False
|
||||
|
||||
# guard
|
||||
@@ -259,17 +336,19 @@ def parseRDF_Reverse():
|
||||
if skip:
|
||||
continue
|
||||
|
||||
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
|
||||
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
|
||||
total += 1
|
||||
|
||||
print(total)
|
||||
|
||||
|
||||
|
||||
def parseRDF_Dataset():
|
||||
|
||||
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
|
||||
DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
|
||||
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
|
||||
|
||||
if DATASET_ORIGIN_ID is None:
|
||||
return
|
||||
|
||||
total = 0
|
||||
rdf_idx = 0
|
||||
@@ -293,7 +372,6 @@ def parseRDF_Dataset():
|
||||
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
|
||||
MOVIE_ID = selectMovieId(CURS, SUBJECT)
|
||||
|
||||
|
||||
skip = False
|
||||
|
||||
# guard
|
||||
@@ -316,31 +394,211 @@ def parseRDF_Dataset():
|
||||
if skip:
|
||||
continue
|
||||
|
||||
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
|
||||
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
|
||||
total += 1
|
||||
|
||||
print(total)
|
||||
|
||||
|
||||
def parseAbbr_Reverse():
    """Link the subjects, relationships and objects of reverse.csv to URI abbreviations.

    For each CSV row the subject, relationship and object URIs are resolved to
    their database ids; a row with any missing id is reported and skipped.
    Each URI is then split on "/" and, when it has more than 4 pieces, its
    prefixes are tried from the longest (at most 7 pieces) downwards until one
    matches a stored abbreviation, which is then linked to the entity.
    Subjects and objects stop at 4-piece prefixes, relationships at 3-piece
    prefixes. The number of newly inserted link rows is printed at the end.

    Returns early, printing nothing, when the "reverse.csv" origin is unknown.
    """

    def _link_longest_prefix(sections, floor, entity_id, insert_link) -> bool:
        """Link entity_id to the abbreviation of its longest matching URI prefix.

        Prefix sizes run from min(len(sections), 7) down to floor + 1 pieces.
        Returns True only when a new link row was inserted.
        """
        if len(sections) <= 4:
            return False

        for size in range(min(len(sections), 7), floor, -1):
            # Trailing "%" turns the prefix into a LIKE pattern.
            pattern = "/".join(sections[:size]) + "%"
            abbreviation_id = select_abbreviation_id(pattern)
            if abbreviation_id is not None:
                # Stop at the longest match even when the link already exists,
                # so repeated CSV rows never fall through to shorter prefixes.
                return insert_link(entity_id, abbreviation_id)

        return False

    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")

    if REVERSE_ORIGIN_ID is None:
        return

    total = 0

    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)

        skip = False

        # guard: report every missing id of the row before skipping it
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if skip:
            continue

        if _link_longest_prefix(
            SUBJECT.split("/"), 3, SUBJECT_ID, insert_subject_abbreviation
        ):
            total += 1

        # Relationships are allowed one prefix level shorter than the others.
        if _link_longest_prefix(
            RELATIONSHIP.split("/"), 2, RELATIONSHIP_ID, insert_relationship_abbreviation
        ):
            total += 1

        if _link_longest_prefix(
            OBJECT.split("/"), 3, OBJECT_ID, insert_object_abbreviation
        ):
            total += 1

    print(total)
|
||||
|
||||
|
||||
def parseAbbr_Dataset():
    """Link the subjects, relationships and objects of dataset.csv to URI abbreviations.

    For each CSV row the subject, relationship and object URIs are resolved to
    their database ids; a row with any missing id is reported and skipped.
    Each URI is then split on "/" and, when it has more than 4 pieces, its
    prefixes are tried from the longest (at most 7 pieces) downwards until one
    matches a stored abbreviation, which is then linked to the entity.
    Subjects and objects stop at 4-piece prefixes, relationships at 3-piece
    prefixes. A progress line is printed every 100000 rows and the number of
    newly inserted link rows is printed at the end.

    Returns early, printing nothing, when the "dataset.csv" origin is unknown.
    """

    def _link_longest_prefix(sections, floor, entity_id, insert_link) -> bool:
        """Link entity_id to the abbreviation of its longest matching URI prefix.

        Prefix sizes run from min(len(sections), 7) down to floor + 1 pieces.
        Returns True only when a new link row was inserted.
        """
        if len(sections) <= 4:
            return False

        for size in range(min(len(sections), 7), floor, -1):
            # Trailing "%" turns the prefix into a LIKE pattern.
            pattern = "/".join(sections[:size]) + "%"
            abbreviation_id = select_abbreviation_id(pattern)
            if abbreviation_id is not None:
                # Stop at the longest match even when the link already exists,
                # so repeated CSV rows never fall through to shorter prefixes.
                return insert_link(entity_id, abbreviation_id)

        return False

    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")

    if DATASET_ORIGIN_ID is None:
        return

    total = 0
    rdf_idx = 0

    for row in DATASET_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]

        rdf_idx += 1

        # Progress heartbeat for long runs.
        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)

        skip = False

        # guard: report every missing id of the row before skipping it
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if skip:
            continue

        if _link_longest_prefix(
            SUBJECT.split("/"), 3, SUBJECT_ID, insert_subject_abbreviation
        ):
            total += 1

        # Relationships are allowed one prefix level shorter than the others.
        if _link_longest_prefix(
            RELATIONSHIP.split("/"), 2, RELATIONSHIP_ID, insert_relationship_abbreviation
        ):
            total += 1

        if _link_longest_prefix(
            OBJECT.split("/"), 3, OBJECT_ID, insert_object_abbreviation
        ):
            total += 1

    print(total)
|
||||
|
||||
|
||||
# MARK: Actual Code
|
||||
# Stages below are presumably toggled by hand (only one call is active);
# the listed order looks like the intended run order — confirm before reordering.
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseAbbreviations()
# parseRDF_Reverse()
# parseRDF_Dataset()
# parseAbbr_Reverse()
try:
    parseAbbr_Dataset()

    # Commit only after the stage finished without raising.
    CONN.commit()
finally:
    # Release the connection and every CSV handle even when the stage fails.
    CONN.close()

    MOVIES_CSV_HANDLER.close()
    PAGEID_CSV_HANDLER.close()
    SUMMARY_CSV_HANDLER.close()
    DATASET_CSV_HANDLER.close()
    REVERSE_CSV_HANDLER.close()
    URI_ABBR_CSV_HANDLER.close()
|
||||
|
||||
|
||||
"""
|
||||
|
||||
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
0
Scripts/Libs/Utils/.gitkeep
Normal file
0
Scripts/Libs/Utils/.gitkeep
Normal file
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
@@ -0,0 +1,826 @@
|
||||
{
|
||||
"type": "excalidraw",
|
||||
"version": 2,
|
||||
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||
"elements": [
|
||||
{
|
||||
"type": "line",
|
||||
"version": 4622,
|
||||
"versionNonce": 1623045672,
|
||||
"isDeleted": false,
|
||||
"id": "twu_PiAvEuQ4l1YYtZLET",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 289.8504963515835,
|
||||
"y": 91.87474806402287,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#a5d8ff",
|
||||
"width": 77.09201683999922,
|
||||
"height": 99.49948667804088,
|
||||
"seed": 1975340120,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "round",
|
||||
"boundElementIds": [],
|
||||
"startBinding": null,
|
||||
"endBinding": null,
|
||||
"lastCommittedPoint": null,
|
||||
"startArrowhead": null,
|
||||
"endArrowhead": null,
|
||||
"points": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
],
|
||||
[
|
||||
0.2542098813493443,
|
||||
75.20117273657175
|
||||
],
|
||||
[
|
||||
0.011896425679918422,
|
||||
83.76249969444815
|
||||
],
|
||||
[
|
||||
3.970409367559332,
|
||||
87.46174320643391
|
||||
],
|
||||
[
|
||||
17.75573317066317,
|
||||
90.59250103325854
|
||||
],
|
||||
[
|
||||
41.05683533152865,
|
||||
91.56737225214069
|
||||
],
|
||||
[
|
||||
63.319497586673116,
|
||||
90.01084754868091
|
||||
],
|
||||
[
|
||||
75.14781395923075,
|
||||
86.28844687220405
|
||||
],
|
||||
[
|
||||
76.81603792670788,
|
||||
83.15042405259751
|
||||
],
|
||||
[
|
||||
77.05033394391478,
|
||||
76.25776215104557
|
||||
],
|
||||
[
|
||||
76.86643881413028,
|
||||
6.3089586511537865
|
||||
],
|
||||
[
|
||||
76.45188016352971,
|
||||
-0.2999144698665015
|
||||
],
|
||||
[
|
||||
71.50179495549581,
|
||||
-3.9936571317850627
|
||||
],
|
||||
[
|
||||
61.077971898861186,
|
||||
-6.132877429442784
|
||||
],
|
||||
[
|
||||
37.32348754161154,
|
||||
-7.932114425900202
|
||||
],
|
||||
[
|
||||
18.278415656797975,
|
||||
-6.859225353587373
|
||||
],
|
||||
[
|
||||
3.2995959613238286,
|
||||
-3.2201165291205287
|
||||
],
|
||||
[
|
||||
-0.04168289608444441,
|
||||
-0.045185660461322996
|
||||
],
|
||||
[
|
||||
0,
|
||||
0
|
||||
]
|
||||
],
|
||||
"index": "a1",
|
||||
"frameId": null,
|
||||
"roundness": {
|
||||
"type": 2
|
||||
},
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "line",
|
||||
"version": 2327,
|
||||
"versionNonce": 1593094440,
|
||||
"isDeleted": false,
|
||||
"id": "hmJk4dH9VpOsfkrCTkhvh",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 290.3744257898585,
|
||||
"y": 149.00103172175278,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#a5d8ff",
|
||||
"width": 77.17198221193564,
|
||||
"height": 8.562348957853036,
|
||||
"seed": 637665624,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "round",
|
||||
"boundElementIds": [],
|
||||
"startBinding": null,
|
||||
"endBinding": null,
|
||||
"lastCommittedPoint": null,
|
||||
"startArrowhead": null,
|
||||
"endArrowhead": null,
|
||||
"points": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
],
|
||||
[
|
||||
2.033150371639873,
|
||||
3.413095389435587
|
||||
],
|
||||
[
|
||||
10.801287372573954,
|
||||
6.276651055277943
|
||||
],
|
||||
[
|
||||
22.468666942209353,
|
||||
8.010803051612635
|
||||
],
|
||||
[
|
||||
40.747074201802775,
|
||||
8.168828515515864
|
||||
],
|
||||
[
|
||||
62.077348233027564,
|
||||
7.0647721921469495
|
||||
],
|
||||
[
|
||||
74.53446931782398,
|
||||
3.04824021069218
|
||||
],
|
||||
[
|
||||
77.17198221193564,
|
||||
-0.3935204423371723
|
||||
]
|
||||
],
|
||||
"index": "a2",
|
||||
"frameId": null,
|
||||
"roundness": {
|
||||
"type": 2
|
||||
},
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "line",
|
||||
"version": 2413,
|
||||
"versionNonce": 311708712,
|
||||
"isDeleted": false,
|
||||
"id": "X1ldVIXm4DfBal5N2Pwn9",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 289.3425684673547,
|
||||
"y": 120.03697638652972,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#a5d8ff",
|
||||
"width": 77.17198221193564,
|
||||
"height": 8.562348957853036,
|
||||
"seed": 904402520,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "round",
|
||||
"boundElementIds": [],
|
||||
"startBinding": null,
|
||||
"endBinding": null,
|
||||
"lastCommittedPoint": null,
|
||||
"startArrowhead": null,
|
||||
"endArrowhead": null,
|
||||
"points": [
|
||||
[
|
||||
0,
|
||||
0
|
||||
],
|
||||
[
|
||||
2.033150371639873,
|
||||
3.413095389435587
|
||||
],
|
||||
[
|
||||
10.801287372573954,
|
||||
6.276651055277943
|
||||
],
|
||||
[
|
||||
22.468666942209353,
|
||||
8.010803051612635
|
||||
],
|
||||
[
|
||||
40.747074201802775,
|
||||
8.168828515515864
|
||||
],
|
||||
[
|
||||
62.077348233027564,
|
||||
7.0647721921469495
|
||||
],
|
||||
[
|
||||
74.53446931782398,
|
||||
3.04824021069218
|
||||
],
|
||||
[
|
||||
77.17198221193564,
|
||||
-0.3935204423371723
|
||||
]
|
||||
],
|
||||
"index": "a3",
|
||||
"frameId": null,
|
||||
"roundness": {
|
||||
"type": 2
|
||||
},
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "ellipse",
|
||||
"version": 5410,
|
||||
"versionNonce": 92833576,
|
||||
"isDeleted": false,
|
||||
"id": "CFhp5ZxSVwHYzGUj4hEn1",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 288.28461948527263,
|
||||
"y": 84.74247943834126,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#a5d8ff",
|
||||
"width": 76.59753601865496,
|
||||
"height": 15.49127539284798,
|
||||
"seed": 1782811480,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "sharp",
|
||||
"boundElementIds": [
|
||||
"bxuMGTzXLn7H-uBCptINx"
|
||||
],
|
||||
"index": "a4",
|
||||
"frameId": null,
|
||||
"roundness": null,
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "ellipse",
|
||||
"version": 820,
|
||||
"versionNonce": 608002600,
|
||||
"isDeleted": false,
|
||||
"id": "B43R7rWwK2_vdiRHBSSPk",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 324.77660659049513,
|
||||
"y": 109.21914711824485,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#228be6",
|
||||
"width": 11.226103154161754,
|
||||
"height": 12.183758484455605,
|
||||
"seed": 1298686040,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "sharp",
|
||||
"boundElementIds": [],
|
||||
"index": "a5",
|
||||
"frameId": null,
|
||||
"roundness": null,
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "ellipse",
|
||||
"version": 1108,
|
||||
"versionNonce": 1839127848,
|
||||
"isDeleted": false,
|
||||
"id": "CkKMb9wkJfVk04T217zSs",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 325.12774837442873,
|
||||
"y": 135.43576140530996,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#228be6",
|
||||
"width": 11.226103154161754,
|
||||
"height": 12.183758484455605,
|
||||
"seed": 2133497176,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "sharp",
|
||||
"boundElementIds": [],
|
||||
"index": "a6",
|
||||
"frameId": null,
|
||||
"roundness": null,
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "ellipse",
|
||||
"version": 991,
|
||||
"versionNonce": 588838952,
|
||||
"isDeleted": false,
|
||||
"id": "SHJdKeQPkfpvzSoNH--3o",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 6.239590202363168,
|
||||
"x": 325.77660659049513,
|
||||
"y": 164.20448797661635,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#228be6",
|
||||
"width": 11.226103154161754,
|
||||
"height": 12.183758484455605,
|
||||
"seed": 81668696,
|
||||
"groupIds": [
|
||||
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "sharp",
|
||||
"boundElementIds": [],
|
||||
"index": "a7",
|
||||
"frameId": null,
|
||||
"roundness": null,
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"version": 489,
|
||||
"versionNonce": 2023207720,
|
||||
"isDeleted": false,
|
||||
"id": "vUSyMBPup0jZ71CYXKyGb",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 1,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"angle": 0,
|
||||
"x": 280.1846389770508,
|
||||
"y": 185.79462957545917,
|
||||
"strokeColor": "#000000",
|
||||
"backgroundColor": "#a5d8ff",
|
||||
"width": 95.63072204589844,
|
||||
"height": 23.595161071904883,
|
||||
"seed": 425140056,
|
||||
"groupIds": [
|
||||
"dp_TZJyYdyPIH1hOkAPlb"
|
||||
],
|
||||
"strokeSharpness": "sharp",
|
||||
"boundElementIds": [],
|
||||
"fontSize": 17.4778970902999,
|
||||
"fontFamily": 1,
|
||||
"text": "dataset.db",
|
||||
"baseline": 16.595161071904883,
|
||||
"textAlign": "center",
|
||||
"verticalAlign": "top",
|
||||
"index": "a8",
|
||||
"frameId": null,
|
||||
"roundness": null,
|
||||
"boundElements": [],
|
||||
"updated": 1758646548051,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"containerId": null,
|
||||
"originalText": "dataset.db",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.350000000000001
|
||||
},
|
||||
{
|
||||
"id": "R7pU0VP6CFKCAwuvt0xsr",
|
||||
"type": "text",
|
||||
"x": 295.5,
|
||||
"y": 342,
|
||||
"width": 374,
|
||||
"height": 225,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "a9",
|
||||
"roundness": null,
|
||||
"seed": 705463336,
|
||||
"version": 1130,
|
||||
"versionNonce": 72522328,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758648226024,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "G1xIRcJgm34_NMEWQFFlW",
|
||||
"type": "text",
|
||||
"x": 1419.5,
|
||||
"y": 110,
|
||||
"width": 253,
|
||||
"height": 75,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aA",
|
||||
"roundness": null,
|
||||
"seed": 651981400,
|
||||
"version": 256,
|
||||
"versionNonce": 138082856,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758646570344,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Pipeline\n - actions: [Action]\n ",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Pipeline\n - actions: [Action]\n ",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "TBVy3JbJCkbA9kjVEJ8lv",
|
||||
"type": "text",
|
||||
"x": 694,
|
||||
"y": 100,
|
||||
"width": 495,
|
||||
"height": 150,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aB",
|
||||
"roundness": null,
|
||||
"seed": 680960040,
|
||||
"version": 560,
|
||||
"versionNonce": 85012520,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758649442239,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "an7KRTzWpCytKNKgHftKC",
|
||||
"type": "text",
|
||||
"x": 1528.5,
|
||||
"y": 365.5,
|
||||
"width": 187,
|
||||
"height": 150,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aC",
|
||||
"roundness": null,
|
||||
"seed": 1974317656,
|
||||
"version": 306,
|
||||
"versionNonce": 1574962264,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758648154009,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "2pQ5EULirrWs_QZPbClhh",
|
||||
"type": "text",
|
||||
"x": 785,
|
||||
"y": 332.5,
|
||||
"width": 418,
|
||||
"height": 375,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aH",
|
||||
"roundness": null,
|
||||
"seed": 1402251560,
|
||||
"version": 742,
|
||||
"versionNonce": 680432168,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758649532881,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "O0fso8DJqFfwJEzmpUikM",
|
||||
"type": "text",
|
||||
"x": 1289,
|
||||
"y": 195,
|
||||
"width": 594,
|
||||
"height": 100,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aI",
|
||||
"roundness": null,
|
||||
"seed": 1582329944,
|
||||
"version": 459,
|
||||
"versionNonce": 1080077144,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758647067031,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "v0kzO6vlBWOdJCV3yoG69",
|
||||
"type": "text",
|
||||
"x": 1379.5,
|
||||
"y": 718.5,
|
||||
"width": 286,
|
||||
"height": 175,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aL",
|
||||
"roundness": null,
|
||||
"seed": 1462407976,
|
||||
"version": 635,
|
||||
"versionNonce": 1012998696,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758649495598,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "WK34n9xeVxntypCtrlK6p",
|
||||
"type": "text",
|
||||
"x": 256.5,
|
||||
"y": 787.5,
|
||||
"width": 517,
|
||||
"height": 175,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aM",
|
||||
"roundness": null,
|
||||
"seed": 1166526296,
|
||||
"version": 318,
|
||||
"versionNonce": 1042162520,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758649002604,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "NY9jyUFLFFCNPE2sh00SX",
|
||||
"type": "text",
|
||||
"x": 1639,
|
||||
"y": 606.5,
|
||||
"width": 407,
|
||||
"height": 200,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aP",
|
||||
"roundness": null,
|
||||
"seed": 20345896,
|
||||
"version": 168,
|
||||
"versionNonce": 627282472,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758649426380,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
},
|
||||
{
|
||||
"id": "SkhaoW-3TTKDZzEii3Lf6",
|
||||
"type": "text",
|
||||
"x": 1457.5,
|
||||
"y": 955.5,
|
||||
"width": 121,
|
||||
"height": 50,
|
||||
"angle": 0,
|
||||
"strokeColor": "#1e1e1e",
|
||||
"backgroundColor": "#228be6",
|
||||
"fillStyle": "solid",
|
||||
"strokeWidth": 2,
|
||||
"strokeStyle": "solid",
|
||||
"roughness": 1,
|
||||
"opacity": 100,
|
||||
"groupIds": [],
|
||||
"frameId": null,
|
||||
"index": "aQ",
|
||||
"roundness": null,
|
||||
"seed": 2071523672,
|
||||
"version": 37,
|
||||
"versionNonce": 105260376,
|
||||
"isDeleted": false,
|
||||
"boundElements": null,
|
||||
"updated": 1758648834435,
|
||||
"link": null,
|
||||
"locked": false,
|
||||
"text": "class Dump:\n -",
|
||||
"fontSize": 20,
|
||||
"fontFamily": 8,
|
||||
"textAlign": "left",
|
||||
"verticalAlign": "top",
|
||||
"containerId": null,
|
||||
"originalText": "class Dump:\n -",
|
||||
"autoResize": true,
|
||||
"lineHeight": 1.25
|
||||
}
|
||||
],
|
||||
"appState": {
|
||||
"gridSize": 20,
|
||||
"gridStep": 5,
|
||||
"gridModeEnabled": false,
|
||||
"viewBackgroundColor": "#ffffff"
|
||||
},
|
||||
"files": {}
|
||||
}
|
||||
Reference in New Issue
Block a user