Compare commits
117 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
856c693650 | ||
|
|
e9d30b3cea | ||
|
|
ee12f53f12 | ||
|
|
a04f4c7cb7 | ||
|
|
a93e61b8c1 | ||
|
|
0373460105 | ||
|
|
7307916891 | ||
|
|
acb43fc899 | ||
|
|
255d801a80 | ||
|
|
2bd24ec278 | ||
|
|
69fba7c3e9 | ||
|
|
64e355e80c | ||
|
|
007f1e9554 | ||
|
|
c319398ca0 | ||
|
|
255d8a072d | ||
|
|
8167c9d435 | ||
|
|
bd72ad3571 | ||
|
|
6ddb7de9da | ||
|
|
650b37c586 | ||
|
|
e521b0704e | ||
|
|
0a698e9837 | ||
|
|
9440a562f2 | ||
|
|
5eda131aac | ||
|
|
57884eaf2e | ||
|
|
4548a683c2 | ||
|
|
3eec49ffa5 | ||
|
|
0bc7f4b227 | ||
|
|
f28952b0a2 | ||
|
|
0b626a8e09 | ||
|
|
b254098532 | ||
|
|
ee88ffe4cf | ||
|
|
70b4bd8645 | ||
|
|
6316d2bfc4 | ||
|
|
87ca748f45 | ||
|
|
4315d70109 | ||
|
|
9a5d633b5e | ||
|
|
a6760cd52d | ||
|
|
a7eb92227d | ||
|
|
9f221e31cd | ||
|
|
47197194d5 | ||
|
|
0cdbf6f624 | ||
|
|
3e30489f86 | ||
|
|
8a22e453e4 | ||
|
|
7feb4eb857 | ||
|
|
70af19d356 | ||
|
|
a4b44ab2ee | ||
|
|
74b6b609dd | ||
|
|
59796c37cb | ||
|
|
f696f5950b | ||
|
|
605b496da7 | ||
|
|
7d693964dd | ||
|
|
25f401b577 | ||
|
|
14c5ade230 | ||
| 4c9c51f902 | |||
|
|
63c1a4a160 | ||
|
|
51114af853 | ||
|
|
3a6dca0681 | ||
|
|
346098d2b7 | ||
|
|
64f9b41378 | ||
|
|
ac1ed42c49 | ||
|
|
edd01a2c83 | ||
|
|
5aa9e3fcf3 | ||
|
|
0970cabf92 | ||
|
|
a26d92750f | ||
|
|
34c4782232 | ||
|
|
c5439533e6 | ||
|
|
8819b8e87f | ||
|
|
1076dc8aa6 | ||
|
|
3d15e03b09 | ||
|
|
0ee2ec6fcd | ||
|
|
95cfa5486c | ||
|
|
0d30e90ee0 | ||
|
|
faaba17a98 | ||
|
|
854e5f1d98 | ||
|
|
242d7f674f | ||
|
|
de8c2afceb | ||
|
|
f89dffff75 | ||
|
|
e39bad8348 | ||
|
|
7a1a221017 | ||
|
|
fafe6ae0f9 | ||
|
|
e32444df75 | ||
|
|
b74b7ac4f0 | ||
|
|
22134391d9 | ||
|
|
82c9023849 | ||
|
|
00b87e01ea | ||
|
|
ce3d4bf6c5 | ||
|
|
c415b175a0 | ||
|
|
ec81ea7930 | ||
|
|
4bb03f86b3 | ||
|
|
e5f201f3db | ||
|
|
1c715dc569 | ||
|
|
6686b47328 | ||
|
|
9a5a7d84fd | ||
|
|
9678ece9c0 | ||
|
|
67bcd732b5 | ||
|
|
1a4f900500 | ||
|
|
ca8729b67c | ||
|
|
9dbffc52ed | ||
|
|
b7f504942a | ||
|
|
7f0c5ce8d3 | ||
|
|
9838e287a4 | ||
|
|
ca6143ea3c | ||
|
|
16e7ab4d9f | ||
|
|
28723ab662 | ||
|
|
3e59efcf33 | ||
|
|
7c04309cc1 | ||
|
|
db87295890 | ||
|
|
61568200a8 | ||
|
|
8df2736b97 | ||
|
|
eb5b7f629a | ||
|
|
79232b391e | ||
|
|
72eb937b47 | ||
|
|
cececa14ce | ||
|
|
2487d44abd | ||
|
|
553b86cac2 | ||
|
|
12bd781fd3 | ||
|
|
463f4907b8 |
1
.gitattributes
vendored
1
.gitattributes
vendored
@ -1,2 +1,3 @@
|
|||||||
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
|
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
|
||||||
Assets/** filter=lfs diff=lfs merge=lfs -text
|
Assets/** filter=lfs diff=lfs merge=lfs -text
|
||||||
|
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text
|
||||||
|
|||||||
4
.gitignore
vendored
4
.gitignore
vendored
@ -191,6 +191,7 @@ ipython_config.py
|
|||||||
# Icon must end with two \r
|
# Icon must end with two \r
|
||||||
Icon
|
Icon
|
||||||
|
|
||||||
|
|
||||||
# Thumbnails
|
# Thumbnails
|
||||||
._*
|
._*
|
||||||
|
|
||||||
@ -251,3 +252,6 @@ $RECYCLE.BIN/
|
|||||||
# .nfs files are created when an open file is removed but is still being accessed
|
# .nfs files are created when an open file is removed but is still being accessed
|
||||||
.nfs*
|
.nfs*
|
||||||
|
|
||||||
|
# ---> Custom
|
||||||
|
**/Tmp/**
|
||||||
|
!**/.gitkeep
|
||||||
|
|||||||
14
.vscode/extensions.json
vendored
Normal file
14
.vscode/extensions.json
vendored
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
{
|
||||||
|
"recommendations": [
|
||||||
|
"bierner.github-markdown-preview",
|
||||||
|
"bierner.markdown-checkbox",
|
||||||
|
"bierner.markdown-emoji",
|
||||||
|
"bierner.markdown-footnotes",
|
||||||
|
"bierner.markdown-mermaid",
|
||||||
|
"bierner.markdown-preview-github-styles",
|
||||||
|
"bierner.markdown-yaml-preamble",
|
||||||
|
"davidanson.vscode-markdownlint",
|
||||||
|
"kejun.markdown-alert",
|
||||||
|
"yzhang.markdown-all-in-one"
|
||||||
|
]
|
||||||
|
}
|
||||||
24
.vscode/settings.json
vendored
Normal file
24
.vscode/settings.json
vendored
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
{
|
||||||
|
// Always treat the project root as the working dir for Jupyter
|
||||||
|
"jupyter.notebookFileRoot": "${workspaceFolder}",
|
||||||
|
|
||||||
|
// When you click "Run Python File in Terminal", DON'T cd into the file's folder
|
||||||
|
"python.terminal.executeInFileDir": false,
|
||||||
|
|
||||||
|
// Start new integrated terminals at the project root
|
||||||
|
"terminal.integrated.cwd": "${workspaceFolder}",
|
||||||
|
|
||||||
|
// Ensure Python can import from the project root no matter which file you run
|
||||||
|
// (so `src/` is on sys.path). Linux shown here; add osx/windows if needed.
|
||||||
|
"terminal.integrated.env.linux": {
|
||||||
|
"PYTHONPATH": "${workspaceFolder}"
|
||||||
|
},
|
||||||
|
|
||||||
|
// Make pytest run from the root without needing a pytest.ini
|
||||||
|
"python.testing.pytestEnabled": true,
|
||||||
|
"python.testing.cwd": "${workspaceFolder}",
|
||||||
|
"python.testing.pytestArgs": ["src/test"],
|
||||||
|
|
||||||
|
// Help Pylance resolve imports like `from src...` without red squiggles
|
||||||
|
"python.analysis.extraPaths": ["${workspaceFolder}"]
|
||||||
|
}
|
||||||
BIN
Assets/Dataset/1-hop/dataset.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/dataset.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/movie-pageid.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/movie-pageid.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/movies.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/movies.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/reverse.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/reverse.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/uri-abbreviations.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/wikipedia-movie.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/wikipedia-movie.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/1-hop/wikipedia-summary.csv
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/1-hop/wikipedia-summary.csv
(Stored with Git LFS)
Normal file
Binary file not shown.
|
BIN
Assets/Dataset/DatawareHouse/dataset.db
(Stored with Git LFS)
Normal file
BIN
Assets/Dataset/DatawareHouse/dataset.db
(Stored with Git LFS)
Normal file
Binary file not shown.
0
Assets/Dataset/Tmp/.gitkeep
Normal file
0
Assets/Dataset/Tmp/.gitkeep
Normal file
25
README.md
25
README.md
@ -1,3 +1,28 @@
|
|||||||
# NanoSocrates
|
# NanoSocrates
|
||||||
|
|
||||||
This is the work project for the DeepLearning exam of 16th September 2025
|
This is the work project for the DeepLearning exam of 16th September 2025
|
||||||
|
|
||||||
|
## Index
|
||||||
|
|
||||||
|
- [Resources](./docs/RESOURCES.md)
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
Create and activate you Conda enviroment with:
|
||||||
|
|
||||||
|
conda env create -f environment.yaml
|
||||||
|
conda activate deep_learning
|
||||||
|
|
||||||
|
Now install dependencies on pip:
|
||||||
|
|
||||||
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
## TroubleShooting
|
||||||
|
|
||||||
|
Sometimes when uploading really large batch of data, git can stop the uploads thanks to the timeout.
|
||||||
|
The solution is to locally change its settings:
|
||||||
|
|
||||||
|
git config lfs.dialtimeout 3600
|
||||||
|
git config lfs.activitytimeout 3600
|
||||||
|
|
||||||
|
for clearance check the link: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory
|
||||||
30
Scripts/DataBaseQueries/dataset.sql
Normal file
30
Scripts/DataBaseQueries/dataset.sql
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
-- To pass to Pandas
|
||||||
|
SELECT *
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN Subjects USING (SubjectID)
|
||||||
|
INNER JOIN Relationships USING (RelationshipID)
|
||||||
|
INNER JOIN Objects USING (ObjectID);
|
||||||
|
|
||||||
|
-- To pass to Pandas for abstracts
|
||||||
|
SELECT *
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN WikipediaAbstracts USING (MovieID);
|
||||||
|
|
||||||
|
-- To pass to Pandas for abbreviations
|
||||||
|
SELECT *
|
||||||
|
FROM Abbreviations;
|
||||||
|
|
||||||
|
-- More complex to have clean dataset
|
||||||
|
-- More complex to have clean dataset
|
||||||
|
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN SubjectsCountInRDFs USING (SubjectID)
|
||||||
|
INNER JOIN RelationshipsCountInRDFs USING(RelationshipID)
|
||||||
|
INNER JOIN ObjectsCountInRDFs USING (ObjectID)
|
||||||
|
INNER JOIN ParsedSubjects USING (SubjectID)
|
||||||
|
INNER JOIN ParsedRelationships USING (RelationshipID)
|
||||||
|
INNER JOIN ParsedObjects USING (ObjectID)
|
||||||
|
INNER JOIN WikipediaAbstracts USING (MovieID)
|
||||||
|
-- insert WHERE here
|
||||||
|
-- WHERE SubjectID = 134626
|
||||||
|
GROUP BY MovieID;
|
||||||
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
174
Scripts/DataBaseQueries/db_creation.sql
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
CREATE TABLE IF NOT EXISTS Movies (
|
||||||
|
MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
MovieURI TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS WikiPageIDs (
|
||||||
|
MovieID INTEGER PRIMARY KEY,
|
||||||
|
PageID INTEGER UNIQUE NOT NULL,
|
||||||
|
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
|
||||||
|
MovieID INTEGER PRIMARY KEY,
|
||||||
|
Abstract TEXT NOT NULL,
|
||||||
|
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Origins (
|
||||||
|
OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
OriginName TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Subjects (
|
||||||
|
SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
SubjectURI TEXT UNIQUE NOT NULL,
|
||||||
|
OriginID BIGINT NOT NULL,
|
||||||
|
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Relationships (
|
||||||
|
RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
RelationshipURI TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Objects (
|
||||||
|
ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
ObjectURI TEXT UNIQUE NOT NULL,
|
||||||
|
OriginID BIGINT NOT NULL,
|
||||||
|
FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS RDFs (
|
||||||
|
RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
MovieID INTEGER NOT NULL,
|
||||||
|
SubjectID INTEGER NOT NULL,
|
||||||
|
RelationshipID INTEGER NOT NULL,
|
||||||
|
ObjectID INTEGER NOT NULL,
|
||||||
|
UNIQUE(MovieID, SubjectID, RelationshipID, ObjectID),
|
||||||
|
FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
|
||||||
|
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
|
||||||
|
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
|
||||||
|
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Abbreviations (
|
||||||
|
AbbreviationID INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
URI TEXT UNIQUE NOT NULL,
|
||||||
|
Abbreviation TEXT UNIQUE NOT NULL
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Subjects_Abbreviations (
|
||||||
|
SubjectID INTEGER NOT NULL,
|
||||||
|
AbbreviationID INTEGER NOT NULL,
|
||||||
|
PRIMARY KEY(SubjectID, AbbreviationID),
|
||||||
|
FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
|
||||||
|
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Relationships_Abbreviations (
|
||||||
|
RelationshipID INTEGER NOT NULL,
|
||||||
|
AbbreviationID INTEGER NOT NULL,
|
||||||
|
PRIMARY KEY(RelationshipID, AbbreviationID),
|
||||||
|
FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
|
||||||
|
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE TABLE IF NOT EXISTS Objects_Abbreviations (
|
||||||
|
ObjectID INTEGER NOT NULL,
|
||||||
|
AbbreviationID INTEGER NOT NULL,
|
||||||
|
PRIMARY KEY(ObjectID, AbbreviationID),
|
||||||
|
FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID),
|
||||||
|
FOREIGN KEY(AbbreviationID) REFERENCES Abbreviations(AbbreviationID)
|
||||||
|
);
|
||||||
|
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_sub_abbr_sub_id ON Subjects_Abbreviations(SubjectID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_sub_abbr_abbr_id ON Subjects_Abbreviations(AbbreviationID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rel_abbr_rel_id ON Relationships_Abbreviations(RelationshipID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_rel_abbr_abbr_id ON Relationships_Abbreviations(AbbreviationID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_obj_abbr_obj_id ON Objects_Abbreviations(ObjectID);
|
||||||
|
CREATE INDEX IF NOT EXISTS idx_obj_abbr_abbr_id ON Objects_Abbreviations(AbbreviationID);
|
||||||
|
|
||||||
|
-- Views
|
||||||
|
-- Subjects
|
||||||
|
CREATE VIEW IF NOT EXISTS ParsedSubjects
|
||||||
|
AS
|
||||||
|
SELECT
|
||||||
|
SubjectID,
|
||||||
|
CASE WHEN Abbreviation IS NULL
|
||||||
|
THEN SubjectURI
|
||||||
|
ELSE Abbreviation || ':' || replace(SubjectURI, URI, '') END
|
||||||
|
AS SubjectURI
|
||||||
|
FROM Subjects
|
||||||
|
LEFT JOIN Subjects_Abbreviations USING (SubjectID)
|
||||||
|
LEFT JOIN Abbreviations USING (AbbreviationID);
|
||||||
|
|
||||||
|
-- Relationships
|
||||||
|
CREATE VIEW IF NOT EXISTS ParsedRelationships
|
||||||
|
AS
|
||||||
|
SELECT
|
||||||
|
RelationshipID,
|
||||||
|
CASE WHEN Abbreviation IS NULL
|
||||||
|
THEN RelationshipURI
|
||||||
|
ELSE Abbreviation || ':' || replace(RelationshipURI, URI, '') END
|
||||||
|
AS RelationshipURI
|
||||||
|
FROM Relationships
|
||||||
|
LEFT JOIN Relationships_Abbreviations USING (RelationshipID)
|
||||||
|
LEFT JOIN Abbreviations USING (AbbreviationID);
|
||||||
|
|
||||||
|
-- Objects
|
||||||
|
CREATE VIEW IF NOT EXISTS ParsedObjects
|
||||||
|
AS
|
||||||
|
SELECT
|
||||||
|
ObjectID,
|
||||||
|
CASE WHEN Abbreviation IS NULL
|
||||||
|
THEN ObjectURI
|
||||||
|
ELSE Abbreviation || ':' || replace(ObjectURI, URI, '') END
|
||||||
|
AS ObjectURI
|
||||||
|
FROM Objects
|
||||||
|
LEFT JOIN Objects_Abbreviations USING (ObjectID)
|
||||||
|
LEFT JOIN Abbreviations USING (AbbreviationID);
|
||||||
|
|
||||||
|
|
||||||
|
-- Subject Count
|
||||||
|
CREATE VIEW IF NOT EXISTS SubjectsCountInRDFs
|
||||||
|
AS
|
||||||
|
SELECT SubjectID, count(SubjectID) as Sub_Count
|
||||||
|
FROM RDFs
|
||||||
|
GROUP BY SubjectID;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
-- Relationship Count
|
||||||
|
CREATE VIEW IF NOT EXISTS RelationshipsCountInRDFs
|
||||||
|
AS
|
||||||
|
SELECT RelationshipID, count(RelationshipID) as Rel_Count
|
||||||
|
FROM RDFs
|
||||||
|
GROUP BY RelationshipID;
|
||||||
|
|
||||||
|
|
||||||
|
-- Object Count
|
||||||
|
CREATE VIEW IF NOT EXISTS ObjectsCountInRDFs
|
||||||
|
AS
|
||||||
|
SELECT ObjectID, count(ObjectID) as Obj_Count
|
||||||
|
FROM RDFs
|
||||||
|
GROUP BY ObjectID;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
55
Scripts/DataBaseQueries/query.sql
Normal file
55
Scripts/DataBaseQueries/query.sql
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
-- Insert MovieURI into Movies ; MovieID is auto incremental
|
||||||
|
INSERT INTO Movies (MovieURI) VALUES (?);
|
||||||
|
|
||||||
|
-- Get MovieID where MovieURI equal given value
|
||||||
|
SELECT MovieID FROM Movies WHERE MovieURI = ?;
|
||||||
|
|
||||||
|
-- SetPageId
|
||||||
|
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);
|
||||||
|
|
||||||
|
-- Get MovieId by PageID ... ( to create WikipediaAbstract)
|
||||||
|
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;
|
||||||
|
|
||||||
|
-- SetAbstract ...
|
||||||
|
|
||||||
|
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);
|
||||||
|
|
||||||
|
|
||||||
|
-- SetOrigin
|
||||||
|
---
|
||||||
|
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");
|
||||||
|
|
||||||
|
-- GetOrigin
|
||||||
|
SELECT OriginID FROM Origins WHERE OriginName = ?;
|
||||||
|
|
||||||
|
-- Subject, Relationship, Object, RDF
|
||||||
|
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
|
||||||
|
INSERT INTO Relationships (RelationshipURI) VALUES (?);
|
||||||
|
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);
|
||||||
|
|
||||||
|
SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
|
||||||
|
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
|
||||||
|
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
|
||||||
|
|
||||||
|
|
||||||
|
INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
|
||||||
|
|
||||||
|
-- Prefixes
|
||||||
|
INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);
|
||||||
|
INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);
|
||||||
|
INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);
|
||||||
|
INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);
|
||||||
|
|
||||||
|
-- Please be sure it is a URI before running this query
|
||||||
|
-- and take at least until the domain and the first path part
|
||||||
|
SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;
|
||||||
|
|
||||||
|
-- Query to retrieve data
|
||||||
|
SELECT MovieID, GROUP_CONCAT('<SOT>' || '<SUB>' || SubjectURI || '<REL>' || RelationshipURI || '<OBJ>' || ObjectURI || '<EOT>', '') as RDF_String, Abstract
|
||||||
|
FROM RDFs
|
||||||
|
INNER JOIN ParsedSubjects USING (SubjectID)
|
||||||
|
INNER JOIN ParsedRelationships USING (RelationshipID)
|
||||||
|
INNER JOIN ParsedObjects USING (ObjectID)
|
||||||
|
INNER JOIN WikipediaAbstracts USING (MovieID)
|
||||||
|
-- insert WHERE here
|
||||||
|
GROUP BY MovieID;
|
||||||
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
@ -0,0 +1,186 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "b9081b7c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# This file deletes in the pipeline the unwanted relationship by different rules\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import sqlite3\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
|
||||||
|
"\n",
|
||||||
|
"def get_RDF() -> pd.DataFrame:\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" QUERY = \"SELECT * FROM RDFs \" \\\n",
|
||||||
|
" \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
|
||||||
|
" \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
|
||||||
|
" \"INNER JOIN Objects USING (ObjectID);\"\n",
|
||||||
|
" RDF = pd.read_sql_query(QUERY, CONN)\n",
|
||||||
|
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
|
||||||
|
" RDF = RDF.dropna()\n",
|
||||||
|
" \"\"\"\n",
|
||||||
|
" Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
|
||||||
|
" Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
|
||||||
|
" Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
|
||||||
|
" RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
|
||||||
|
"\n",
|
||||||
|
" # drop '' values \n",
|
||||||
|
" Subjects = Subjects.replace('', np.nan)# .dropna()\n",
|
||||||
|
" Relationships = Relationships.replace('', np.nan)# .dropna()\n",
|
||||||
|
" Objects = Objects.replace('', np.nan)# .dropna()\n",
|
||||||
|
"\n",
|
||||||
|
" # join RDF with its components\n",
|
||||||
|
" RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
|
||||||
|
" RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
|
||||||
|
" RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
|
||||||
|
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
|
||||||
|
" return RDF\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
|
||||||
|
"\n",
|
||||||
|
"def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
|
||||||
|
" return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"RDF = get_RDF()\n",
|
||||||
|
"# RDF = RDF.dropna()\n",
|
||||||
|
"# print(RDF)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "644690bb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
|
||||||
|
" counts = RDF[\"RelationshipURI\"].value_counts() \n",
|
||||||
|
" RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
|
||||||
|
" RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
|
||||||
|
" # counts is a series as key: relationship, value: count\n",
|
||||||
|
" # counts = counts[counts > count_treshold]\n",
|
||||||
|
" # relationships = counts.index\n",
|
||||||
|
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||||
|
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||||
|
" return RDF\n",
|
||||||
|
"\n",
|
||||||
|
"RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
|
||||||
|
"# print(new_RDF)\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "34525be6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
" SubjectURI \\\n",
|
||||||
|
"0 http://dbpedia.org/resource/Nights_of_Cabiria \n",
|
||||||
|
"1 http://dbpedia.org/resource/California_Science... \n",
|
||||||
|
"2 http://dbpedia.org/resource/China_Captain \n",
|
||||||
|
"3 http://dbpedia.org/resource/Caravan_of_Courage... \n",
|
||||||
|
"4 http://dbpedia.org/resource/WHIH_Newsfront \n",
|
||||||
|
"... ... \n",
|
||||||
|
"12725500 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||||
|
"12725501 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||||
|
"12725502 http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
|
||||||
|
"12725503 http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
|
||||||
|
"12725504 http://dbpedia.org/resource/I_Won't_Play \n",
|
||||||
|
"\n",
|
||||||
|
" RelationshipURI \\\n",
|
||||||
|
"0 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"1 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"2 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"3 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||||
|
"4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
|
||||||
|
"... ... \n",
|
||||||
|
"12725500 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725501 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725502 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725503 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"12725504 http://dbpedia.org/ontology/producer \n",
|
||||||
|
"\n",
|
||||||
|
" ObjectURI MovieID \\\n",
|
||||||
|
"0 http://dbpedia.org/resource/Cabiria 26 \n",
|
||||||
|
"1 http://dbpedia.org/resource/California_Academy... 185 \n",
|
||||||
|
"2 http://dbpedia.org/resource/Captain_China 614 \n",
|
||||||
|
"3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n",
|
||||||
|
"4 http://dbpedia.org/resource/Captain_America:_C... 594 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n",
|
||||||
|
"12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n",
|
||||||
|
"12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n",
|
||||||
|
"12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n",
|
||||||
|
"12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n",
|
||||||
|
"\n",
|
||||||
|
" RelationshipFreq MovieFreq \n",
|
||||||
|
"0 2132 216 \n",
|
||||||
|
"1 2132 264 \n",
|
||||||
|
"2 2132 66 \n",
|
||||||
|
"3 2132 131 \n",
|
||||||
|
"4 1653 133 \n",
|
||||||
|
"... ... ... \n",
|
||||||
|
"12725500 80077 95 \n",
|
||||||
|
"12725501 80077 95 \n",
|
||||||
|
"12725502 80077 41 \n",
|
||||||
|
"12725503 80077 98 \n",
|
||||||
|
"12725504 80077 91 \n",
|
||||||
|
"\n",
|
||||||
|
"[12725505 rows x 6 columns]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
|
||||||
|
" counts = RDF[\"MovieID\"].value_counts() \n",
|
||||||
|
" RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
|
||||||
|
" RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
|
||||||
|
" RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
|
||||||
|
" # counts is a series as key: relationship, value: count\n",
|
||||||
|
" # counts = counts[counts > count_treshold]\n",
|
||||||
|
" # relationships = counts.index\n",
|
||||||
|
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||||
|
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||||
|
" return RDF\n",
|
||||||
|
"\n",
|
||||||
|
"RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
|
||||||
|
"print(RDF)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "deep_learning",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.13.7"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
||||||
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
21
Scripts/DataCleaning/data_output_models/bpe_corpus.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
from Scripts.Libs.Utils.dataframe_interaction import get_raw_from_dataframe
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class BPE_corpus():
|
||||||
|
|
||||||
|
def __init__(self, output_path :str):
|
||||||
|
self.output_handler = open(output_path, "w")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
# add corpus end before closing
|
||||||
|
self.output_handler.write(SpecialToken.CORPUS_END.value)
|
||||||
|
self.output_handler.close()
|
||||||
|
|
||||||
|
def write_from_str(self, output: str):
|
||||||
|
if output == '':
|
||||||
|
return
|
||||||
|
self.output_handler.write(output)
|
||||||
|
|
||||||
|
def write_from_df(self, df: pd.DataFrame):
|
||||||
|
self.write_from_str(get_raw_from_dataframe(df))
|
||||||
21
Scripts/DataCleaning/data_output_models/debug_csv.py
Normal file
21
Scripts/DataCleaning/data_output_models/debug_csv.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class Debug_csv():
|
||||||
|
def __init__(self, output_path:str):
|
||||||
|
|
||||||
|
|
||||||
|
self.output = open(output_path, "w")
|
||||||
|
# then the first row as header
|
||||||
|
header = ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
||||||
|
self.output.write(",".join(header) + "\n")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.output.close()
|
||||||
|
|
||||||
|
def write(self, RDF: pd.DataFrame):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
RDF (pd.DataFrame): ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
|
||||||
|
"""
|
||||||
|
|
||||||
|
RDF.to_csv(self.output, index=False, header=False)
|
||||||
@ -0,0 +1,26 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class RDF_completation_task_dataset():
|
||||||
|
"""
|
||||||
|
Write the CSV for the fourth task, which is "Predicting subsequent triples based on a given context".
|
||||||
|
Each RDF is saved as str
|
||||||
|
CSV Composition: ["MovieID","RDF"]
|
||||||
|
"""
|
||||||
|
def __init__(self, output_path:str):
|
||||||
|
|
||||||
|
|
||||||
|
self.output = open(output_path, "w")
|
||||||
|
# then the first row as header
|
||||||
|
header = ["MovieID","RDF"]
|
||||||
|
self.output.write(",".join(header) + "\n")
|
||||||
|
|
||||||
|
def close(self):
|
||||||
|
self.output.close()
|
||||||
|
|
||||||
|
def write(self, RDF: pd.DataFrame):
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
RDF (pd.DataFrame): ["MovieID","RDF"]
|
||||||
|
"""
|
||||||
|
|
||||||
|
RDF.to_csv(self.output, index=False, header=False)
|
||||||
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
58
Scripts/DataCleaning/data_output_models/rdf_mask_task.py
Normal file
@ -0,0 +1,58 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# do not worry about circular dependencies, this class will never call something else
|
||||||
|
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
||||||
|
|
||||||
|
class RDF_mask_task_dataset():
    """
    Write the CSV for the third task: "Predicting a masked component within an RDF triple".

    For each RDF triple three rows are emitted, each with a different component masked.

    CSV Composition: ["MovieID","IncompleteRDF","Missing","RDF"]
    """

    def __init__(self, output_path: str):
        """Open the output CSV, write its header, and bind the triple builders.

        Args:
            output_path (str): path of the CSV file to create (overwritten if present).
        """
        # These helpers are only used by this class, but they belong at a lower level.
        self._build_triple = PipelineApplier.build_triple
        self._build_incomplete_triple = PipelineApplier.build_incomplete_triple

        # newline="" prevents doubled line endings when pandas later writes CSV
        # rows through this text handle on Windows; utf-8 makes output platform-independent.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # First row is the header.
        header = ["MovieID", "IncompleteRDF", "Missing", "RDF"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Release the underlying CSV file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Emit three masked rows per input triple (one per masked component).

        Args:
            RDF (pd.DataFrame): columns ["MovieID","SubjectURI","RelationshipURI","ObjectURI"]
                plus whatever the triple builders expect.
        """
        rdf_complete = self._build_triple(RDF)

        # One incomplete variant per masked component: the dropped column is
        # replaced by the <MASK> token inside build_incomplete_triple.
        rdf_without_subject = self._build_incomplete_triple(RDF.drop(columns=["SubjectURI"]))
        rdf_without_relationship = self._build_incomplete_triple(RDF.drop(columns=["RelationshipURI"]))
        rdf_without_object = self._build_incomplete_triple(RDF.drop(columns=["ObjectURI"]))

        df_subject = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_subject,
            "Missing": RDF["SubjectURI"],
            "RDF": rdf_complete,
        })

        df_relationship = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_relationship,
            "Missing": RDF["RelationshipURI"],
            "RDF": rdf_complete,
        })

        df_object = pd.DataFrame({
            "MovieID": RDF["MovieID"],
            "IncompleteRDF": rdf_without_object,
            "Missing": RDF["ObjectURI"],
            "RDF": rdf_complete,
        })

        output_df = pd.concat([df_subject, df_relationship, df_object], ignore_index=True)
        output_df.to_csv(self.output, index=False, header=False)
|
||||||
|
|
||||||
|
|
||||||
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
26
Scripts/DataCleaning/data_output_models/rdf_text_tasks.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class RDF_text_task_dataset():
    """
    Write the CSV for the first two tasks: "Generating structured RDF triples
    from natural language text" and the reverse direction.

    In the CSV the RDFs of a movie are saved together as a single string.

    CSV Composition: ["MovieID","RDFs","Abstract"]
    """

    def __init__(self, output_path: str):
        """Open the output CSV and write its header row.

        Args:
            output_path (str): path of the CSV file to create (overwritten if present).
        """
        # newline="" prevents doubled line endings when pandas later writes CSV
        # rows through this text handle on Windows; utf-8 makes output platform-independent.
        self.output = open(output_path, "w", encoding="utf-8", newline="")
        # First row is the header.
        header = ["MovieID", "RDFs", "Abstract"]
        self.output.write(",".join(header) + "\n")

    def close(self):
        """Release the underlying CSV file handle."""
        self.output.close()

    def write(self, RDF: pd.DataFrame):
        """Append rows to the open CSV; the header was written by __init__.

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]
        """
        RDF.to_csv(self.output, index=False, header=False)
|
||||||
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramArgs:
    """Plain container for this script's command line options."""

    def __init__(self, file: str, output: str, treshold: int):
        # file: input path; output: destination path; treshold: kept for
        # interface parity (not read by print_dbpedia).
        self.file, self.output, self.treshold = file, output, treshold
|
||||||
|
|
||||||
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the known command line flags and wrap them in a ProgramArgs.

    Unknown arguments are tolerated (parse_known_args), so the raw sys.argv
    can be passed straight in.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    namespace, _unknown = parser.parse_known_args(args)
    return ProgramArgs(namespace.input_file, namespace.output_file, namespace.treshold)
|
||||||
|
|
||||||
|
|
||||||
|
def print_dbpedia(file: str, out: str):
    """Extract distinct dbpedia base URIs from a list of URLs.

    Reads one URL per line from *file* and, for each distinct
    scheme://domain/section prefix whose domain contains "dbpedia", writes a
    CSV-like line:

        "<prefix>/", "<sub>-db<t>"

    where <sub> is the first subdomain truncated to 3 characters and <t> is the
    first character of the path section (e.g. "r" for /resource/).

    Args:
        file (str): input path, one URL per line.
        out (str): output path (overwritten).
    """
    DOMAIN_PART = "dbpedia"
    already_parsed: set[str] = set()

    # "with" guarantees both handles are closed even if processing raises;
    # the original open()/close() pair leaked them on exceptions.
    with open(file, "r", encoding="utf-8") as FILE, \
         open(out, mode="w", encoding="utf-8") as OUT:

        for row in FILE:
            sections = row.split("/")
            sections = list(filter(lambda item: item != "", sections))

            # Need at least scheme, domain and one path section.
            if len(sections) < 3:
                continue

            # Rebuild "scheme://domain/section".
            URI = "/".join(sections[1:3])
            URI = "//".join([sections[0], URI])

            if URI in already_parsed:
                continue

            DOMAIN = sections[1]
            SUBDOMAINS = DOMAIN.split(".")
            TYPE = sections[2][0]

            if DOMAIN_PART not in SUBDOMAINS:
                continue

            already_parsed.add(URI)

            # Short id: first subdomain, truncated to 3 characters.
            SUB_ID = SUBDOMAINS[0]
            if len(SUB_ID) > 3:
                SUB_ID = SUB_ID[:3]

            OUT.write(f"\"{URI}/\", \"{SUB_ID}-db{TYPE}\"\n")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Entry point: parse the CLI flags, then run the extraction.
    ARGS = get_args(sys.argv)
    print_dbpedia(ARGS.file, ARGS.output)
|
||||||
29
Scripts/DataCleaning/hold_out/divide.py
Normal file
29
Scripts/DataCleaning/hold_out/divide.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
def split_csv_by_percent(csv_path, train=70, val=15, test=15, seed=42):
    """Shuffle a CSV and split it into train/validation/test frames.

    The three weights are interpreted relative to their sum, so any positive
    numbers work (70/15/15, 8/1/1, ...). Rounding losses are absorbed by the
    test split, which takes every remaining row, so each row is assigned
    exactly once.

    Args:
        csv_path: path of the CSV file to read.
        train, val, test: relative split weights.
        seed: random state for the reproducible shuffle.

    Returns:
        tuple: (train_df, val_df, test_df), each with a fresh 0-based index.
    """
    # Read and shuffle with a fixed seed for reproducibility.
    shuffled = pd.read_csv(csv_path).sample(frac=1, random_state=seed).reset_index(drop=True)

    weight_sum = train + val + test
    rows = len(shuffled)
    end_train = int(rows * train / weight_sum)          # floor keeps indices integral
    end_val = end_train + int(rows * val / weight_sum)  # test absorbs the remainder

    parts = (
        shuffled.iloc[:end_train],
        shuffled.iloc[end_train:end_val],
        shuffled.iloc[end_val:],
    )
    train_df, val_df, test_df = (p.reset_index(drop=True) for p in parts)
    return train_df, val_df, test_df
|
||||||
|
|
||||||
|
# usage: split the task-1/2 dataset into 80/10/10 hold-out files.
DATASET = "Assets/Dataset/Tmp/rdf_text.csv"
TRAIN = "Assets/Dataset/Tmp/hold_out/train.csv"
TEST = "Assets/Dataset/Tmp/hold_out/test.csv"
EVALUATION = "Assets/Dataset/Tmp/hold_out/evaluation.csv"

train_df, val_df, test_df = split_csv_by_percent(DATASET, train=80, val=10, test=10, seed=42)

# index=False: without it, to_csv prepends the positional index as an unnamed
# first column, which a later pd.read_csv would pick up as "Unnamed: 0".
train_df.to_csv(TRAIN, index=False)
val_df.to_csv(EVALUATION, index=False)
test_df.to_csv(TEST, index=False)
|
||||||
381
Scripts/DataCleaning/legacy/deprecated.py
Normal file
381
Scripts/DataCleaning/legacy/deprecated.py
Normal file
@ -0,0 +1,381 @@
|
|||||||
|
# This file deletes in the pipeline the unwanted relationship by different rules
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# SQL-FIRST VERSION
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# In the original (pandas) version this module:
|
||||||
|
# - stored frequency filters in DataFrames,
|
||||||
|
# - filtered/cleaned DataFrames in-memory,
|
||||||
|
# - added special tokens via string ops,
|
||||||
|
# - rebuilt one row per movie using groupby/aggregation.
|
||||||
|
#
|
||||||
|
# In this rewrite:
|
||||||
|
# - Every transformation RETURNS a SQLAlchemy `Select` object instead of a DataFrame.
|
||||||
|
# - Your pipeline can pass this `Select` (a "dataview") from one stage to the next,
|
||||||
|
# composing more SQL lazily. Nothing is executed until you call `session.execute(...)`.
|
||||||
|
# - Frequency filters are represented as SUBSELECTS, applied with `WHERE IN (subquery)`.
|
||||||
|
#
|
||||||
|
# Notes:
|
||||||
|
# - We keep the same CLASS and METHOD NAMES to preserve call sites.
|
||||||
|
# - Method comments/docstrings from your original file are carried over and updated
|
||||||
|
# to reflect Select-based behavior and return types.
|
||||||
|
# - We drop pandas/numpy/sqlite3 imports because filtering is pushed into SQL.
|
||||||
|
# - `GROUP_CONCAT` is used for the rebuild phase (SQLite-compatible). For other DBs,
|
||||||
|
# swap with an equivalent string-agg function.
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from sqlalchemy import select, func, literal
|
||||||
|
from sqlalchemy.sql import Select
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier():
    """
    SQL-first pipeline applier.

    In the pandas version, frequency filters were stored as DataFrames
    (self.MOVIE_FILTER / self.REL_FILTER) and every method worked with and
    returned pandas.DataFrame. In this SQLAlchemy rewrite:

    - self.MOVIE_FILTER and self.REL_FILTER become *subselects* (Select objects)
      yielding a single column each (MovieID or RelationshipURI), applied via
      `WHERE IN (subquery)`.
    - Every method that previously returned a DataFrame now returns a *Select*
      representing the same logical transformation, pushed into the database
      engine. Nothing executes until `session.execute(...)` is called.
    """

    def __init__(self):
        # In the pandas version these were DataFrames storing allowed keys.
        # Here they are Select objects (single-column subselects) or None.
        # Expected column names:
        #   - self.MOVIE_FILTER: "MovieID"
        #   - self.REL_FILTER:   "RelationshipURI"
        self.MOVIE_FILTER: Optional[Select] = None
        self.REL_FILTER: Optional[Select] = None

    # -------------------------------------------------------------------------
    # Relationship deletion
    # -------------------------------------------------------------------------
    def delete_relationship_by_str(self, RDF: Select, uri: str) -> Select:
        """
        Return a Select where rows having the given relationship URI are removed
        (WHERE RelationshipURI != <uri>).

        Args:
            RDF (Select): selectable with columns MovieID, SubjectURI,
                RelationshipURI, ObjectURI, Abstract
            uri (str): RelationshipURI to exclude

        Returns:
            Select: filtered selectable (no execution yet)
        """
        sc = RDF.selected_columns
        return RDF.where(sc.RelationshipURI != literal(uri))

    # -------------------------------------------------------------------------
    # Frequency filter: MOVIE
    # -------------------------------------------------------------------------
    def generate_frequency_movie_filter(self, MOVIE_COUNT: Select, min_treshold: int, max_treshold: int):
        """
        You MUST call this before filtering by movie frequency
        [filter_by_frequency_movie_id()], since this method creates such filter.

        Builds and stores (in self.MOVIE_FILTER) a single-column subselect of
        the MovieIDs whose Count lies in [min_treshold, max_treshold), to be
        used via WHERE IN. No query is executed here.

        Args:
            MOVIE_COUNT (Select): yields columns MovieID, Count
            min_treshold (int): inclusive lower bound
            max_treshold (int): exclusive upper bound
        """
        sc = MOVIE_COUNT.selected_columns
        filtered = MOVIE_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
        # Keep only the key column so it can be used in an IN (subquery)
        self.MOVIE_FILTER = select(filtered.selected_columns.MovieID)

    # -------------------------------------------------------------------------
    # Frequency filter: RELATIONSHIP
    # -------------------------------------------------------------------------
    def generate_frequency_relationship_filter(self, REL_COUNT: Select, min_treshold: int, max_treshold: int):
        """
        Same as generate_frequency_movie_filter, keyed on RelationshipURI:
        builds and stores (in self.REL_FILTER) a single-column subselect of the
        RelationshipURIs whose Count lies in [min_treshold, max_treshold).
        No query is executed here.

        Args:
            REL_COUNT (Select): yields columns RelationshipURI, Count
            min_treshold (int): inclusive lower bound
            max_treshold (int): exclusive upper bound
        """
        sc = REL_COUNT.selected_columns
        filtered = REL_COUNT.where(sc.Count >= min_treshold).where(sc.Count < max_treshold)
        self.REL_FILTER = select(filtered.selected_columns.RelationshipURI)

    # -------------------------------------------------------------------------
    # Apply frequency filters
    # -------------------------------------------------------------------------
    def filter_by_frequency_movie_id(self, RDF: Select) -> Select:
        """
        Pandas equivalent: RDF = RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

        If self.MOVIE_FILTER is present, applies WHERE MovieID IN (<subselect>);
        otherwise returns RDF unchanged.

        Args:
            RDF (Select): current dataset

        Returns:
            Select: filtered dataset (or unchanged if no filter exists)
        """
        if self.MOVIE_FILTER is None:
            return RDF
        sc = RDF.selected_columns
        return RDF.where(sc.MovieID.in_(self.MOVIE_FILTER))

    def filter_by_frequency_relationship(self, RDF: Select) -> Select:
        """
        Pandas equivalent: RDF = RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

        If self.REL_FILTER is present, applies WHERE RelationshipURI IN (<subselect>);
        otherwise returns RDF unchanged.

        Args:
            RDF (Select): current dataset

        Returns:
            Select: filtered dataset (or unchanged if no filter exists)
        """
        if self.REL_FILTER is None:
            return RDF
        sc = RDF.selected_columns
        return RDF.where(sc.RelationshipURI.in_(self.REL_FILTER))

    # -------------------------------------------------------------------------
    # Token prefixing (SubjectURI/RelationshipURI/ObjectURI)
    # -------------------------------------------------------------------------
    def rdf_add_special_token(self, RDF: Select) -> Select:
        """
        Adds the RDF special token to each element of the tuple: SUBJ to
        SubjectURI, REL to RelationshipURI, OBJ to ObjectURI (see
        Scripts/Libs/CleaningPipeline/special_token.py for the up-to-date
        tokens). Only the three RDF-element tokens are added; no other special
        token.

        Built with SQL string concatenation; returns a new Select keeping the
        output column names
        ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"].

        Args:
            RDF (Select): current dataset

        Returns:
            Select: projected dataset with tokenized SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        subj_tok = literal(SpecialToken.SUBJECT.value) + sc.SubjectURI
        rel_tok = literal(SpecialToken.RELATIONSHIP.value) + sc.RelationshipURI
        obj_tok = literal(SpecialToken.OBJECT.value) + sc.ObjectURI

        return RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            subj_tok.label("SubjectURI"),
            rel_tok.label("RelationshipURI"),
            obj_tok.label("ObjectURI"),
            sc.Abstract.label("Abstract"),
        )

    # -------------------------------------------------------------------------
    # NA/empty drop on key columns (SubjectURI, RelationshipURI, ObjectURI)
    # -------------------------------------------------------------------------
    def drop_na_from_dataset(self, RDF: Select) -> Select:
        """
        Drop rows where any of SubjectURI, RelationshipURI, ObjectURI is empty
        or NULL. The pandas version replaced '' with NaN and used dropna; here
        the check is pushed into WHERE clauses (NOT NULL and != '').

        Args:
            RDF (Select): current dataset

        Returns:
            Select: dataset filtered to non-empty SubjectURI/RelationshipURI/ObjectURI
        """
        sc = RDF.selected_columns
        return RDF.where(
            (sc.SubjectURI.is_not(None)) & (sc.SubjectURI != "") &
            (sc.RelationshipURI.is_not(None)) & (sc.RelationshipURI != "") &
            (sc.ObjectURI.is_not(None)) & (sc.ObjectURI != "")
        )

    # -------------------------------------------------------------------------
    # Rebuild by movie (one row per movie)
    # -------------------------------------------------------------------------
    def rebuild_by_movie(self, RDF: Select) -> Select:
        """
        Collapse the dataset to one row per movie (by design the end result has
        a single row per MovieID).

        Steps (all in SQL):
        - Build per-row "Triple" as SubjectURI + RelationshipURI + ObjectURI,
          wrapped with START_TRIPLE/END_TRIPLE.
        - GROUP_CONCAT with empty separator aggregates the triples per
          (MovieID, Abstract). SQLite syntax — swap for the engine's
          string-aggregation function elsewhere.
        - Prefix the list with START_TRIPLE_LIST and the Abstract with ABSTRACT.

        Args:
            RDF (Select): current dataset with columns
                MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract

        Returns:
            Select: aggregated dataset with columns ["MovieID","Triple","Abstract"]
        """
        sc = RDF.selected_columns

        # Per-row triple with START/END_TRIPLE tokens
        row_triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")

        # Prefixed abstract
        abstract_tok = (literal(SpecialToken.ABSTRACT.value) + sc.Abstract).label("Abstract")

        # Subquery of per-row triples / abstracts
        row_view = RDF.with_only_columns(
            sc.MovieID.label("MovieID"),
            row_triple,
            abstract_tok,
        ).subquery()

        # Concatenate all triples for each movie (SQLite syntax; adjust for other DBs)
        triple_concat = (
            literal(SpecialToken.START_TRIPLE_LIST.value) +
            func.group_concat(row_view.c.Triple, literal(""))
        ).label("Triple")

        return (
            select(
                row_view.c.MovieID.label("MovieID"),
                triple_concat,
                row_view.c.Abstract.label("Abstract"),
            )
            .group_by(row_view.c.MovieID, row_view.c.Abstract)
        )

    # -------------------------------------------------------------------------
    # Build triple(s) projection
    # -------------------------------------------------------------------------
    @staticmethod
    def build_triple(RDF: Select) -> Select:
        """
        Obtains the joined RDF triple in one element, together with the START
        and END special tokens, built with SQL string concatenation.

        Args:
            RDF (Select): at least columns ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: a projection containing one column named "Triple"
        """
        sc = RDF.selected_columns
        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (sc.SubjectURI + sc.RelationshipURI + sc.ObjectURI) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)

    @staticmethod
    def build_incomplete_triple(RDF: Select) -> Select:
        """
        Helper for the third task: "Predicting a masked component within an RDF
        triple". Obtains the joined RDF triple in one element with START/END
        tokens; the MISSING element is replaced by the special token <MASK>
        via COALESCE directly in SQL (an absent component is expected to
        surface as NULL — TODO confirm against the callers).

        Args:
            RDF (Select): 2 of the following columns present
                ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            Select: projection with column "Triple"
        """
        sc = RDF.selected_columns
        mask = literal(SpecialToken.MASK.value)

        triple = (
            literal(SpecialToken.START_TRIPLE.value) +
            (func.coalesce(sc.SubjectURI, mask) +
             func.coalesce(sc.RelationshipURI, mask) +
             func.coalesce(sc.ObjectURI, mask)) +
            literal(SpecialToken.END_TRIPLE.value)
        ).label("Triple")
        return RDF.with_only_columns(triple)

    @staticmethod
    def build_for_mask_task(RDF_incomplete: Select, MISSING) -> None:
        """
        Currently not used; stub kept for API parity.

        Original intention: given one incomplete-RDF input and another with
        just the missing component, apply special tokens accordingly. If
        needed in the future, implement as a Select-building helper that
        merges/COALESCEs columns from different selects.
        """
        return None
|
||||||
148
Scripts/DataCleaning/legacy/fast_filter.py
Normal file
148
Scripts/DataCleaning/legacy/fast_filter.py
Normal file
@ -0,0 +1,148 @@
|
|||||||
|
# This file deletes in the pipeline the unwanted relationship by different rules
|
||||||
|
import pandas as pd
|
||||||
|
import sqlite3 # kept for compatibility
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier:
    """Pandas/NumPy implementation of the cleaning pipeline (fast path).

    Same method names as the legacy filter version, but frequency filters are
    cached as Python sets for O(1) membership checks and string building is
    vectorized with numpy.char.
    """

    def __init__(self):
        # Fast internal caches for O(1) membership checks
        self._MOVIE_FILTER_SET = set()
        self._REL_FILTER_SET = set()

    # ------------------------------
    # Filters
    # ------------------------------
    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Drop every row whose RelationshipURI equals *uri*."""
        # Vectorized boolean mask
        return RDF.loc[RDF["RelationshipURI"] != uri]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        """
        You MUST call this before filtering the dataset by movie frequency
        [filter_by_frequency_movie_id()], since this method creates such filter.

        Keeps the MovieIDs whose Count lies in [min_threshold, max_threshold).

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
        """
        sel = (MOVIE_COUNT["Count"] >= min_threshold) & (MOVIE_COUNT["Count"] < max_threshold)
        self._MOVIE_FILTER_SET = set(MOVIE_COUNT.loc[sel, "MovieID"].tolist())

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_threshold: int, max_threshold: int):
        # Same as the movie filter, keyed on RelationshipURI.
        sel = (REL_COUNT["Count"] >= min_threshold) & (REL_COUNT["Count"] < max_threshold)
        self._REL_FILTER_SET = set(REL_COUNT.loc[sel, "RelationshipURI"].tolist())

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin is the fastest path
        return RDF.loc[RDF["MovieID"].isin(self._MOVIE_FILTER_SET)]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        # Set-backed isin, as in filter_by_frequency_movie_id.
        return RDF.loc[RDF["RelationshipURI"].isin(self._REL_FILTER_SET)]

    # ------------------------------
    # Cleaning & preprocessing
    # ------------------------------
    def rdf_add_special_token(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Adds RDF special token to SubjectURI / RelationshipURI / ObjectURI.
        Returns a new DataFrame (no inplace modification of the caller's object).
        """
        subj = np.char.add(SpecialToken.SUBJECT.value, RDF["SubjectURI"].to_numpy(dtype=object))
        rel = np.char.add(SpecialToken.RELATIONSHIP.value, RDF["RelationshipURI"].to_numpy(dtype=object))
        obj = np.char.add(SpecialToken.OBJECT.value, RDF["ObjectURI"].to_numpy(dtype=object))
        return RDF.assign(SubjectURI=subj, RelationshipURI=rel, ObjectURI=obj)

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """
        Replace '' with NaN only on key columns, then drop rows missing any of them.
        """
        cols = ["SubjectURI", "RelationshipURI", "ObjectURI"]
        rdf = RDF.copy()
        for c in cols:
            m = rdf[c] == ""
            if m.any():
                rdf.loc[m, c] = np.nan
        return rdf.dropna(subset=cols)

    # ------------------------------
    # Building triples
    # ------------------------------
    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """
        Obtains joined RDF triple in one element, together with START and END special token.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value

        subj = RDF["SubjectURI"].to_numpy(dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object)

        # Single vectorized concatenation: start + subj + rel + obj + end.
        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """
        Helper used for the third task: "Predicting a masked component within an RDF triple".
        Accepts any subset of ["SubjectURI","RelationshipURI","ObjectURI"] (typically 2 of 3).
        Missing components are replaced by <MASK>.
        Returns:
            pd.Series: RDF["Triple"] (just this column). Side-effect: sets RDF["Triple"].
        """
        start = SpecialToken.START_TRIPLE.value
        end = SpecialToken.END_TRIPLE.value
        maskv = SpecialToken.MASK.value
        n = len(RDF.index)

        # An absent column is replaced by a full column of <MASK> tokens.
        subj = RDF["SubjectURI"].to_numpy(dtype=object) if "SubjectURI" in RDF else np.full(n, maskv, dtype=object)
        rel = RDF["RelationshipURI"].to_numpy(dtype=object) if "RelationshipURI" in RDF else np.full(n, maskv, dtype=object)
        obj = RDF["ObjectURI"].to_numpy(dtype=object) if "ObjectURI" in RDF else np.full(n, maskv, dtype=object)

        arr = np.char.add(np.char.add(np.char.add(start, subj),
                                      np.char.add(rel, obj)),
                          end)
        RDF["Triple"] = pd.Series(arr, index=RDF.index, dtype=object, name="Triple")
        return RDF["Triple"]

    def rebuild_by_movie(self, RDF: pd.DataFrame):
        """
        Collapse triples + abstract into a single row per movie.
        Returns: ["MovieID","Triple","Abstract"]
        """
        # Build triples once (vectorized); method also sets RDF["Triple"]
        triples = self.build_triple(RDF)

        # Minimal frame for grouping (avoid carrying extra columns)
        tmp = pd.DataFrame({
            "MovieID": RDF["MovieID"].to_numpy(),
            "Abstract": RDF["Abstract"].to_numpy(),
            "Triple": triples.to_numpy(),
        })

        # Factorize high-cardinality keys to fast integer codes, group on codes,
        # then map back to labels; sum concatenates strings for object dtype.
        mid_codes, mid_uniques = pd.factorize(tmp["MovieID"], sort=False)
        abs_codes, abs_uniques = pd.factorize(tmp["Abstract"], sort=False)

        tmp["_mid"] = mid_codes
        tmp["_abs"] = abs_codes

        grouped = tmp.groupby(["_mid", "_abs"], sort=False, as_index=False)["Triple"].sum()

        grouped["MovieID"] = grouped["_mid"].map(lambda i: mid_uniques[i])
        grouped["Abstract"] = grouped["_abs"].map(lambda i: abs_uniques[i])

        # Final tokens
        grouped["Triple"] = SpecialToken.START_TRIPLE_LIST.value + grouped["Triple"]
        grouped["Abstract"] = SpecialToken.ABSTRACT.value + grouped["Abstract"]

        return grouped[["MovieID", "Triple", "Abstract"]]
|
||||||
191
Scripts/DataCleaning/legacy/filter.py
Normal file
191
Scripts/DataCleaning/legacy/filter.py
Normal file
@ -0,0 +1,191 @@
|
|||||||
|
# This file removes unwanted relationships from the pipeline's data according to different rules
|
||||||
|
import pandas as pd
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier():
    """Filters and decorates RDF triple DataFrames for the cleaning pipeline.

    Holds frequency-based filters (movies / relationships) plus an explicit
    RelationshipURI ban list, and knows how to add the special tokens and
    collapse triples into one row per movie.
    """

    def __init__(self):
        # Populated by the generate_frequency_* methods before the
        # corresponding filter_by_frequency_* methods may be used.
        self.MOVIE_FILTER = pd.DataFrame()
        self.REL_FILTER = pd.DataFrame()

    def delete_relationship_by_str(self, RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
        """Drop every row whose RelationshipURI equals `uri`."""
        return RDF[RDF["RelationshipURI"] != uri]

    def generate_list_relationship_filter(self, filter_list: list[str]) -> None:
        """Store the RelationshipURI ban list as a set (O(1) membership)."""
        self.relationship_filter_list: set[str] = set(filter_list)

    def delete_relationship_by_list_filter(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Remove rows whose RelationshipURI is in the stored ban list.

        Call generate_list_relationship_filter() first to create the filter.
        """
        return RDF[~RDF["RelationshipURI"].isin(self.relationship_filter_list)]

    def generate_frequency_movie_filter(self, MOVIE_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Create the movie-frequency filter used by filter_by_frequency_movie_id().

        Keeps movies whose Count lies in [min_treshold, max_treshold).

        Args:
            MOVIE_COUNT (pd.DataFrame): ["MovieID","Count"]
            min_treshold (int): inclusive lower bound on Count.
            max_treshold (int): exclusive upper bound on Count.
        """
        in_range = (MOVIE_COUNT["Count"] >= min_treshold) & (MOVIE_COUNT["Count"] < max_treshold)
        self.MOVIE_FILTER = MOVIE_COUNT[in_range]

    def generate_frequency_relationship_filter(self, REL_COUNT: pd.DataFrame, min_treshold: int, max_treshold: int):
        """Create the relationship-frequency filter used by filter_by_frequency_relationship().

        Keeps relationships whose Count lies in [min_treshold, max_treshold).
        """
        in_range = (REL_COUNT["Count"] >= min_treshold) & (REL_COUNT["Count"] < max_treshold)
        self.REL_FILTER = REL_COUNT[in_range]

    def filter_by_frequency_movie_id(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose MovieID passed the movie-frequency filter."""
        return RDF[RDF["MovieID"].isin(self.MOVIE_FILTER["MovieID"])]

    def filter_by_frequency_relationship(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Keep only rows whose RelationshipURI passed the relationship-frequency filter."""
        return RDF[RDF["RelationshipURI"].isin(self.REL_FILTER["RelationshipURI"])]

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """Prefix SubjectURI / RelationshipURI / ObjectURI with their special tokens.

        Check Scripts/Libs/CleaningPipeline/special_token.py for the
        up-to-date special tokens.  Only the three RDF components are
        decorated here; no other special token is added.

        Args:
            RDF (pd.DataFrame): must contain the three *URI columns.

        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # If an earlier filter sliced the RDF and produced a view, copying
        # here avoids pandas' SettingWithCopyWarning.
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows where SubjectURI, RelationshipURI or ObjectURI is empty or NaN."""
        # Empty strings count as missing values.
        RDF = RDF.replace('', np.nan)
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Collapse per-triple rows into a single row per movie.

        MovieID and Abstract are assumed to map 1 <-> 1.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # Work on a copy so the caller's DataFrame is not mutated
        # (consistent with rdf_add_special_token, avoids SettingWithCopyWarning).
        RDF = RDF.copy()
        # Join each triple into one string, wrapped in START/END tokens.
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # Concatenate all of a movie's triples into one row.
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # Add the list-start and abstract-start tokens.
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    def group_by_movie_from_triple(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Group prebuilt triples into one row per movie.

        Same collapsing step as rebuild_by_movie(), but for frames whose
        "Triple" column was already built (e.g. via build_triple()).

        Args:
            RDF (pd.DataFrame): ["MovieID","Triple","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # MovieID and Abstract are assumed to map 1 <-> 1.
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"]
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"]
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """Join each RDF row into one "Triple" string, together with the
        START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (the input frame is left untouched).
        """
        triple = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        triple = SpecialToken.START_TRIPLE.value + triple + SpecialToken.END_TRIPLE.value
        return triple.rename("Triple")

    @staticmethod
    def build_incomplete_triple(RDF: pd.DataFrame):
        """Join a partial RDF into a "Triple", masking the missing component.

        Helper for the third task: "Predicting a masked component within an
        RDF triple".  Whichever of the three columns is absent is replaced
        by the <MASK> special token.

        Args:
            RDF (pd.DataFrame): 2 of ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (NOT a DataFrame).
        """
        # A full column of MASK tokens; not strictly needed when exactly one
        # component is missing, but more robust (and slower).
        MISSING = pd.Series([SpecialToken.MASK.value] * len(RDF), index=RDF.index)

        triple = (
            RDF.get("SubjectURI", MISSING) +
            RDF.get("RelationshipURI", MISSING) +
            RDF.get("ObjectURI", MISSING))
        triple = SpecialToken.START_TRIPLE.value + triple + SpecialToken.END_TRIPLE.value
        return triple.rename("Triple")

    @staticmethod
    def build_for_mask_task(RDF_incomplete: pd.DataFrame, MISSING: pd.DataFrame) -> pd.DataFrame:
        """Apply special tokens to an incomplete RDF plus its missing component.

        NOTE(review): currently unused and not implemented — kept as a
        placeholder for the mask-prediction task; always returns None.
        """
        return None

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Normalise ObjectURI text: newlines become ", ", asterisks are removed."""
        # Copy to avoid mutating the caller's frame.
        RDF = RDF.copy()
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks
        return RDF
|
||||||
145
Scripts/DataCleaning/legacy/pipeline.py
Normal file
145
Scripts/DataCleaning/legacy/pipeline.py
Normal file
@ -0,0 +1,145 @@
|
|||||||
|
import re
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
from Scripts.DataCleaning.legacy.filter import PipelineApplier
|
||||||
|
# tasks dataset builder
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_mask_task import RDF_mask_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
class Pipeline():
    """Orchestrates the legacy cleaning pipeline.

    Streams RDF rows per movie from the SQL endpoint, cleans/filters them,
    and writes the dataset files for each training task.
    """

    def __init__(self):
        # NOTE(review): constructing the endpoint and running the two COUNT
        # queries touches the database already at construction time.
        self.sql_endpoint = SqlEndpoint()
        # classes to manage the tasks' datasets (output writers)
        self.task_rdf_mask = RDF_mask_task_dataset("./Assets/Dataset/Tmp/rdf_mask.csv")
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # prepare the filter
        # the filter applier needs to know the frequency of Movies and
        # Relationships over the whole dataset
        self.filter_applier = PipelineApplier()
        MOVIE_COUNT = self.sql_endpoint.get_movies_id_count()
        REL_COUNT = self.sql_endpoint.get_relationship_count()
        self.filter_applier.generate_frequency_movie_filter(MOVIE_COUNT,50,3000)
        self.filter_applier.generate_frequency_relationship_filter(REL_COUNT, 50, 2395627) # from 2718 to 3069
        # prepare the filter on the RelationshipURIs you want to delete:
        relationship_uri_banned_list = [
            "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
            "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
            "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
            "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
            "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
            "dbp-dbo:soundRecording"
        ]
        self.filter_applier.generate_list_relationship_filter(relationship_uri_banned_list)

    def execute_task_bpe_corpus(self):
        """Write the BPE corpus: tokenised triple list + abstract per movie."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self.filter_applier.rebuild_by_movie(RDF)
            RDF = RDF[["Triple","Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()

    def execute_task_rdf_mask(self):
        """Write the mask-prediction dataset (cleaned per-triple rows)."""
        for RDF in self._get_cleaned_movie_rows():
            self.task_rdf_mask.write(RDF)
        self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write the RDF -> text dataset (one collapsed row per movie)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self.filter_applier.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write the triple-completion dataset (MovieID + joined Triple)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self.filter_applier.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        self._end_file_handler()

    def execute_all_task(self):
        """Produce all four task datasets in a single pass over the data."""
        for RDF in self._get_cleaned_movie_rows():
            self.task_rdf_mask.write(RDF)

            RDF["Triple"] = self.filter_applier.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])

            RDF = self.filter_applier.group_by_movie_from_triple(RDF[["MovieID","Triple","Abstract"]])

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])

        self._end_file_handler()

    def _end_file_handler(self):
        # Close every task writer, even those the caller did not use.
        self.task_bpe_corpus.close()
        self.task_rdf_mask.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned DataFrame per movie (generator).

        Applies, in order: NaN/empty drop, movie- and relationship-frequency
        filters, the banned-URI list filter, the ObjectURI regex cleanup,
        and finally the special tokens.
        """
        for RDF in self.sql_endpoint.get_abbreviated_dataset_by_movie_id():
            RDF = self.filter_applier.drop_na_from_dataset(RDF)
            RDF = self.filter_applier.filter_by_frequency_movie_id(RDF)
            RDF = self.filter_applier.filter_by_frequency_relationship(RDF)
            # other filters
            RDF = self.filter_applier.delete_relationship_by_list_filter(RDF)
            # regex on ObjectURI
            RDF = self.filter_applier.regex_on_objects(RDF)
            # skip movies whose rows were all filtered away
            if RDF.empty:
                continue
            RDF = self.filter_applier.rdf_add_special_token(RDF) # WARNING, THIS MUST BE DONE AFTER FILTER BY FREQUENCE
            yield RDF

    def use_toy_dataset(self):
        """Restrict the SQL endpoint to ten hand-picked movies (for debugging)."""
        # CHOSEN MOVIES:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self.sql_endpoint.movie_ids = movie_list

    def generate_csv_debug_file(self, debug_path:str):
        """Run the cleaning pipeline and dump every cleaned row to a debug CSV.

        Args:
            debug_path (str): destination path of the debug CSV file.
        """
        debug_csv = Debug_csv(debug_path)

        for RDF in self._get_cleaned_movie_rows():
            debug_csv.write(RDF)

        debug_csv.close()
|
||||||
|
|
||||||
|
|
||||||
|
# There are a lot of settings to manage.
# You only need to change settings:
# - in __init__: file paths, frequency-filter limits, banned RelationshipURIs
# - in use_toy_dataset: the toy dataset's movie list
# - in _get_cleaned_movie_rows: how the pipeline behaves
||||||
|
|
||||||
|
# Guarded so that importing this module does not run the whole pipeline
# (Pipeline() already touches the database in its constructor).
if __name__ == "__main__":
    pipeline = Pipeline()

    pipeline.use_toy_dataset()
    # pipeline.execute_task_bpe_corpus()
    # pipeline.execute_task_rdf_mask()
    # pipeline.execute_tasks_rdf_text()
    # pipeline.execute_task_rdf_completation()
    # pipeline.execute_all_task()
    pipeline.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||||
161
Scripts/DataCleaning/path_splitter_tree.py
Normal file
161
Scripts/DataCleaning/path_splitter_tree.py
Normal file
@ -0,0 +1,161 @@
|
|||||||
|
import argparse
|
||||||
|
import csv
|
||||||
|
import sys
|
||||||
|
from typing import Self
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramArgs:
    """Typed container for this script's command-line arguments."""

    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
        """
        Args:
            file (str): path of the input CSV file.
            csv_uri_header (str): the name of the column of the csv file from
                which the program will get the URIs.
            output (str): path of the output text file.
            treshold (int): minimum node quantity for a node to be printed.
        """
        self.file = file
        self.csv_uri_header = csv_uri_header
        self.output = output
        self.treshold = treshold
|
||||||
|
|
||||||
|
|
||||||
|
class Node:
    """One URI path segment in the tree; children are keyed by segment name."""

    def __init__(self, name: str, quantity: int = 0):
        self.name = name
        # Number of paths that passed through this node (leaves stay at 0).
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        # A node with no children is a leaf.
        return not self.children

    def append_child(self, child: list[str]):
        """Insert the segment path `child` below this node.

        Walks the tree iteratively, creating missing nodes on the way and
        incrementing the quantity of every node traversed (the final segment's
        node itself is not incremented).
        """
        current = self
        for segment in child:
            if segment not in current.children:
                # First time we traverse this branch: create the node.
                current.children[segment] = Node(segment, 0)
            current.quantity += 1
            current = current.children[segment]

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"
|
||||||
|
|
||||||
|
|
||||||
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the command-line flags into a ProgramArgs.

    Unknown tokens are ignored (parse_known_args), so passing the full
    sys.argv — script name included — is tolerated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--header-name", "-c", required=True, type=str)  # c stands for column
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    known, _unknown = parser.parse_known_args(args)

    return ProgramArgs(
        known.input_file,
        known.header_name,
        known.output_file,
        known.treshold,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_debug_args() -> ProgramArgs:
    """Return hard-coded arguments for debugging without a command line.

    Equivalent CLI:
    -i ./Assets/Dataset/1-hop/movies.csv -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
    """
    return ProgramArgs(
        "./Assets/Dataset/1-hop/movies.csv",
        "subject",
        "./Assets/Dataset/Tmp/prova.csv",
        1,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def tree_like(file: str, csv_uri_header: str, out: str, treshold: int | None = None):
    """Write an indented tree of URI path segments found in a CSV column.

    URIs are split on "/"; those starting with a scheme go under the "uri"
    root, the rest under the "pure" root.  Nodes whose quantity is below the
    threshold are pruned together with their whole subtree.

    Args:
        file (str): input CSV path (must have a header row).
        csv_uri_header (str): name of the CSV column holding the URIs.
        out (str): output text file path.
        treshold (int | None): minimum node quantity to print.  When None,
            falls back to the module-level ARGS.treshold — this keeps the
            original behaviour, where the function read the global directly.
    """
    INDENTATION = " "

    # Two artificial roots: URIs with an http(s) scheme vs. bare ("pure") paths.
    properties: dict[str, Node] = {}
    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)

    # It is needed the header-name, hence DictReader.
    with open(file, "r", encoding="utf-8") as csv_file:
        for row in csv.DictReader(csv_file):
            uri_element = row[csv_uri_header]
            sections = [s for s in uri_element.split("/") if s != ""]

            if sections[0] != "http:" and sections[0] != "https:":
                properties["pure"].append_child(sections)
                continue

            properties["URI"].append_child(sections)

    # Backward-compatible fallback to the global ARGS (legacy behaviour).
    min_quantity = ARGS.treshold if treshold is None else treshold

    # Depth-first traversal with an explicit (node, depth) stack.
    stack: list[tuple[Node, int]] = []
    for _, item in properties.items():
        stack.append((item, 0))

    with open(out, mode="w", encoding="utf-8") as out_file:
        while len(stack) > 0:
            node, depth = stack.pop()
            indent = INDENTATION * depth

            # Leaf nodes have quantity 0, so for them to appear the
            # threshold has to be 0.
            if min_quantity > node.quantity:
                continue

            out_file.write(f"{indent}- {node}\n")

            if node.is_leaf:
                continue

            stack.extend((child, depth + 1) for child in node.children.values())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # NOTE(review): tree_like also reads the module-level ARGS for the
    # treshold, so ARGS must be assigned before calling it.
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file,ARGS.csv_uri_header, ARGS.output)
|
||||||
86
Scripts/DataCleaning/pipeline/cleaner.py
Normal file
86
Scripts/DataCleaning/pipeline/cleaner.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
# This file removes unwanted relationships from the pipeline's data according to different rules
|
||||||
|
import pandas as pd
|
||||||
|
import sqlite3
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
|
||||||
|
class PipelineApplier():
    """Cleans RDF DataFrames and assembles the tokenised per-movie rows."""

    def __init__(self):
        # Stateless: every method operates on the DataFrame it receives.
        pass

    def rdf_add_special_token(self, RDF: pd.DataFrame):
        """Prefix SubjectURI / RelationshipURI / ObjectURI with their special tokens.

        Check Scripts/Libs/CleaningPipeline/special_token.py for the
        up-to-date special tokens.  Only the three RDF components are
        decorated here; no other special token is added.

        Args:
            RDF (pd.DataFrame): must contain the three *URI columns.

        Returns:
            pd.DataFrame: ["MovieURI","SubjectURI","RelationshipURI","ObjectURI","Abstract"]
        """
        # If an earlier filter sliced the RDF and produced a view, copying
        # here avoids pandas' SettingWithCopyWarning.
        RDF = RDF.copy()
        RDF["SubjectURI"] = SpecialToken.SUBJECT.value + RDF["SubjectURI"]
        RDF["ObjectURI"] = SpecialToken.OBJECT.value + RDF["ObjectURI"]
        RDF["RelationshipURI"] = SpecialToken.RELATIONSHIP.value + RDF["RelationshipURI"]
        return RDF

    def drop_na_from_dataset(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Drop rows where SubjectURI, RelationshipURI or ObjectURI is empty or NaN."""
        # Empty strings count as missing values.
        RDF = RDF.replace('', np.nan)
        return RDF.dropna(subset=["SubjectURI", "RelationshipURI", "ObjectURI"])

    def rebuild_by_movie(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Collapse per-triple rows into a single row per movie.

        MovieID and Abstract are assumed to map 1 <-> 1.

        Args:
            RDF (pd.DataFrame): ["MovieID","SubjectURI","RelationshipURI","ObjectURI","Abstract"]

        Returns:
            pd.DataFrame: ["MovieID","Triple","Abstract"]
        """
        # Work on a copy so the caller's DataFrame is not mutated
        # (consistent with rdf_add_special_token, avoids SettingWithCopyWarning).
        RDF = RDF.copy()
        # Join each triple into one string, wrapped in START/END tokens.
        RDF["Triple"] = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        RDF["Triple"] = SpecialToken.START_TRIPLE.value + RDF["Triple"] + SpecialToken.END_TRIPLE.value
        # Concatenate all of a movie's triples into one row.
        RDF = RDF.groupby(["MovieID", "Abstract"])["Triple"].apply("".join).reset_index()
        # Add the list-start / abstract-start tokens and close each sentence.
        RDF["Triple"] = SpecialToken.START_TRIPLE_LIST.value + RDF["Triple"] + SpecialToken.END_OF_SENTENCE.value
        RDF["Abstract"] = SpecialToken.ABSTRACT.value + RDF["Abstract"] + SpecialToken.END_OF_SENTENCE.value
        return RDF[["MovieID", "Triple", "Abstract"]]

    @staticmethod
    def build_triple(RDF: pd.DataFrame):
        """Join each RDF row into one "Triple" string, together with the
        START and END special tokens.

        Args:
            RDF (pd.DataFrame): at least ["SubjectURI", "RelationshipURI", "ObjectURI"]

        Returns:
            pd.Series: the "Triple" column (the input frame is left untouched).
        """
        triple = RDF["SubjectURI"] + RDF["RelationshipURI"] + RDF["ObjectURI"]
        triple = SpecialToken.START_TRIPLE.value + triple + SpecialToken.END_TRIPLE.value
        return triple.rename("Triple")

    def regex_on_objects(self, RDF: pd.DataFrame) -> pd.DataFrame:
        """Normalise ObjectURI text: newlines become ", ", asterisks are removed."""
        # Copy to avoid mutating the caller's frame.
        RDF = RDF.copy()
        RDF["ObjectURI"] = (RDF["ObjectURI"].astype("string")
                            .str.replace(r"\r?\n+", ", ", regex=True)  # newlines -> ", "
                            .str.replace(r"\*", "", regex=True))       # delete all asterisks
        return RDF
|
||||||
103
Scripts/DataCleaning/pipeline/movie_filter.py
Normal file
103
Scripts/DataCleaning/pipeline/movie_filter.py
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
class MovieFilter:
    """Narrows a working set of MovieIDs by running successive SQL filters.

    Each *filter* method replaces self.MOVIE_FILTER with the subset of
    movies that survive the corresponding query, so filters compose by
    calling them one after another.
    """

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all movie_id; the filters below progressively shrink this set
        movie_query = "SELECT MovieID FROM Movies"
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(movie_query)

    def frequency_filter(self, min_treshold:int, max_treshold:int):
        """Keep movies whose RDF row count is BETWEEN the two thresholds (SQL BETWEEN is inclusive)."""
        # One "?" placeholder per currently-selected movie.
        movie_list_placeholder = ",".join(["?"] * len(self.MOVIE_FILTER))

        # NOTE(review): the thresholds are interpolated into the SQL via the
        # f-string rather than bound as parameters; acceptable only because
        # they are ints supplied by our own code.
        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING COUNT(*) BETWEEN {min_treshold} AND {max_treshold};
        """
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, tuple(self.MOVIE_FILTER["MovieID"].to_list()))

    def get_movie_id(self):
        # The current filtered DataFrame of MovieIDs.
        return self.MOVIE_FILTER

    def relation_filter(self, parsed_rel_uri: str, min_treshold:int, max_treshold:int):
        """Keep movies whose count of rows with `parsed_rel_uri` is between the thresholds."""
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        # NOTE(review): parsed_rel_uri is f-string-interpolated into the SQL;
        # fine for the internal constants used here, but not injection-safe.
        filter_query = f"""
        SELECT MovieID
        FROM RDFs
        JOIN ParsedRelationships ON ParsedRelationships.RelationshipID = RDFs.RelationshipID
        WHERE MovieID IN ({movie_list_placeholder})
        GROUP BY MovieID
        HAVING SUM(CASE WHEN ParsedRelationships.RelationshipURI = '{parsed_rel_uri}' THEN 1 ELSE 0 END)
        BETWEEN {min_treshold} AND {max_treshold};
        """

        params = tuple(movie_ids) # + (parsed_rel_uri, min_treshold, max_treshold)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def filter_by_director(self):
        """Keep movies that have at least one director relationship."""
        director_list = ['dbp-dbo:director','dbp-dbp:director']

        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        # NOTE(review): `IN {tuple(director_list)}` relies on Python's tuple
        # repr to form the SQL IN-list; it would break for a single-element
        # list (trailing comma) — fine for the two fixed URIs above.
        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        JOIN ParsedRelationships USING (RelationshipID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
        AND ParsedRelationships.RelationshipURI IN {tuple(director_list)};
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def filter_by_english_movies(self):
        """Keep movies whose language triples are all English.

        Two candidate queries are built, but only `other_query` (at least one
        English language value AND no non-English one) is executed;
        `filter_query` (any English value) is currently dead code kept for
        reference.
        """
        movie_ids = self.MOVIE_FILTER["MovieID"].to_list()
        movie_list_placeholder = ",".join(["?"] * len(movie_ids))

        relationship = ["dbp-dbp:language"]
        objects_list = ["English", "dbp-dbr:English_language"]

        # NOTE(review): unused — superseded by other_query below.
        filter_query = f"""
        SELECT DISTINCT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
        AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
        AND ParsedObjects.ObjectURI in {tuple(objects_list)};
        """

        other_query = f"""
        SELECT RDFs.MovieID
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        WHERE RDFs.MovieID IN ({movie_list_placeholder})
        AND ParsedRelationships.RelationshipURI IN ('{relationship[0]}')
        GROUP BY RDFs.MovieID
        HAVING
        SUM(CASE WHEN ParsedObjects.ObjectURI IN {tuple(objects_list)} THEN 1 ELSE 0 END) >= 1
        AND
        SUM(CASE WHEN ParsedObjects.ObjectURI NOT IN {tuple(objects_list)} THEN 1 ELSE 0 END) = 0;
        """

        params = tuple(movie_ids)
        self.MOVIE_FILTER = self.sql_endpoint.get_dataframe_from_query(other_query, params)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
# movie_filter = MovieFilter()
|
||||||
|
# movie_filter.frequency_filter(5,10)
|
||||||
155
Scripts/DataCleaning/pipeline/pipeline.py
Normal file
155
Scripts/DataCleaning/pipeline/pipeline.py
Normal file
@ -0,0 +1,155 @@
|
|||||||
|
from movie_filter import MovieFilter
|
||||||
|
from relationship_filter import RelationshipFilter
|
||||||
|
from rdf_filter import RdfFilter
|
||||||
|
from cleaner import PipelineApplier
|
||||||
|
|
||||||
|
from Scripts.DataCleaning.data_output_models.bpe_corpus import BPE_corpus
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_text_tasks import RDF_text_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.rdf_completation_task import RDF_completation_task_dataset
|
||||||
|
from Scripts.DataCleaning.data_output_models.debug_csv import Debug_csv
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Relationship URIs that carry no useful signal for the tasks (layout/meta
# predicates, wiki bookkeeping, images, coordinates, ...). Every relationship
# in this list is dropped by RelationshipFilter.delete_relationship_uri_by_list.
#
# FIX: a missing comma after "dbp-dbp:infoa" used to implicitly concatenate it
# with "dbp-dbp:infob" into the single bogus entry "dbp-dbp:infoadbp-dbp:infob",
# so neither URI was actually filtered.
RELATIONSHIP_FILTER_LIST = [
    "dbp-dbp:wikiPageUsesTemplate","w3:2000/01/rdf-schema#label","dbp-dbo:abstract",
    "dbp-dbo:wikiPageID","dbp-dbo:wikiPageRevisionID", "dbp-dbo:wikiPageDisambiguates",
    "w3:2002/07/owl#sameAs","dbp-dbp:image","dbp-dbo:wikiPageLength", "w3:2000/01/rdf-schema#comment",
    "dbp-dbo:thumbnail", "foaf:depiction", "w3:1999/02/22-rdf-syntax-ns#type",
    "dbp-dbp:id","dbp-dbp:totalWidth", "w3:ns/prov#wasDerivedFrom", "dbp-dbp:n", "dbp-dbp:alt",
    "dbp-dbo:soundRecording", "dbp-dbp:align", "dbp-dbp:format",
    "dbp-dbp:filename", "dbp-dbp:wikt", "foaf:isPrimaryTopicOf", "dbp-dbp:quote", "foaf:homepage",
    "dbp-dbp:wordnet_type", "dbp-dbp:length","dbp-dbp:caption", "dbp-dbo:imdbId", "dbp-dbp:border", "dbp-dbp:note",
    "dbp-dbp:postalCodeType", "dbp-dbp:extraColumn", "foaf:homepage", "dbp-dbp:bgcolor","dbp-dbp:prevTitle",
    "dbp-dbp:imageUpright", "dbp-dbp:url", "dbp-dbp:italicTitle", "dbp-dbp:imageSize", "dbp-dbp:text",
    "dbp-dbp:captionAlign", "dbp-dbp:headerAlign", "dbp-dbp:height", "dbp-dbp:link", "dbp-dbo:wikiPageInterLanguageLink",
    "w3:2003/01/geo/wgs84_pos#lat", "w3:2003/01/geo/wgs84_pos#long", "http://www.georss.org/georss/point",
    "dbp-dbp:bgcolor", "dbp-dbp:mc", "dbp-dbp:rev3score", "dbp-dbp:rev4score", "dbp-dbp:imageAlt",
    "dbp-dbp:b", "dbp-dbp:s", "dbp-dbp:c", "dbp-dbp:d", "dbp-dbp:m", "dbp-dbp:v", "dbp-dbp:mw", "dbp-dbp:fontsize",
    "dbp-dbp:salign", "dbp-dbp:q", "dbp-dbp:portal", "dbp-dbp:dSearch", "dbp-dbp:header", "w3:2003/01/geo/wgs84_pos#geometry",
    "dbp-dbp:shortsummary", "dbp-dbp:fixAttempted", "dbp-dbo:developer", "dbp-dbp:no", "dbp-dbp:ref", "dbp-dbp:infoa",
    "dbp-dbp:infob", "dbp-dbp:1a", "dbp-dbp:1p", "dbp-dbp:2a", "dbp-dbp:2p", "http://rdvocab.info/RDARelationshipsWEMI/manifestationOfWork",
    "dbp-dbp:isbn", "dbp-dbp:titleWidth", "dbp-dbp:prodcode", "dbp-dbp:page", "w3:2004/02/skos/core#closeMatch",
    "dbp-dbp:colwidth", "dbp-dbp:imagesize", "dbp-dbp:rr", "dbp-dbp:date", "dbp-dbp:type", "dbp-dbp:list",
    "dbp-dbp:listEpisodes", "dbp-dbp:footerAlign", "dbp-dbp:float", "dbp-dbp:bot", "dbp-dbp:p", "dbp-dbp:l", "dbp-dbp:t", "dbp-dbp:j",
    "dbp-dbp:1y", "dbp-dbp:2y", "dbp-dbp:1pp", "dbp-dbp:vgs", "dbp-dbp:3a", "dbp-dbp:3p", "dbp-dbp:3y", "dbp-dbp:4a", "dbp-dbp:4y",
    "dbp-dbp:website"
]
|
||||||
|
|
||||||
|
# Minimal allow-list of predicates, usable as an alternative to the big
# block-list above (see the commented call in Pipeline._get_cleaned_movie_rows).
RELATIONSHIP_WHITE_LIST = [
    "dbp-dbp:director","dbp-dbo:starring", "dbp-dbo:writer", "dbp-dbp:name", "dbp-dbp:genre", "purl:dc/terms/subject"
]

# FIX: a stray, no-op triple-quoted string expression used to follow the list:
#   SELECT DISTINCT field3
#   FROM debug
# It was dead code (an unassigned expression statement) and has been removed;
# the text is preserved here as a comment for reference.
|
||||||
|
|
||||||
|
class Pipeline():
    """End-to-end data-cleaning pipeline.

    Wires the movie/relationship/RDF filters to the cleaning steps and
    materializes the surviving rows into the three task outputs:
    BPE corpus, RDF->text and RDF completation.

    NOTE(review): constructing a Pipeline immediately runs the frequency
    filters below (side effects in __init__) -- confirm this is intended.
    """

    def __init__(self) -> None:
        # Filtering / cleaning collaborators.
        self._movie_filter = MovieFilter()
        self._relationship_filter = RelationshipFilter()
        self._rdf_filter = RdfFilter()
        self._pipeline = PipelineApplier()

        # One writer per downstream task.
        self.task_bpe_corpus = BPE_corpus("./Assets/Dataset/Tmp/corpus.txt")
        self.task_rdf_text = RDF_text_task_dataset("./Assets/Dataset/Tmp/rdf_text.csv")
        self.task_rdf_completation = RDF_completation_task_dataset("./Assets/Dataset/Tmp/rdf_completation.csv")

        # Baseline filtering applied on every run.
        self._movie_filter.frequency_filter(50,3000)
        self._relationship_filter.frequency_filter(25, 2395627) # from 2718 to 3069
        self._relationship_filter.delete_relationship_uri_by_list(RELATIONSHIP_FILTER_LIST)

    def other_filter(self):
        """Apply the optional, stricter movie filters (shrinks the movie set further)."""
        self._movie_filter.relation_filter("purl:dc/terms/subject",5,100)
        self._movie_filter.filter_by_director()
        self._movie_filter.filter_by_english_movies()
        self._movie_filter.relation_filter("dbp-dbp:budget",1,100) # the most important film have relationship budget
        self._movie_filter.relation_filter("dbp-dbp:released",1,100) # to cut to 2000 :(

    def _get_cleaned_movie_rows(self):
        """Yield one cleaned, non-empty RDF DataFrame per surviving movie."""
        movie_ids = self._movie_filter.get_movie_id()
        rel_ids = self._relationship_filter.get_relationship_id()
        # rel_ids = self._relationship_filter.get_relationship_id_from_white_list(RELATIONSHIP_WHITE_LIST)

        for RDF in self._rdf_filter.yield_movie_abbreviated_rdfs(movie_ids,rel_ids):
            RDF = self._pipeline.drop_na_from_dataset(RDF)
            RDF = self._pipeline.regex_on_objects(RDF)
            RDF = self._pipeline.rdf_add_special_token(RDF)

            # A movie can lose all of its rows during cleaning.
            if RDF.empty:
                continue
            yield RDF

    def execute_task_bpe_corpus(self):
        """Write the BPE tokenizer training corpus (Triple + Abstract columns)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            RDF = RDF[["Triple","Abstract"]]
            self.task_bpe_corpus.write_from_df(RDF)
        self._end_file_handler()

    def execute_tasks_rdf_text(self):
        """Write the RDF -> text task dataset."""
        for RDF in self._get_cleaned_movie_rows():
            RDF = self._pipeline.rebuild_by_movie(RDF)
            self.task_rdf_text.write(RDF)
        self._end_file_handler()

    def execute_task_rdf_completation(self):
        """Write the RDF completation task dataset (MovieID + built Triple)."""
        for RDF in self._get_cleaned_movie_rows():
            RDF["Triple"] = self._pipeline.build_triple(RDF)
            self.task_rdf_completation.write(RDF[["MovieID","Triple"]])
        self._end_file_handler()

    def _end_file_handler(self):
        """Close every task writer (safe to call after any single task)."""
        self.task_bpe_corpus.close()
        self.task_rdf_text.close()
        self.task_rdf_completation.close()

    def execute_all_task(self):
        """Produce all three task outputs in a single pass over the data."""
        for RDF in self._get_cleaned_movie_rows():
            # Completation needs the per-row frame, so copy before rebuilding.
            completation_RDF = RDF.copy()
            completation_RDF["Triple"] = self._pipeline.build_triple(completation_RDF)
            self.task_rdf_completation.write(completation_RDF[["MovieID","Triple"]])

            RDF = self._pipeline.rebuild_by_movie(RDF)

            self.task_rdf_text.write(RDF)
            self.task_bpe_corpus.write_from_df(RDF[["Triple","Abstract"]])

        self._end_file_handler()

    def use_toy_dataset(self):
        """Replace the movie filter result with a small hand-picked movie set."""
        # CHOOSEN MOVIE:
        # The Dark Knight : 117248
        # Inception : 147074
        # The Avengers : 113621
        # Cast Away : 1123
        # The Departed : 117586
        # American Psycho : 90177
        # Avatar : 71587
        # Django Unchained : 138952
        # Spirited Away : 144137
        # Knives Out : 148025
        # [106465,106466,106467,106468,106469,106470,106471,106472,106473]
        movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        self._movie_filter.MOVIE_FILTER = pd.DataFrame({"MovieID": movie_list})

    def generate_csv_debug_file(self, debug_path:str):
        """Dump every cleaned RDF row to a CSV for manual inspection."""
        debug_csv = Debug_csv(debug_path)

        for RDF in self._get_cleaned_movie_rows():
            debug_csv.write(RDF)

        debug_csv.close()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: building the Pipeline already applies the baseline
# frequency filters (see Pipeline.__init__).
pipe = Pipeline()
#pipe.use_toy_dataset()
pipe.other_filter()
# pipe.execute_all_task()
pipe.generate_csv_debug_file("Assets/Dataset/Tmp/debug.csv")
|
||||||
32
Scripts/DataCleaning/pipeline/rdf_filter.py
Normal file
32
Scripts/DataCleaning/pipeline/rdf_filter.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
class RdfFilter:
    """Streams the joined (abbreviated-URI) RDF rows of each movie from SQL."""

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()

    # def delete_hyperum_when_movie(self):
    #   purl:linguistics/gold/hypernym is almost always "dbp-dbr:Movie" or
    #   "dbp-dbr:Film", i.e. a banned triple.

    def yield_movie_abbreviated_rdfs(self, MOVIE_ID: pd.DataFrame, REL_ID: pd.DataFrame):
        """Yield one DataFrame per movie in MOVIE_ID, restricted to the
        relationships listed in REL_ID (both one-column DataFrames)."""
        rel_id_values = tuple(REL_ID["RelationshipID"].to_list())
        placeholders = ",".join(["?"] * len(REL_ID))

        sql = f"""
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?) AND RelationshipID IN ({placeholders});
        """

        for current_movie_id in MOVIE_ID["MovieID"].to_list():
            bound = (current_movie_id,) + rel_id_values
            yield self.sql_endpoint.get_dataframe_from_query(sql, params=bound)
|
||||||
54
Scripts/DataCleaning/pipeline/relationship_filter.py
Normal file
54
Scripts/DataCleaning/pipeline/relationship_filter.py
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
import pandas as pd
|
||||||
|
from Scripts.Libs.CleaningPipeline.sql_endpoint import SqlEndpoint
|
||||||
|
|
||||||
|
class RelationshipFilter:
    """Maintains the surviving set of RelationshipIDs.

    RELATIONSHIP_FILTER is a one-column DataFrame of RelationshipID; each
    filter method narrows it (or, for the white-list getter, returns a subset).
    """

    def __init__(self) -> None:
        self.sql_endpoint = SqlEndpoint()
        # first obtain all relationship_id
        relationship_query = "SELECT RelationshipID FROM Relationships"
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(relationship_query)

    def frequency_filter(self, min_treshold:int, max_treshold:int):
        """Keep only relationships whose usage count in RDFs lies in
        [min_treshold, max_treshold] (inclusive).

        FIX: the thresholds are now bound as SQL parameters instead of being
        interpolated into the query text with an f-string (safer, and robust
        against non-integer input).
        """
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))

        filter_query = f"""
        SELECT RelationshipID
        FROM RDFs
        WHERE RelationshipID IN ({ids_placeholder})
        GROUP BY RelationshipID
        HAVING COUNT(*) BETWEEN ? AND ?;
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + (min_treshold, max_treshold)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def get_relationship_id(self):
        """Return the current surviving RelationshipID DataFrame."""
        return self.RELATIONSHIP_FILTER

    def get_relationship_id_from_white_list(self, relationship_list: list[str]):
        """Return the subset of surviving ids whose URI is in relationship_list."""
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(relationship_list))
        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
        AND RelationshipURI IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(relationship_list)
        return self.sql_endpoint.get_dataframe_from_query(filter_query, params)

    def delete_relationship_uri_by_list(self, filter_list: list[str]):
        """Drop every surviving relationship whose URI appears in filter_list."""
        ids_placeholder = ",".join(["?"] * len(self.RELATIONSHIP_FILTER))
        uri_placeholder = ",".join(["?"] * len(filter_list))

        filter_query = f"""
        SELECT RelationshipID
        FROM ParsedRelationships
        WHERE RelationshipID IN ({ids_placeholder})
        AND RelationshipURI NOT IN ({uri_placeholder});
        """
        params = tuple(self.RELATIONSHIP_FILTER["RelationshipID"].to_list()) + tuple(filter_list)
        self.RELATIONSHIP_FILTER = self.sql_endpoint.get_dataframe_from_query(filter_query, params)
|
||||||
53
Scripts/DataGathering/analysis.py
Normal file
53
Scripts/DataGathering/analysis.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramArgs:
    """Parsed command-line options for the analysis script."""

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        # Plain value holder: one attribute per CLI option.
        self.input_file = input_file
        self.output_file = output_file
        self.count = count
        self.column = column
|
||||||
|
|
||||||
|
|
||||||
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse the CLI flags into a ProgramArgs bundle; unknown flags are ignored."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "--input", "-i", required=True, type=str)
    parser.add_argument("--output-file", "--output", "-o", required=True, type=str)
    parser.add_argument("--column", "--col", required=True, type=str)
    # --count is a flag: present -> True, absent -> False.
    parser.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    known, _ = parser.parse_known_args(args)

    return ProgramArgs(
        known.input_file, known.column, known.output_file, known.count
    )
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    ARGS = get_args(sys.argv)

    # Load the CSV
    df = pd.read_csv(ARGS.input_file)

    # Count occurrences of each unique value in the requested column
    item_counts = df[ARGS.column].value_counts()

    # FIX: the output file used to be opened at the top and never closed;
    # a context manager now guarantees it is flushed and closed.
    with open(ARGS.output_file, "w+", encoding="utf-8") as OUTPUT_FILE:
        # Write one line per distinct value, optionally with its count.
        for item, count in item_counts.items():
            if ARGS.count:
                OUTPUT_FILE.write(f"{item}: {count}\n")
            else:
                OUTPUT_FILE.write(f"{item}\n")
|
||||||
146
Scripts/DataGathering/fetchdata.py
Normal file
146
Scripts/DataGathering/fetchdata.py
Normal file
@ -0,0 +1,146 @@
|
|||||||
|
import argparse
|
||||||
|
from math import floor
|
||||||
|
import sys
|
||||||
|
from time import sleep
|
||||||
|
import SPARQLWrapper
|
||||||
|
|
||||||
|
|
||||||
|
class ProgramData:
    """Configuration bundle for the SPARQL fetcher, built from CLI flags."""

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        # local_url: output file the fetched pages are appended to.
        self.local_url = local_url
        # query_url: path of the file containing the SPARQL query text.
        self.query_url = query_url
        # sparql_url: endpoint to query (defaults to DBpedia).
        self.sparql_url = sparql_url
        # output_type: SPARQLWrapper return format (CSV as passed by gather_cli_args).
        self.output_type = output_type
        self.initial_offset = initial_offset
        # timeout: seconds slept between page requests.
        self.timeout = timeout
        # limit: page size (rows per request).
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        # NOTE(review): this returns `limit`, not an offset. fetch_data starts
        # at page -1, so offset + (page * limit) == 0 on the first iteration --
        # the property seems to exist to make that arithmetic work. Confirm
        # before renaming or "fixing" it.
        return self.limit

    @property
    def query(self):
        """Read and return the SPARQL query text from query_url (re-read per access)."""
        with open(self.query_url, "r") as file:
            return file.read()
|
||||||
|
|
||||||
|
|
||||||
|
# Default configuration; each value can be overridden by a CLI flag
# (see gather_cli_args).
DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5   # pause between page requests (be polite to the endpoint)
LIMIT = int(1E4)        # rows per page
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)    # effectively "no page cap"
|
||||||
|
|
||||||
|
|
||||||
|
def gather_cli_args(args: list[str]) -> ProgramData:
    """Parse CLI flags into a ProgramData bundle; unknown flags are ignored."""
    # TODO: Add argument for type
    parser = argparse.ArgumentParser("sparql data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str)
    parser.add_argument("--url", type=str, default=DBPEDIA_URL)
    parser.add_argument("--limit", type=int, default=LIMIT)
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES)
    parser.add_argument("--verbose", "-v", action="count", default=0)

    known, _ = parser.parse_known_args(args)

    return ProgramData(
        known.file_path,
        known.query_file,
        known.url,
        SPARQLWrapper.CSV,
        known.offset,
        known.timeout,
        known.limit,
        known.max_pages,
        known.verbose,
    )
|
||||||
|
# type: ignore
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_data(DATA: ProgramData):
    """Page through the SPARQL endpoint and append each CSV page to DATA.local_url.

    FIX: this loop previously read the module-level constants LIMIT,
    TIMEOUT_SECONDS, MAX_PAGES and TYPE, silently ignoring the --limit,
    --timeout and --max-pages CLI flags; it now uses DATA's values.
    Also renamed the loop flag (`exit` shadowed the builtin).
    """
    # Take correction of page into account: DATA.offset equals DATA.limit,
    # so starting at page -1 makes the first CURRENT_OFFSET come out at 0.
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    done = False

    while not done:

        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)

        sparql.setReturnFormat(DATA.output_type)

        # Append paging clauses to the base query read from the query file.
        CURRENT_PAGE_QUERY = "\n".join([
            DATA.query,
            f"LIMIT {DATA.limit}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

            if type(res) == bytes:
                # Keep the CSV header line only on the first page.
                first_line = 0
                if page != 0:
                    first_line = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[first_line:])

            # An empty page means the result set is exhausted.
            if text == "":
                done = True
                continue

            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(
                    text
                )

        except Exception as ex:
            # Best-effort: log and retry the next page after the sleep below.
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {DATA.timeout}")

        page += 1

        if page == DATA.max_pages - 1:
            done = True

        sleep(DATA.timeout)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Parse CLI flags, then stream the endpoint's pages to disk.
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)
|
||||||
154
Scripts/DataGathering/wikipedia_gathering.py
Normal file
154
Scripts/DataGathering/wikipedia_gathering.py
Normal file
@ -0,0 +1,154 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
import csv
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# 1-hop input (movie URI -> wikipedia page id) and the summary CSV this
# script appends to.
input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"


# Shared HTTP session so chunked API calls reuse the connection.
sess = requests.Session()

# Number of page ids requested per Wikipedia API call.
CHUNK = 20
|
||||||
|
|
||||||
|
|
||||||
|
# Function to get clean full text from Wikipedia PageID
|
||||||
|
def get_clean_text(pageIDS: list[str]):
    """Fetch the intro extract for each Wikipedia page id in *pageIDS*.

    Returns a dict mapping pageID -> cleaned (stripped, newline-free)
    extract text; ids with no page or no extract are skipped and counted.
    NOTE(review): the API caps how many pageids fit in one request --
    CHUNK=20 appears to stay under it, confirm against the API docs.
    """
    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0"
        ""
        " (https://example.org/coolbot/; coolbot@example.org)"
    }

    # The API takes multiple pageids separated by "|".
    ids = "|".join(pageIDS)

    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")

    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    # Normalize: trim and collapse the abstract to one line.
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts
|
||||||
|
|
||||||
|
|
||||||
|
def flush(movie_ids):
    """Fetch the abstracts for *movie_ids* and append them to output_csv."""

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        # One row per page id that actually had an extract.
        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def reconcile() -> int:
    """Return how many input_csv lines were already processed on a previous run.

    Scans input_csv for the page id matching the last id written to
    output_csv and returns the number of lines to skip (header included).

    FIXES: file handles are now closed via `with` even on error, and the
    first page id read is newline-stripped like the ones read inside the
    loop (previously the very first comparison could never match when the
    id was the line's last column).
    NOTE(review): still loops until StopIteration-style errors if
    LAST_CHECKED never appears in input_csv -- consider a bounded scan.
    """
    start = time.time()
    with open(input_csv, "r", newline="", encoding="utf-8") as input_file, \
         open(output_csv, "r", newline="", encoding="utf-8") as output_file:

        next(input_file)  # skip the input header
        # Last page id successfully written to the output file.
        LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
        current_check = input_file.readline().split(",")[1].replace("\n", "")

        index = 1

        while current_check != LAST_CHECKED:
            current_check = input_file.readline().split(",")[1].replace("\n", "")
            index += 1

    end = time.time()


    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index
|
||||||
|
|
||||||
|
|
||||||
|
if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


# NOTE(review): reconcile() also runs right after a fresh, header-only
# output file was created above -- confirm it behaves in that case.
SKIP = reconcile()


# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        # First remaining line is the header (when nothing was skipped).
        if index == 0:
            continue

        # Save movies in map
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush movies
            flush(movie_ids)
            movie_ids = []

# NOTE(review): ids left in movie_ids after the loop (row count not a
# multiple of CHUNK) are never flushed -- confirm this is intended.
||||||
26
Scripts/DatasetMerging/datasetInfo.md
Normal file
26
Scripts/DatasetMerging/datasetInfo.md
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
# HOW THE DATASET IS BUILT AND POPULATED
|
||||||
|
|
||||||
|
Note: the data are taken from CSV files in 1-hop
|
||||||
|
|
||||||
|
## CSV files composition
|
||||||
|
|
||||||
|
| CSV files | Original structure | Saved AS |
|
||||||
|
|--------------------|---------------------------------------|-------------------------------------|
|
||||||
|
| Wikipedia-summary   | PageId / abstract                     | subject, text                       |
|
||||||
|
| Movies | Movie URI | "subject" |
|
||||||
|
| Dataset | Movie URI / Relationship / Object [RDF] | subject, relationship, object |
|
||||||
|
| Movies-PageId | Movie URI / PageId (wiki) | "subject", "object" |
|
||||||
|
| Reverse | Subject / Relationship / Movie URI | "subject", "relationship", "object" |
|
||||||
|
|
||||||
|
## Wanted tables schema
|
||||||
|
|
||||||
|
| Table | Columns |
|
||||||
|
|---------------|-------------------------------------------------------------------------|
|
||||||
|
| Movies | MovieID [PK], Movie URI |
|
||||||
|
| WikiPageIDs | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)* |
|
||||||
|
| Abstracts | MovieID [PK, FK], abstract |
|
||||||
|
| Subjects | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
|
||||||
|
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
|
||||||
|
| Objects | ObjectID [PK], RDF Object, OriginID [FK] |
|
||||||
|
| Origins | OriginID [PK], Origin Name |
|
||||||
|
| RDFs | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |
|
||||||
633
Scripts/DatasetMerging/datawarehouse.py
Normal file
633
Scripts/DatasetMerging/datawarehouse.py
Normal file
@ -0,0 +1,633 @@
|
|||||||
|
import sqlite3
|
||||||
|
import csv
|
||||||
|
|
||||||
|
#####################################################################
# This file builds DatawareHouse/dataset.db from 1-hop csv files    #
# Its schema is in ./SQL_Queries/db_creation.sql                    #
# The sql query used to populate it in ./SQL_Queries/query.sql      #
#####################################################################

# sometimes you may need to build a new db file, here a little snippet for you
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
# Target database and the 1-hop CSV sources used to populate it.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"

# NOTE(review): these handles stay open for the whole run and are never
# explicitly closed -- tolerable for a one-shot script, but worth a cleanup.
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")


# Single shared connection/cursor used by the module-level helpers below.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()

# MARK: SQL Definitions
# Insert MovieURI
|
||||||
|
|
||||||
|
|
||||||
|
def insertOrigin(curs: sqlite3.Cursor) -> bool:
    """Seed the Origins table with the two known CSV origins.

    Returns True on success, False when the rows already exist
    (unique-constraint violation)."""
    sql = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(sql)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [originName])
|
||||||
|
originId = curs.fetchone()
|
||||||
|
if not originId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return originId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    """Insert *movieUri* into Movies; False when it already exists."""
    try:
        curs.execute("INSERT INTO Movies (MovieURI) VALUES (?);", [movieUri])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [movieUri])
|
||||||
|
movieId = curs.fetchone()
|
||||||
|
if not movieId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return movieId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Link *movieId* to its Wikipedia *pageId*; False on constraint violation."""
    try:
        curs.execute("INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);", [movieId, pageId])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [pageId])
|
||||||
|
movieId = curs.fetchone()
|
||||||
|
if not movieId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return movieId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store the Wikipedia *abstract* of *movieId*; False on constraint violation."""
    try:
        curs.execute("INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);", [movieId, abstract])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Insert an RDF subject with its source origin; False when duplicated."""
    try:
        curs.execute("INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);", [subjectURI, originID])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Insert an RDF relationship URI; False when it already exists."""
    try:
        curs.execute("INSERT INTO Relationships (RelationshipURI) VALUES (?);", [relationshipURI])
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Insert an RDF object with its source origin.

    Returns True on success, False on a unique-constraint violation.

    FIX: table name normalized from "objects" to "Objects" for consistency
    with every other query in this module (SQLite treats table names
    case-insensitively, so behavior is unchanged).
    """
    QUERY = "INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False
|
||||||
|
|
||||||
|
|
||||||
|
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [subjectURI])
|
||||||
|
subjectId = curs.fetchone()
|
||||||
|
if not subjectId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return subjectId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [relationshipURI])
|
||||||
|
relationshipId = curs.fetchone()
|
||||||
|
if not relationshipId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return relationshipId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
||||||
|
|
||||||
|
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
|
||||||
|
|
||||||
|
curs.execute(QUERY, [objectURI])
|
||||||
|
objectId = curs.fetchone()
|
||||||
|
if not objectId:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return objectId[0]
|
||||||
|
|
||||||
|
|
||||||
|
def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
) -> bool:
    """Insert one fully-resolved RDF triple; False on a constraint violation."""
    statement = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
        curs.execute(statement, [movieId, subjectId, relationshipId, objectId])
    except sqlite3.IntegrityError:
        # duplicate triple or broken foreign key
        return False
    return True
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_abbreviation(uri, abbreviation) -> bool:
|
||||||
|
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [uri, abbreviation])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [object_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [relationship_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = (
|
||||||
|
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [subject_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def select_abbreviation_id(uri) -> int | None:
|
||||||
|
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
|
||||||
|
CURS.execute(QUERY, [uri])
|
||||||
|
abbreviation_id = CURS.fetchone()
|
||||||
|
if not abbreviation_id:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return abbreviation_id[0]
|
||||||
|
|
||||||
|
|
||||||
|
# MARK: Parsing
|
||||||
|
def parseMovies():
    """Load every movie URI from the movies CSV into the Movies table."""
    reader = csv.reader(MOVIES_CSV_HANDLER)
    next(reader)  # drop the header row
    for record in reader:
        # first column holds the movie URI
        insertMovie(CURS, record[0])
|
||||||
|
|
||||||
|
|
||||||
|
def parseWikiPageId():
    """Attach each Wikipedia page id to its movie row, skipping unknown movies."""
    reader = csv.DictReader(PAGEID_CSV_HANDLER)
    for record in reader:
        movie_uri = record["subject"]
        page_id = int(record["object"])
        movie_id = selectMovieId(CURS, movie_uri)
        # guard: the movie URI must already exist in the Movies table
        if movie_id is None:
            print(f"The MovieUri: {movie_uri} has not a MovieId ")
            continue
        insertWikiPageId(CURS, movie_id, page_id)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbstract():
    """Store each Wikipedia abstract against the movie found via its page id."""
    reader = csv.DictReader(SUMMARY_CSV_HANDLER)
    for record in reader:
        page_id = int(record["subject"])
        abstract = record["text"]
        movie_id = selectMovieIdFromWikiPageId(CURS, page_id)
        # guard: the page id must map back to a known movie
        if movie_id is None:
            print(f"The WikiPageId: {page_id} has not a MovieId ")
            continue
        insertWikiAbstract(CURS, movie_id, abstract)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbreviations():
    """Load the URI -> abbreviation table from its CSV source."""
    for record in csv.DictReader(URI_ABBR_CSV_HANDLER):
        insert_abbreviation(record["uri"], record["abbreviation"])
|
||||||
|
|
||||||
|
|
||||||
|
def parseRDF_Reverse():
    """Ingest reverse.csv triples and link them into the RDFs table.

    In the reverse file the movie sits on the *object* side of each triple.
    Prints the running total of successfully linked triples at the end.
    """
    reader = csv.DictReader(REVERSE_CSV_HANDLER)
    origin_id = selectOrigin(CURS, "reverse.csv")
    if origin_id is None:
        return

    total = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]
        print(f"RDF triplets:\n\t{subject} - {relationship} - {obj}")

        # duplicates are silently rejected by the insert helpers
        insertSubject(CURS, subject, origin_id)
        insertRelationship(CURS, relationship)
        insertObject(CURS, obj, origin_id)

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)
        # reverse triples: the movie URI is the object
        movie_id = selectMovieId(CURS, obj)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if movie_id is None:
            print(f"No MovieId for {obj}")
            missing = True
        if missing:
            continue

        if insertRDF(CURS, movie_id, subject_id, relationship_id, object_id):  # type: ignore
            total += 1

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseRDF_Dataset():
    """Ingest dataset.csv triples and link them into the RDFs table.

    In the dataset file the movie sits on the *subject* side of each triple.
    Logs progress every 100k rows and prints the linked-triple total at the end.
    """
    reader = csv.DictReader(DATASET_CSV_HANDLER)
    origin_id = selectOrigin(CURS, "dataset.csv")
    if origin_id is None:
        return

    total = 0
    row_count = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]

        row_count += 1
        if row_count % 100000 == 0:
            print(f"RDF number {row_count}:\n\t{subject} - {relationship} - {obj}")

        # duplicates are silently rejected by the insert helpers
        insertSubject(CURS, subject, origin_id)
        insertRelationship(CURS, relationship)
        insertObject(CURS, obj, origin_id)

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)
        # dataset triples: the movie URI is the subject
        movie_id = selectMovieId(CURS, subject)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if movie_id is None:
            print(f"No MovieId for {subject}")
            missing = True
        if missing:
            continue

        if insertRDF(CURS, movie_id, subject_id, relationship_id, object_id):  # type: ignore
            total += 1

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbr_Reverse():
    """Link every reverse.csv entity to its longest matching abbreviation.

    For each triple, URI path prefixes are probed from longest (7 segments)
    down to a per-kind floor; the first hit is linked and the probe stops.
    Prints the number of links created.
    """

    def _link(sections, entity_id, insert_fn, floor):
        # Probe ever-shorter "/"-joined prefixes against the Abbreviations
        # table; return 1 when a new link row is inserted, else 0.
        if len(sections) <= 4:
            return 0
        idx = min(len(sections), 7)
        while idx > floor:
            pattern = "/".join(sections[0:idx]) + "%"
            abbr_id = select_abbreviation_id(pattern)
            if abbr_id is not None:
                # stop at the first (longest) matching prefix
                return 1 if insert_fn(entity_id, abbr_id) else 0
            idx -= 1
        return 0

    reader = csv.DictReader(REVERSE_CSV_HANDLER)
    if selectOrigin(CURS, "reverse.csv") is None:
        return

    total = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if missing:
            continue

        # relationships use a lower floor (shorter URIs) than subjects/objects
        total += _link(subject.split("/"), subject_id, insert_subject_abbreviation, 3)
        total += _link(relationship.split("/"), relationship_id, insert_relationship_abbreviation, 2)
        total += _link(obj.split("/"), object_id, insert_object_abbreviation, 3)

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbr_Dataset():
    """Link every dataset.csv entity to its longest matching abbreviation.

    Same prefix-probing scheme as the reverse pass, with a progress line
    every 100k rows. Prints the number of links created.
    """

    def _link(sections, entity_id, insert_fn, floor):
        # Probe ever-shorter "/"-joined prefixes against the Abbreviations
        # table; return 1 when a new link row is inserted, else 0.
        if len(sections) <= 4:
            return 0
        idx = min(len(sections), 7)
        while idx > floor:
            pattern = "/".join(sections[0:idx]) + "%"
            abbr_id = select_abbreviation_id(pattern)
            if abbr_id is not None:
                # stop at the first (longest) matching prefix
                return 1 if insert_fn(entity_id, abbr_id) else 0
            idx -= 1
        return 0

    reader = csv.DictReader(DATASET_CSV_HANDLER)
    if selectOrigin(CURS, "dataset.csv") is None:
        return

    total = 0
    row_count = 0
    for record in reader:
        subject = record["subject"]
        relationship = record["relationship"]
        obj = record["object"]

        row_count += 1
        if row_count % 100000 == 0:
            print(f"RDF number {row_count}:\n\t{subject} - {relationship} - {obj}")

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id before deciding to skip
        missing = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            missing = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            missing = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            missing = True
        if missing:
            continue

        # relationships use a lower floor (shorter URIs) than subjects/objects
        total += _link(subject.split("/"), subject_id, insert_subject_abbreviation, 3)
        total += _link(relationship.split("/"), relationship_id, insert_relationship_abbreviation, 2)
        total += _link(obj.split("/"), object_id, insert_object_abbreviation, 3)

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
# MARK: Actual Code
# The pipeline is run one stage at a time by hand; each stage stays commented
# out once its data has been loaded into the database.
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseAbbreviations()
# parseRDF_Reverse()
# parseRDF_Dataset()
# parseAbbr_Reverse()
parseAbbr_Dataset()


# Persist all pending writes and release the database connection.
CONN.commit()
CONN.close()


# Close every CSV file handle opened at module level.
MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
URI_ABBR_CSV_HANDLER.close()
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
|
||||||
|
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
|
||||||
|
"""
|
||||||
|
|
||||||
|
"""
|
||||||
|
The WikiPageId: 10068850 has not a MovieId
|
||||||
|
The WikiPageId: 55069615 has not a MovieId
|
||||||
|
The WikiPageId: 49510056 has not a MovieId
|
||||||
|
The WikiPageId: 4049786 has not a MovieId
|
||||||
|
The WikiPageId: 55510238 has not a MovieId
|
||||||
|
The WikiPageId: 31239628 has not a MovieId
|
||||||
|
The WikiPageId: 34757217 has not a MovieId
|
||||||
|
The WikiPageId: 64311757 has not a MovieId
|
||||||
|
The WikiPageId: 8326198 has not a MovieId
|
||||||
|
The WikiPageId: 42162164 has not a MovieId
|
||||||
|
The WikiPageId: 18502369 has not a MovieId
|
||||||
|
The WikiPageId: 58092358 has not a MovieId
|
||||||
|
The WikiPageId: 40710250 has not a MovieId
|
||||||
|
"""
|
||||||
0
Scripts/Experiments/.gitkeep
Normal file
0
Scripts/Experiments/.gitkeep
Normal file
0
Scripts/Experiments/Queries/.gitkeep
Normal file
0
Scripts/Experiments/Queries/.gitkeep
Normal file
0
Scripts/Experiments/Tmp/.gitkeep
Normal file
0
Scripts/Experiments/Tmp/.gitkeep
Normal file
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
0
Scripts/Libs/CleaningPipeline/.gitkeep
Normal file
22
Scripts/Libs/CleaningPipeline/special_token.py
Normal file
22
Scripts/Libs/CleaningPipeline/special_token.py
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
class SpecialToken(str, Enum):
    """Special vocabulary tokens that structure the RDF/text training corpus.

    Subclassing ``str`` first lets each member be used directly wherever a
    plain string is expected (concatenation, joins, tokenizer vocab lists).
    """
    # (Enum, str) -> throws an error
    # --- corpus / triple structure markers ---
    START_TRIPLE_LIST = "<SOTL>"
    START_TRIPLE = "<SOT>"
    END_TRIPLE = "<EOT>"
    SUBJECT = "<SUBJ>"
    RELATIONSHIP = "<PRED>"
    OBJECT = "<OBJ>"
    ABSTRACT = "<ABS>"
    END_OF_SENTENCE = "<EOS>"
    CORPUS_END = "<END>"

    ## Tasks' Token
    # --- task-selector tokens prepended to a training example ---
    RDF_TO_TEXT = "<RDF2TXT>"
    TEXT_TO_RDF = "<TEXT2RDF>"
    CONTINUE_RDF = "<CONTINUERDF>"
    MASK = "<MASK>"
|
||||||
|
|
||||||
|
#BPE Training:
|
||||||
|
|
||||||
149
Scripts/Libs/CleaningPipeline/sql_endpoint.py
Normal file
149
Scripts/Libs/CleaningPipeline/sql_endpoint.py
Normal file
@ -0,0 +1,149 @@
|
|||||||
|
#######################################################
|
||||||
|
# This file stand as endpoint to interact with DB #
|
||||||
|
#######################################################
|
||||||
|
|
||||||
|
# import sqlite3
|
||||||
|
import pandas as pd
|
||||||
|
from sqlalchemy import create_engine
|
||||||
|
from Scripts.Libs.CleaningPipeline.special_token import SpecialToken
|
||||||
|
|
||||||
|
|
||||||
|
class SqlEndpoint():
    """Read-side endpoint for the movie datawarehouse (SQLite).

    Wraps an SQLAlchemy engine and exposes the dataset as pandas DataFrames,
    either whole, chunked by row count, or chunked per movie.
    """

    def __init__(self, DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db", chunk_size_row = 500):
        # self.CONN = sqlite3.connect(DB_PATH) # DEPRECATED
        # sqlite URL: /// (3 slashes) -> relative path, //// (4 slashes) -> absolute
        self.sql_engine = create_engine(f"sqlite:///{DB_PATH}")
        # SQLite does not support server-side streamed cursors, so chunking is
        # done client-side through pandas' `chunksize` parameter instead.
        # (PRAGMA tuning helps writes more than reads, so none is applied.)
        self.chunk_size_row = chunk_size_row  # not used now, since each chunk is a movie
        # Movie id column, materialized once; drives
        # get_abbreviated_dataset_by_movie_id(). (The redundant
        # `movie_ids = ...` double assignment was removed.)
        self.movie_ids = pd.read_sql_query("SELECT MovieID FROM Movies;", self.sql_engine)["MovieID"]

    def get_RDF(self) -> pd.DataFrame:
        """Return every RDF triple joined with its raw (un-abbreviated) URIs.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI
        """
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI
        FROM RDFs
        INNER JOIN Subjects USING (SubjectID)
        INNER JOIN Relationships USING (RelationshipID)
        INNER JOIN Objects USING (ObjectID);
        """
        # BUG FIX: this previously queried self.CONN, which is never created
        # (the sqlite3 connection in __init__ is commented out) and therefore
        # raised AttributeError on every call; the engine is the live handle.
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_chunked_abbreviated_dataset(self) -> pd.DataFrame:
        """Stream the abbreviated dataset in fixed-size row chunks.

        Returns:
            pd.DataFrame: MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
            (an iterator of frames, one per `chunk_size_row` rows, because
            `chunksize` is passed to pandas)
        """
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_chunked_abbreviated_dataset_with_start_token(self) -> pd.DataFrame:
        # DEPRECATED !
        # NOTE(review): this method was already broken when deprecated —
        # SpecialToken is an Enum and cannot be instantiated bare, and the
        # three `?` placeholders below are never bound (no `params=`).
        # Kept verbatim for reference; do not call.
        start_token = SpecialToken()
        QUERY = """
        SELECT
            MovieID,
            ? || SubjectURI AS SubjectURI,
            ? || RelationshipURI AS RelationshipURI,
            ? || ObjectURI AS ObjectURI,
            Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID);
        """
        return pd.read_sql_query(QUERY, self.sql_engine, chunksize=self.chunk_size_row)

    def get_abbreviated_dataset_by_movie_id(self):  # -> iter[pd.DataFrame]:
        """Yield one DataFrame per movie (all of that movie's dataset rows).

        The retrieved RDFs are already abbreviated by the sql parser.
        Chunking by MovieID keeps each movie's abstract together so
        per-movie logic can run on a complete frame.

        Yields:
            pd.DataFrame: [MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract]
        """
        # Hand-picked sample ids, kept for quick experiments:
        #   The Dark Knight: 117248, Inception: 147074, The Avengers: 113621,
        #   Cast Away: 1123, The Departed: 117586, American Psycho: 90177,
        #   Avatar: 71587, Django Unchained: 138952, Spirited Away: 144137,
        #   Knives Out: 148025
        # movie_list = [117248, 147074, 113621, 1123, 117586, 90177, 71587, 138952, 144137, 148025]
        # movie_ids = movie_list
        QUERY = """
        SELECT MovieID, SubjectURI, RelationshipURI, ObjectURI, Abstract
        FROM RDFs
        INNER JOIN ParsedSubjects USING (SubjectID)
        INNER JOIN ParsedRelationships USING (RelationshipID)
        INNER JOIN ParsedObjects USING (ObjectID)
        INNER JOIN WikipediaAbstracts USING (MovieID)
        WHERE MovieID = (?);
        """
        for movie_id in self.movie_ids:
            yield pd.read_sql_query(QUERY, self.sql_engine, params=(movie_id,))

    def get_movies_id_count(self) -> pd.DataFrame:
        """Gets the count of each Movie in the Dataset.

        Returns:
            pd.DataFrame: [MovieID, Count]
        """
        QUERY = """
        SELECT MovieID, COUNT(*) AS Count
        FROM RDFs
        GROUP BY MovieID;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_relationship_count(self) -> pd.DataFrame:
        """Gets the count of each Relationship in the Dataset.

        Returns:
            pd.DataFrame: [RelationshipURI, Count]
        """
        QUERY = """
        SELECT RelationshipURI, COUNT(*) AS Count
        FROM RDFs
        INNER JOIN ParsedRelationships USING (RelationshipID)
        GROUP BY RelationshipURI;
        """
        return pd.read_sql_query(QUERY, self.sql_engine)

    def get_dataframe_from_query(self, query: str, params=None):
        """Run an arbitrary SELECT and return the result as a DataFrame.

        Args:
            query: SQL text, optionally with bound-parameter placeholders.
            params: parameter sequence/dict forwarded to pandas, or None.
        """
        if params is None:
            return pd.read_sql_query(query, self.sql_engine)
        return pd.read_sql_query(query, self.sql_engine, params=params)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__" :
    # Smoke test: stream the abbreviated dataset movie-by-movie and print
    # each per-movie frame.
    sql_endpoint = SqlEndpoint()
    for pandas_row in sql_endpoint.get_abbreviated_dataset_by_movie_id():
        print(pandas_row)
    # sql_endpoint.get_RDF()
    print("done")
|
||||||
0
Scripts/Libs/Utils/.gitkeep
Normal file
0
Scripts/Libs/Utils/.gitkeep
Normal file
9
Scripts/Libs/Utils/dataframe_interaction.py
Normal file
9
Scripts/Libs/Utils/dataframe_interaction.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_raw_from_dataframe(DF: pd.DataFrame) -> str:
    """Concatenate every cell of *DF* into one flat string, row by row.

    Args:
        DF: frame whose cells are stringified with ``str()``.

    Returns:
        All cell values joined with no separator, in row-major order;
        "" for an empty frame.
    """
    # PERF FIX: a single str.join is O(n) in total output length; the
    # original ``output +=`` loop re-allocated the accumulator per row
    # (quadratic worst case).
    return "".join(
        str(cell)
        for row in DF.itertuples(index=False, name=None)
        for cell in row
    )
|
||||||
897
Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
Normal file
897
Scripts/UML/CleaningPipeline/bpe-pipeline.excalidraw.json
Normal file
@ -0,0 +1,897 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "3zbCui3XtIGozHXTVAGRp",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 316.5,
|
||||||
|
"y": 123,
|
||||||
|
"width": 436.5,
|
||||||
|
"height": 145.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a0",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1698427950,
|
||||||
|
"version": 35,
|
||||||
|
"versionNonce": 601575602,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "wD66RDbG05HfvRhAtMb0J",
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gus_rxauKJ6T2L_F59PfN",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818588814,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "wD66RDbG05HfvRhAtMb0J",
|
||||||
|
"type": "text",
|
||||||
|
"x": 480.98004150390625,
|
||||||
|
"y": 183.25,
|
||||||
|
"width": 107.5399169921875,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a1",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 910769774,
|
||||||
|
"version": 31,
|
||||||
|
"versionNonce": 1120989938,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818416720,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "dataset.db",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "3zbCui3XtIGozHXTVAGRp",
|
||||||
|
"originalText": "dataset.db",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "87-MeaiZGT1wln0nggYPZ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 339.5,
|
||||||
|
"y": 309.5,
|
||||||
|
"width": 392,
|
||||||
|
"height": 156,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a2",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 655550318,
|
||||||
|
"version": 77,
|
||||||
|
"versionNonce": 1103939826,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818339000,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "EjUxEhZqEBzwvlw0VE9eJ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 355.5,
|
||||||
|
"y": 327,
|
||||||
|
"width": 162,
|
||||||
|
"height": 125.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1739846638,
|
||||||
|
"version": 64,
|
||||||
|
"versionNonce": 1594290034,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "ogRkV0neHrhEKTE6zlggl"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818391415,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ogRkV0neHrhEKTE6zlggl",
|
||||||
|
"type": "text",
|
||||||
|
"x": 378.7100524902344,
|
||||||
|
"y": 377.25,
|
||||||
|
"width": 115.57989501953125,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3V",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2037675630,
|
||||||
|
"version": 12,
|
||||||
|
"versionNonce": 1286472046,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818399222,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "RDF_String",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "EjUxEhZqEBzwvlw0VE9eJ",
|
||||||
|
"originalText": "RDF_String",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 542.5,
|
||||||
|
"y": 327,
|
||||||
|
"width": 173,
|
||||||
|
"height": 125.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a4",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1189796530,
|
||||||
|
"version": 99,
|
||||||
|
"versionNonce": 1071057006,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "rsapATFAT5YSBCXzLupgZ"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gus_rxauKJ6T2L_F59PfN",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Wk1bJbbtC31FqObEL5xWt",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818593647,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "rsapATFAT5YSBCXzLupgZ",
|
||||||
|
"type": "text",
|
||||||
|
"x": 585.6800384521484,
|
||||||
|
"y": 377.25,
|
||||||
|
"width": 86.63992309570312,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a5",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 829619694,
|
||||||
|
"version": 12,
|
||||||
|
"versionNonce": 713902318,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818405150,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Abstract",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"originalText": "Abstract",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 316.5,
|
||||||
|
"y": 511,
|
||||||
|
"width": 436.5,
|
||||||
|
"height": 145.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a6",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 492582894,
|
||||||
|
"version": 132,
|
||||||
|
"versionNonce": 893797614,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "6E23g-rgowNqHsBxX-LuM"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hyFKqXwet_F79QM71atgI",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "x_DP1FcQ7jraGz0gBuDi3",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1IGbCps2EHnzKgJUWM5nq",
|
||||||
|
"type": "arrow"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Wk1bJbbtC31FqObEL5xWt",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818593647,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "6E23g-rgowNqHsBxX-LuM",
|
||||||
|
"type": "text",
|
||||||
|
"x": 499.9100341796875,
|
||||||
|
"y": 571.25,
|
||||||
|
"width": 69.679931640625,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a7",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 267696178,
|
||||||
|
"version": 132,
|
||||||
|
"versionNonce": 1668243186,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818543211,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Pandas",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"originalText": "Pandas",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "ohj18N4AOTDz5lJNcV9gi",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 261,
|
||||||
|
"y": 765.5,
|
||||||
|
"width": 157,
|
||||||
|
"height": 87,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a8",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1446207150,
|
||||||
|
"version": 279,
|
||||||
|
"versionNonce": 317375026,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hyFKqXwet_F79QM71atgI",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Ea1_ke2wA0D8ZjVOUtvfY",
|
||||||
|
"type": "text",
|
||||||
|
"x": 297.0800323486328,
|
||||||
|
"y": 796.5,
|
||||||
|
"width": 84.83993530273438,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a9",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 435116270,
|
||||||
|
"version": 199,
|
||||||
|
"versionNonce": 1282911218,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "train.txt",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "ohj18N4AOTDz5lJNcV9gi",
|
||||||
|
"originalText": "train.txt",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "A4Y54Y26fe257U_QU9lxX",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 464,
|
||||||
|
"y": 765.5,
|
||||||
|
"width": 157,
|
||||||
|
"height": 87,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aA",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 186148850,
|
||||||
|
"version": 232,
|
||||||
|
"versionNonce": 997119858,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"id": "v4TvUlDEjH7EvPDmtbOn2",
|
||||||
|
"type": "text"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1IGbCps2EHnzKgJUWM5nq",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "v4TvUlDEjH7EvPDmtbOn2",
|
||||||
|
"type": "text",
|
||||||
|
"x": 476.3500442504883,
|
||||||
|
"y": 796.5,
|
||||||
|
"width": 132.29991149902344,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1131059634,
|
||||||
|
"version": 171,
|
||||||
|
"versionNonce": 239540530,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "validation.txt",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "A4Y54Y26fe257U_QU9lxX",
|
||||||
|
"originalText": "validation.txt",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "mPaYpJ9Xn7tlJPmKPqJKJ",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 674.5,
|
||||||
|
"y": 765.5,
|
||||||
|
"width": 157,
|
||||||
|
"height": 87,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aC",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 1049323314,
|
||||||
|
"version": 235,
|
||||||
|
"versionNonce": 330560690,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"id": "kg9nm2rpud6cax5aNPSnu"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "x_DP1FcQ7jraGz0gBuDi3",
|
||||||
|
"type": "arrow"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "kg9nm2rpud6cax5aNPSnu",
|
||||||
|
"type": "text",
|
||||||
|
"x": 711.4300231933594,
|
||||||
|
"y": 796.5,
|
||||||
|
"width": 83.13995361328125,
|
||||||
|
"height": 25,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aD",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 522572142,
|
||||||
|
"version": 193,
|
||||||
|
"versionNonce": 1920372338,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "test.txt",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "middle",
|
||||||
|
"containerId": "mPaYpJ9Xn7tlJPmKPqJKJ",
|
||||||
|
"originalText": "test.txt",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "hyFKqXwet_F79QM71atgI",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 534.65,
|
||||||
|
"y": 661.5,
|
||||||
|
"width": 195.25,
|
||||||
|
"height": 99,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aG",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 873266098,
|
||||||
|
"version": 71,
|
||||||
|
"versionNonce": 541154738,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-195.25,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-195.25,
|
||||||
|
99
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.49977090492554405,
|
||||||
|
1.034364261168385
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "ohj18N4AOTDz5lJNcV9gi",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.4993630573248406,
|
||||||
|
-0.05747126436781609
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": true,
|
||||||
|
"fixedSegments": null,
|
||||||
|
"startIsSpecial": null,
|
||||||
|
"endIsSpecial": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "x_DP1FcQ7jraGz0gBuDi3",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 534.65,
|
||||||
|
"y": 661.5,
|
||||||
|
"width": 218.25,
|
||||||
|
"height": 99,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1210817582,
|
||||||
|
"version": 77,
|
||||||
|
"versionNonce": 1483392370,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818580594,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
218.25,
|
||||||
|
49.5
|
||||||
|
],
|
||||||
|
[
|
||||||
|
218.25,
|
||||||
|
99
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.49977090492554405,
|
||||||
|
1.034364261168385
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "mPaYpJ9Xn7tlJPmKPqJKJ",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.4993630573248406,
|
||||||
|
-0.05747126436781609
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": true,
|
||||||
|
"fixedSegments": null,
|
||||||
|
"startIsSpecial": null,
|
||||||
|
"endIsSpecial": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "1IGbCps2EHnzKgJUWM5nq",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 534.65,
|
||||||
|
"y": 661.5,
|
||||||
|
"width": 0.5719232650604908,
|
||||||
|
"height": 99.07394122590165,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aK",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1205316658,
|
||||||
|
"version": 96,
|
||||||
|
"versionNonce": 1748050674,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818570993,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-0.5719232650604908,
|
||||||
|
99.07394122590165
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.49977090492554405,
|
||||||
|
1.034364261168385
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "A4Y54Y26fe257U_QU9lxX",
|
||||||
|
"fixedPoint": [
|
||||||
|
0.44635717665566554,
|
||||||
|
-0.056621365219521276
|
||||||
|
],
|
||||||
|
"focus": 0,
|
||||||
|
"gap": 0
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": true,
|
||||||
|
"fixedSegments": null,
|
||||||
|
"startIsSpecial": null,
|
||||||
|
"endIsSpecial": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "gus_rxauKJ6T2L_F59PfN",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 539,
|
||||||
|
"y": 271.5,
|
||||||
|
"width": 0,
|
||||||
|
"height": 33.5,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 763990258,
|
||||||
|
"version": 17,
|
||||||
|
"versionNonce": 1028811378,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818588814,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
33.5
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "3zbCui3XtIGozHXTVAGRp",
|
||||||
|
"focus": -0.019473081328751418,
|
||||||
|
"gap": 3
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"focus": -1.0404624277456647,
|
||||||
|
"gap": 30.7545797799829
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Wk1bJbbtC31FqObEL5xWt",
|
||||||
|
"type": "arrow",
|
||||||
|
"x": 536.5,
|
||||||
|
"y": 468.5,
|
||||||
|
"width": 0,
|
||||||
|
"height": 39,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1489771054,
|
||||||
|
"version": 33,
|
||||||
|
"versionNonce": 1828178606,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758818593647,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
39
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startBinding": {
|
||||||
|
"elementId": "hoIRMNiMJZl4YDo-hovWy",
|
||||||
|
"focus": 1.0693641618497107,
|
||||||
|
"gap": 27.157190169432425
|
||||||
|
},
|
||||||
|
"endBinding": {
|
||||||
|
"elementId": "jSx8ApfhtRs_nk37VvDMb",
|
||||||
|
"focus": 0.008018327605956525,
|
||||||
|
"gap": 3.5
|
||||||
|
},
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": "triangle",
|
||||||
|
"elbowed": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
826
Scripts/UML/CleaningPipeline/classes.excalidraw.json
Normal file
@ -0,0 +1,826 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 4622,
|
||||||
|
"versionNonce": 1623045672,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "twu_PiAvEuQ4l1YYtZLET",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 289.8504963515835,
|
||||||
|
"y": 91.87474806402287,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.09201683999922,
|
||||||
|
"height": 99.49948667804088,
|
||||||
|
"seed": 1975340120,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.2542098813493443,
|
||||||
|
75.20117273657175
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.011896425679918422,
|
||||||
|
83.76249969444815
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.970409367559332,
|
||||||
|
87.46174320643391
|
||||||
|
],
|
||||||
|
[
|
||||||
|
17.75573317066317,
|
||||||
|
90.59250103325854
|
||||||
|
],
|
||||||
|
[
|
||||||
|
41.05683533152865,
|
||||||
|
91.56737225214069
|
||||||
|
],
|
||||||
|
[
|
||||||
|
63.319497586673116,
|
||||||
|
90.01084754868091
|
||||||
|
],
|
||||||
|
[
|
||||||
|
75.14781395923075,
|
||||||
|
86.28844687220405
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.81603792670788,
|
||||||
|
83.15042405259751
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.05033394391478,
|
||||||
|
76.25776215104557
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.86643881413028,
|
||||||
|
6.3089586511537865
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.45188016352971,
|
||||||
|
-0.2999144698665015
|
||||||
|
],
|
||||||
|
[
|
||||||
|
71.50179495549581,
|
||||||
|
-3.9936571317850627
|
||||||
|
],
|
||||||
|
[
|
||||||
|
61.077971898861186,
|
||||||
|
-6.132877429442784
|
||||||
|
],
|
||||||
|
[
|
||||||
|
37.32348754161154,
|
||||||
|
-7.932114425900202
|
||||||
|
],
|
||||||
|
[
|
||||||
|
18.278415656797975,
|
||||||
|
-6.859225353587373
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.2995959613238286,
|
||||||
|
-3.2201165291205287
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-0.04168289608444441,
|
||||||
|
-0.045185660461322996
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a1",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2327,
|
||||||
|
"versionNonce": 1593094440,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "hmJk4dH9VpOsfkrCTkhvh",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 290.3744257898585,
|
||||||
|
"y": 149.00103172175278,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 637665624,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a2",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2413,
|
||||||
|
"versionNonce": 311708712,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "X1ldVIXm4DfBal5N2Pwn9",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 289.3425684673547,
|
||||||
|
"y": 120.03697638652972,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 904402520,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a3",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 5410,
|
||||||
|
"versionNonce": 92833576,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "CFhp5ZxSVwHYzGUj4hEn1",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 288.28461948527263,
|
||||||
|
"y": 84.74247943834126,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 76.59753601865496,
|
||||||
|
"height": 15.49127539284798,
|
||||||
|
"seed": 1782811480,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [
|
||||||
|
"bxuMGTzXLn7H-uBCptINx"
|
||||||
|
],
|
||||||
|
"index": "a4",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 820,
|
||||||
|
"versionNonce": 608002600,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "B43R7rWwK2_vdiRHBSSPk",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 324.77660659049513,
|
||||||
|
"y": 109.21914711824485,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1298686040,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "a5",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1108,
|
||||||
|
"versionNonce": 1839127848,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "CkKMb9wkJfVk04T217zSs",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 325.12774837442873,
|
||||||
|
"y": 135.43576140530996,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 2133497176,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "a6",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 991,
|
||||||
|
"versionNonce": 588838952,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "SHJdKeQPkfpvzSoNH--3o",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 6.239590202363168,
|
||||||
|
"x": 325.77660659049513,
|
||||||
|
"y": 164.20448797661635,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 81668696,
|
||||||
|
"groupIds": [
|
||||||
|
"9PT4BXPfQ6UoCaB-T-h9A",
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "a7",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"version": 489,
|
||||||
|
"versionNonce": 2023207720,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "vUSyMBPup0jZ71CYXKyGb",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": 280.1846389770508,
|
||||||
|
"y": 185.79462957545917,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 95.63072204589844,
|
||||||
|
"height": 23.595161071904883,
|
||||||
|
"seed": 425140056,
|
||||||
|
"groupIds": [
|
||||||
|
"dp_TZJyYdyPIH1hOkAPlb"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"fontSize": 17.4778970902999,
|
||||||
|
"fontFamily": 1,
|
||||||
|
"text": "dataset.db",
|
||||||
|
"baseline": 16.595161071904883,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"index": "a8",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1758646548051,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "dataset.db",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.350000000000001
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "R7pU0VP6CFKCAwuvt0xsr",
|
||||||
|
"type": "text",
|
||||||
|
"x": 295.5,
|
||||||
|
"y": 342,
|
||||||
|
"width": 374,
|
||||||
|
"height": 225,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a9",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 705463336,
|
||||||
|
"version": 1130,
|
||||||
|
"versionNonce": 72522328,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648226024,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Extract(Action):\n # Static\n + type : ActionTypes = Extract\n \n # Properties\n - db_connection: Path\n - query: str\n - query_parameters: [str]\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "G1xIRcJgm34_NMEWQFFlW",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1419.5,
|
||||||
|
"y": 110,
|
||||||
|
"width": 253,
|
||||||
|
"height": 75,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aA",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 651981400,
|
||||||
|
"version": 256,
|
||||||
|
"versionNonce": 138082856,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758646570344,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Pipeline\n - actions: [Action]\n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Pipeline\n - actions: [Action]\n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "TBVy3JbJCkbA9kjVEJ8lv",
|
||||||
|
"type": "text",
|
||||||
|
"x": 694,
|
||||||
|
"y": 100,
|
||||||
|
"width": 495,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aB",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 680960040,
|
||||||
|
"version": 560,
|
||||||
|
"versionNonce": 85012520,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649442239,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Action\n + type: ActionTypes\n + name: str\n + depends_on: [str]\n\n + execute(mem) -> [Dict<str, any>] | Void",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "an7KRTzWpCytKNKgHftKC",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1528.5,
|
||||||
|
"y": 365.5,
|
||||||
|
"width": 187,
|
||||||
|
"height": 150,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aC",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1974317656,
|
||||||
|
"version": 306,
|
||||||
|
"versionNonce": 1574962264,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648154009,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "enum ActionTypes:\n + Extract\n + Aggregate\n + Filter\n + Map\n + Dump",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "2pQ5EULirrWs_QZPbClhh",
|
||||||
|
"type": "text",
|
||||||
|
"x": 785,
|
||||||
|
"y": 332.5,
|
||||||
|
"width": 418,
|
||||||
|
"height": 375,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aH",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1402251560,
|
||||||
|
"version": 742,
|
||||||
|
"versionNonce": 680432168,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649532881,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Aggregate(Action):\n # Static\n + type: ActionTypes = Aggregate\n\n # Properties\n - actionIDs: [str]\n - associations: [Association]\n - output_mapper: [str]\n\n + execute(mem):\n tables = mem.gather(actionIDs)\n\n for join in association:\n \n ",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "O0fso8DJqFfwJEzmpUikM",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1289,
|
||||||
|
"y": 195,
|
||||||
|
"width": 594,
|
||||||
|
"height": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aI",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1582329944,
|
||||||
|
"version": 459,
|
||||||
|
"versionNonce": 1080077144,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758647067031,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "input_mapper: \n - key: ActionID (name) that produced such output\n - value: list of strings that represent the values\n to take",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "v0kzO6vlBWOdJCV3yoG69",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1379.5,
|
||||||
|
"y": 718.5,
|
||||||
|
"width": 286,
|
||||||
|
"height": 175,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aL",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1462407976,
|
||||||
|
"version": 635,
|
||||||
|
"versionNonce": 1012998696,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649495598,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Association:\n - from_actionID: str\n - from_key_name: str\n - from_value_name: str\n - to_actionID: str\n - to_value_name: str\n - type: Type",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "WK34n9xeVxntypCtrlK6p",
|
||||||
|
"type": "text",
|
||||||
|
"x": 256.5,
|
||||||
|
"y": 787.5,
|
||||||
|
"width": 517,
|
||||||
|
"height": 175,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aM",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1166526296,
|
||||||
|
"version": 318,
|
||||||
|
"versionNonce": 1042162520,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649002604,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Filter(Action):\n # Static\n + type: ActionTypes = Filter\n\n # Properties\n - compare: function(Dict<str, any>) -> bool\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "NY9jyUFLFFCNPE2sh00SX",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1639,
|
||||||
|
"y": 606.5,
|
||||||
|
"width": 407,
|
||||||
|
"height": 200,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aP",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 20345896,
|
||||||
|
"version": 168,
|
||||||
|
"versionNonce": 627282472,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758649426380,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Map(Action):\n # Static\n + type: ActionTypes = Map\n\n # Properties\n - compare_mapper: [str]\n - mapper: function(any...) -> any\n - output_mapper: [str]",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "SkhaoW-3TTKDZzEii3Lf6",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1457.5,
|
||||||
|
"y": 955.5,
|
||||||
|
"width": 121,
|
||||||
|
"height": 50,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aQ",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2071523672,
|
||||||
|
"version": 37,
|
||||||
|
"versionNonce": 105260376,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1758648834435,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "class Dump:\n -",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 8,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "class Dump:\n -",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
634
Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
Normal file
634
Scripts/UML/CleaningPipeline/cleaning-pipeline.excalidraw.json
Normal file
@ -0,0 +1,634 @@
|
|||||||
|
{
|
||||||
|
"type": "excalidraw",
|
||||||
|
"version": 2,
|
||||||
|
"source": "https://marketplace.visualstudio.com/items?itemName=pomdtr.excalidraw-editor",
|
||||||
|
"elements": [
|
||||||
|
{
|
||||||
|
"id": "JNB9z-PeqZ4s8KDfWaoXe",
|
||||||
|
"type": "rectangle",
|
||||||
|
"x": 106,
|
||||||
|
"y": 27,
|
||||||
|
"width": 653,
|
||||||
|
"height": 263,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a2",
|
||||||
|
"roundness": {
|
||||||
|
"type": 3
|
||||||
|
},
|
||||||
|
"seed": 710740889,
|
||||||
|
"version": 326,
|
||||||
|
"versionNonce": 1107631703,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759156408059,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "e13wNTgUpn2flMpmMttqx",
|
||||||
|
"type": "text",
|
||||||
|
"x": 200.5943407656526,
|
||||||
|
"y": 44.07937975075269,
|
||||||
|
"width": 307.2781467269385,
|
||||||
|
"height": 23.3097531902191,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a3",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1012740663,
|
||||||
|
"version": 444,
|
||||||
|
"versionNonce": 589551257,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759156408059,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Libs/CleaningPipeline/sql_endpoint",
|
||||||
|
"fontSize": 18.64780255217528,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Libs/CleaningPipeline/sql_endpoint",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "CgxCElJkKBtIHv-5WQrbo",
|
||||||
|
"type": "text",
|
||||||
|
"x": 195,
|
||||||
|
"y": 80.44259472749451,
|
||||||
|
"width": 403.64997665852184,
|
||||||
|
"height": 186.4780255217528,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "a4",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1261951799,
|
||||||
|
"version": 507,
|
||||||
|
"versionNonce": 1922906999,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759156408059,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
|
||||||
|
"fontSize": 18.64780255217528,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Class SqlEndpoint:\n - sql_engine\n + movie_ids: list[int]\n\n #\n + get_abbreviated_dataset_by_movie_id\n\n",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 4979,
|
||||||
|
"versionNonce": 1473849177,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "sYReMTdYblr-oJtYYJALU",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -67.14432426259049,
|
||||||
|
"y": 87.19293561900287,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.09201683999922,
|
||||||
|
"height": 99.49948667804088,
|
||||||
|
"seed": 1263944119,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.2542098813493443,
|
||||||
|
75.20117273657175
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0.011896425679918422,
|
||||||
|
83.76249969444815
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.970409367559332,
|
||||||
|
87.46174320643391
|
||||||
|
],
|
||||||
|
[
|
||||||
|
17.75573317066317,
|
||||||
|
90.59250103325854
|
||||||
|
],
|
||||||
|
[
|
||||||
|
41.05683533152865,
|
||||||
|
91.56737225214069
|
||||||
|
],
|
||||||
|
[
|
||||||
|
63.319497586673116,
|
||||||
|
90.01084754868091
|
||||||
|
],
|
||||||
|
[
|
||||||
|
75.14781395923075,
|
||||||
|
86.28844687220405
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.81603792670788,
|
||||||
|
83.15042405259751
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.05033394391478,
|
||||||
|
76.25776215104557
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.86643881413028,
|
||||||
|
6.3089586511537865
|
||||||
|
],
|
||||||
|
[
|
||||||
|
76.45188016352971,
|
||||||
|
-0.2999144698665015
|
||||||
|
],
|
||||||
|
[
|
||||||
|
71.50179495549581,
|
||||||
|
-3.9936571317850627
|
||||||
|
],
|
||||||
|
[
|
||||||
|
61.077971898861186,
|
||||||
|
-6.132877429442784
|
||||||
|
],
|
||||||
|
[
|
||||||
|
37.32348754161154,
|
||||||
|
-7.932114425900202
|
||||||
|
],
|
||||||
|
[
|
||||||
|
18.278415656797975,
|
||||||
|
-6.859225353587373
|
||||||
|
],
|
||||||
|
[
|
||||||
|
3.2995959613238286,
|
||||||
|
-3.2201165291205287
|
||||||
|
],
|
||||||
|
[
|
||||||
|
-0.04168289608444441,
|
||||||
|
-0.045185660461322996
|
||||||
|
],
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a6",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2684,
|
||||||
|
"versionNonce": 952947769,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "0S6dEWQVqKUVkP6Z5IX1l",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -66.6203948243155,
|
||||||
|
"y": 144.31921927673278,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 817033943,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a7",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "line",
|
||||||
|
"version": 2770,
|
||||||
|
"versionNonce": 477619481,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "szGLND7J0nVOvRkNXX9AS",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -67.65225214681931,
|
||||||
|
"y": 115.35516394150972,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 77.17198221193564,
|
||||||
|
"height": 8.562348957853036,
|
||||||
|
"seed": 1704755191,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "round",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"startBinding": null,
|
||||||
|
"endBinding": null,
|
||||||
|
"lastCommittedPoint": null,
|
||||||
|
"startArrowhead": null,
|
||||||
|
"endArrowhead": null,
|
||||||
|
"points": [
|
||||||
|
[
|
||||||
|
0,
|
||||||
|
0
|
||||||
|
],
|
||||||
|
[
|
||||||
|
2.033150371639873,
|
||||||
|
3.413095389435587
|
||||||
|
],
|
||||||
|
[
|
||||||
|
10.801287372573954,
|
||||||
|
6.276651055277943
|
||||||
|
],
|
||||||
|
[
|
||||||
|
22.468666942209353,
|
||||||
|
8.010803051612635
|
||||||
|
],
|
||||||
|
[
|
||||||
|
40.747074201802775,
|
||||||
|
8.168828515515864
|
||||||
|
],
|
||||||
|
[
|
||||||
|
62.077348233027564,
|
||||||
|
7.0647721921469495
|
||||||
|
],
|
||||||
|
[
|
||||||
|
74.53446931782398,
|
||||||
|
3.04824021069218
|
||||||
|
],
|
||||||
|
[
|
||||||
|
77.17198221193564,
|
||||||
|
-0.3935204423371723
|
||||||
|
]
|
||||||
|
],
|
||||||
|
"index": "a8",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": {
|
||||||
|
"type": 2
|
||||||
|
},
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 5767,
|
||||||
|
"versionNonce": 2119031289,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "O3t2uGktJlDd1_OX_bpV4",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -68.71020112890136,
|
||||||
|
"y": 80.06066699332126,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 76.59753601865496,
|
||||||
|
"height": 15.49127539284798,
|
||||||
|
"seed": 471296279,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [
|
||||||
|
"bxuMGTzXLn7H-uBCptINx"
|
||||||
|
],
|
||||||
|
"index": "a9",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1177,
|
||||||
|
"versionNonce": 525480665,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "_SzKlOBOvJgBg7FX0JTTM",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -32.218214023678854,
|
||||||
|
"y": 104.53733467322485,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1368927799,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "aA",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1465,
|
||||||
|
"versionNonce": 1410887609,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "oJMl2Kxa3SPaiAY0kxo7A",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -31.867072239745255,
|
||||||
|
"y": 130.75394896028996,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1627606871,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "aB",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "ellipse",
|
||||||
|
"version": 1348,
|
||||||
|
"versionNonce": 314839193,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "fB6pJBSMA-pRHrpgYKaLL",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 6.239590202363168,
|
||||||
|
"x": -31.218214023678854,
|
||||||
|
"y": 159.52267553159635,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#228be6",
|
||||||
|
"width": 11.226103154161754,
|
||||||
|
"height": 12.183758484455605,
|
||||||
|
"seed": 1420643447,
|
||||||
|
"groupIds": [
|
||||||
|
"9YkNe1yqnfZy9Z1JX2xr4",
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"index": "aC",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"version": 846,
|
||||||
|
"versionNonce": 1091081593,
|
||||||
|
"isDeleted": false,
|
||||||
|
"id": "9gZ3Yy1MeP9kEOTLODqLG",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 1,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"angle": 0,
|
||||||
|
"x": -76.81018163712321,
|
||||||
|
"y": 181.11281713043917,
|
||||||
|
"strokeColor": "#000000",
|
||||||
|
"backgroundColor": "#a5d8ff",
|
||||||
|
"width": 95.63072204589844,
|
||||||
|
"height": 23.595161071904883,
|
||||||
|
"seed": 2019206551,
|
||||||
|
"groupIds": [
|
||||||
|
"BDBCTrrhjbJynRAyuf3xJ"
|
||||||
|
],
|
||||||
|
"strokeSharpness": "sharp",
|
||||||
|
"boundElementIds": [],
|
||||||
|
"fontSize": 17.4778970902999,
|
||||||
|
"fontFamily": 1,
|
||||||
|
"text": "dataset.db",
|
||||||
|
"baseline": 16.595161071904883,
|
||||||
|
"textAlign": "center",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"index": "aD",
|
||||||
|
"frameId": null,
|
||||||
|
"roundness": null,
|
||||||
|
"boundElements": [],
|
||||||
|
"updated": 1759158252997,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "dataset.db",
|
||||||
|
"autoResize": true,
|
||||||
|
"lineHeight": 1.350000000000001
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "3eOw20xMhpB5jf_RMG24P",
|
||||||
|
"type": "text",
|
||||||
|
"x": 1131.3333333333335,
|
||||||
|
"y": 31.333333333333428,
|
||||||
|
"width": 508.3333333333333,
|
||||||
|
"height": 550,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aE",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 1535658041,
|
||||||
|
"version": 821,
|
||||||
|
"versionNonce": 1630266809,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759157181677,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Class PipelineApplier\n - movie_frequence_filter : pd.DataFrame()\n - rel_Frequence_Filter : pd.DataFrame()\n - rel_banned_list: list[str]\n\n + generate_movie_frequency_filter()\n + generate_rel_frequency_filter()\n + generate_list_relationship_filter()\n \n + filter_by_movie_frequency()\n + filter_by_relationship_frequency()\n + delete_relationship_by_list_filter()\n + delete_relationship_by_str()\n\n + drop_na() \n\n + rdf_add_special_token()\n + group_triple_by_movie()\n + build_by_movie()\n # static\n + build_triple()\n + build_incomplete_triple()",
|
||||||
|
"autoResize": false,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "Fbl1gpb5r7QrdRauGUWm2",
|
||||||
|
"type": "text",
|
||||||
|
"x": 158.23809523809535,
|
||||||
|
"y": 502.52380952380935,
|
||||||
|
"width": 484.2857142857143,
|
||||||
|
"height": 500,
|
||||||
|
"angle": 0,
|
||||||
|
"strokeColor": "#1e1e1e",
|
||||||
|
"backgroundColor": "transparent",
|
||||||
|
"fillStyle": "solid",
|
||||||
|
"strokeWidth": 2,
|
||||||
|
"strokeStyle": "solid",
|
||||||
|
"roughness": 1,
|
||||||
|
"opacity": 100,
|
||||||
|
"groupIds": [],
|
||||||
|
"frameId": null,
|
||||||
|
"index": "aF",
|
||||||
|
"roundness": null,
|
||||||
|
"seed": 2066618807,
|
||||||
|
"version": 552,
|
||||||
|
"versionNonce": 1269344823,
|
||||||
|
"isDeleted": false,
|
||||||
|
"boundElements": null,
|
||||||
|
"updated": 1759158199532,
|
||||||
|
"link": null,
|
||||||
|
"locked": false,
|
||||||
|
"text": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
|
||||||
|
"fontSize": 20,
|
||||||
|
"fontFamily": 5,
|
||||||
|
"textAlign": "left",
|
||||||
|
"verticalAlign": "top",
|
||||||
|
"containerId": null,
|
||||||
|
"originalText": "Class Pipeline\n - sql_endpoint: SqlEndpoint()\n\n - task_rdf_mask_file_handler:\n - task_bpe_corpus_file_handler:\n - task_rdf_text_file_handler:\n - task_rdf_completation_file_handler:\n\n - Filter_applier : PipelineApplier()\n\n #\n - get_cleaned_movie_rows()\n \n + execute_task_bpe_corpus()\n + execute_task_rdf_mask()\n + execute_task_rdf_text()\n + execute_task_rdf_completation()\n + execute_all_task()\n\n + use_toy_dataset()",
|
||||||
|
"autoResize": false,
|
||||||
|
"lineHeight": 1.25
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"appState": {
|
||||||
|
"gridSize": 20,
|
||||||
|
"gridStep": 5,
|
||||||
|
"gridModeEnabled": false,
|
||||||
|
"viewBackgroundColor": "#ffffff"
|
||||||
|
},
|
||||||
|
"files": {}
|
||||||
|
}
|
||||||
215
docs/DBPEDIA.md
Normal file
215
docs/DBPEDIA.md
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
# DBPedia
|
||||||
|
|
||||||
|
## GraphIRI
|
||||||
|
|
||||||
|
This is the graph identifier (URI):
|
||||||
|
|
||||||
|
`http://dbpedia.org`
|
||||||
|
|
||||||
|
## History of queries
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
{
|
||||||
|
SELECT ?object
|
||||||
|
WHERE {
|
||||||
|
?m rdf:type dbo:Film .
|
||||||
|
?object ?r ?m
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### 2 Hops
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
||||||
|
{
|
||||||
|
SELECT ?object
|
||||||
|
WHERE {
|
||||||
|
?m rdf:type dbo:Film .
|
||||||
|
?object ?r ?m
|
||||||
|
FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LIMIT 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1 Hop
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?object rdf:type dbo:Film .
|
||||||
|
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
||||||
|
}
|
||||||
|
LIMIT 1000000
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?subject
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
|
||||||
|
SELECT ?subject
|
||||||
|
WHERE {
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject
|
||||||
|
WHERE {
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?subject
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?subject
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink,
|
||||||
|
foaf:primaryTopic
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Wikipedia-movie
|
||||||
|
|
||||||
|
a.k.a. the file with the Wikipedia abstract
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject , ?object
|
||||||
|
WHERE {
|
||||||
|
?subject foaf:primaryTopic ?object .
|
||||||
|
?object rdf:type dbo:Film
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Reverse
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?object rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?object
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink,
|
||||||
|
foaf:primaryTopic
|
||||||
|
))
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
|
SELECT ?subject, ?relationship, ?object
|
||||||
|
WHERE {
|
||||||
|
?subject ?relationship ?object .
|
||||||
|
?object rdf:type dbo:Film .
|
||||||
|
?a foaf:primaryTopic ?object
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink,
|
||||||
|
foaf:primaryTopic
|
||||||
|
))
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Film / wiki page ID
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
|
||||||
|
|
||||||
|
SELECT ?subject ?pageID
|
||||||
|
WHERE {
|
||||||
|
?subject rdf:type dbo:Film .
|
||||||
|
?subject dbo:wikiPageID ?pageID .
|
||||||
|
?subject rdfs:label ?label .
|
||||||
|
FILTER (lang(?label) = "en")
|
||||||
|
}
|
||||||
|
|
||||||
|
```
|
||||||
3
docs/DEVELOPMENT.md
Normal file
3
docs/DEVELOPMENT.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# Development
|
||||||
|
|
||||||
|
## Data Gathering
|
||||||
108
docs/RESOURCES.md
Normal file
108
docs/RESOURCES.md
Normal file
@ -0,0 +1,108 @@
|
|||||||
|
# Resources
|
||||||
|
|
||||||
|
## Byte-Pair Encoding (BPE)
|
||||||
|
|
||||||
|
### Overview
|
||||||
|
|
||||||
|
Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
|
||||||
|
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Key Idea
|
||||||
|
|
||||||
|
BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
|
||||||
|
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Algorithm Steps
|
||||||
|
|
||||||
|
1. **Initialization**
|
||||||
|
- Treat each character of the input text as a token.
|
||||||
|
|
||||||
|
2. **Find Frequent Pairs**
|
||||||
|
- Count all adjacent token pairs in the sequence.
|
||||||
|
|
||||||
|
3. **Merge Most Frequent Pair**
|
||||||
|
- Replace the most frequent pair with a new symbol not used in the text.
|
||||||
|
|
||||||
|
4. **Repeat**
|
||||||
|
- Continue until no frequent pairs remain or a desired vocabulary size is reached.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Example
|
||||||
|
|
||||||
|
Suppose the data to be encoded is:
|
||||||
|
|
||||||
|
```text
|
||||||
|
aaabdaaabac
|
||||||
|
```
|
||||||
|
|
||||||
|
#### Step 1: Merge `"aa"`
|
||||||
|
|
||||||
|
Most frequent pair: `"aa"` → replace with `"Z"`
|
||||||
|
|
||||||
|
```text
|
||||||
|
ZabdZabac
|
||||||
|
Z = aa
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Step 2: Merge `"ab"`
|
||||||
|
|
||||||
|
Most frequent pair: `"ab"` → replace with `"Y"`
|
||||||
|
|
||||||
|
```text
|
||||||
|
ZYdZYac
|
||||||
|
Y = ab
|
||||||
|
Z = aa
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
#### Step 3: Merge `"ZY"`
|
||||||
|
|
||||||
|
Most frequent pair: `"ZY"` → replace with `"X"`
|
||||||
|
|
||||||
|
```text
|
||||||
|
XdXac
|
||||||
|
X = ZY
|
||||||
|
Y = ab
|
||||||
|
Z = aa
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
At this point, no pairs occur more than once, so the process stops.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Decompression
|
||||||
|
|
||||||
|
To recover the original data, replacements are applied in **reverse order**:
|
||||||
|
|
||||||
|
```text
|
||||||
|
XdXac
|
||||||
|
→ ZYdZYac
|
||||||
|
→ ZabdZabac
|
||||||
|
→ aaabdaaabac
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Advantages
|
||||||
|
|
||||||
|
- **Efficient vocabulary building**: reduces the need for massive word lists.
|
||||||
|
- **Handles rare words**: breaks them into meaningful subword units.
|
||||||
|
- **Balances character- and word-level tokenization**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Limitations
|
||||||
|
|
||||||
|
- Does not consider linguistic meaning—merges are frequency-based.
|
||||||
|
- May create tokens that are not linguistically natural.
|
||||||
|
- Vocabulary is fixed after training.
|
||||||
67
docs/SPARQL.md
Normal file
67
docs/SPARQL.md
Normal file
@ -0,0 +1,67 @@
|
|||||||
|
# SparQL
|
||||||
|
|
||||||
|
> [!NOTE]
|
||||||
|
> Resources taken from [this website](https://sparql.dev/)
|
||||||
|
|
||||||
|
## SPARQL Queries
|
||||||
|
|
||||||
|
### SELECT
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
SELECT ?var1 ?var2 ...
|
||||||
|
```
|
||||||
|
|
||||||
|
### WHERE
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
WHERE {
|
||||||
|
pattern1 .
|
||||||
|
pattern2 .
|
||||||
|
...
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### FILTER
|
||||||
|
|
||||||
|
It's used to restrict [`WHERE`](#where) clauses
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
WHERE {
|
||||||
|
?person <http://example.com/hasCar> ?car .
|
||||||
|
FILTER (?car = <http://example.com/Car1>)
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### OPTIONAL
|
||||||
|
|
||||||
|
It's used to fetch optional content if it exists
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
SELECT ?person ?car
|
||||||
|
WHERE {
|
||||||
|
?person <http://example.com/hasCar> ?car .
|
||||||
|
OPTIONAL {
|
||||||
|
?car <http://example.com/hasColor> ?color .
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### LIMIT
|
||||||
|
|
||||||
|
Limits the number of results returned
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
LIMIT 10 # Take only 10 results
|
||||||
|
```
|
||||||
|
|
||||||
|
## SparQL functions
|
||||||
|
|
||||||
|
### COUNT
|
||||||
|
|
||||||
|
```SQL
|
||||||
|
SELECT (COUNT(?person) AS ?count)
|
||||||
|
WHERE {
|
||||||
|
?person <http://example.com/hasCar> ?car .
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
BIN
environment.yaml
Normal file
BIN
environment.yaml
Normal file
Binary file not shown.
18
requirements.txt
Normal file
18
requirements.txt
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
certifi==2025.8.3
|
||||||
|
charset-normalizer==3.4.3
|
||||||
|
idna==3.10
|
||||||
|
numpy==2.3.3
|
||||||
|
pandas==2.3.2
|
||||||
|
pyparsing==3.2.4
|
||||||
|
python-dateutil==2.9.0.post0
|
||||||
|
pytz==2025.2
|
||||||
|
rdflib==7.1.4
|
||||||
|
requests==2.32.5
|
||||||
|
setuptools==78.1.1
|
||||||
|
six==1.17.0
|
||||||
|
SPARQLWrapper==2.0.0
|
||||||
|
tzdata==2025.2
|
||||||
|
urllib3==2.5.0
|
||||||
|
wheel==0.45.1
|
||||||
|
Wikipedia-API==0.8.1
|
||||||
|
SQLAlchemy
|
||||||
Loading…
x
Reference in New Issue
Block a user