From ac1ed42c4991e5ce37d49c6b75d7b4c524ac5685 Mon Sep 17 00:00:00 2001 From: GassiGiuseppe Date: Mon, 22 Sep 2025 17:11:49 +0200 Subject: [PATCH] Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything and instead builds the dataset --- Scripts/DataCleaning/DBMerger.py | 28 ------------ Scripts/DatasetMerging/DBMerger.py | 45 +++++++++++++++++++ .../DataRetrivial.py | 0 .../SQL_Queries/db_creation.sql | 0 4 files changed, 45 insertions(+), 28 deletions(-) delete mode 100644 Scripts/DataCleaning/DBMerger.py create mode 100644 Scripts/DatasetMerging/DBMerger.py rename Scripts/{DataCleaning => DatasetMerging}/DataRetrivial.py (100%) rename Scripts/{DataCleaning => DatasetMerging}/SQL_Queries/db_creation.sql (100%) diff --git a/Scripts/DataCleaning/DBMerger.py b/Scripts/DataCleaning/DBMerger.py deleted file mode 100644 index 8eb703a..0000000 --- a/Scripts/DataCleaning/DBMerger.py +++ /dev/null @@ -1,28 +0,0 @@ -""" -What we have now: - -Wikipeda-summary : PageId / abstract -Movies : Movie URI -Dataset : Movie URI / Relationship / Object [RDF] -Movies-PageId : Movie URI / PageId (wiki) -Reverse : Subject / Relationship / Movie URI - -What we want: -( we will generate MovieID) -Movies : MovieID [PK] / Movie URI -WikiPageIDs : MovieID [PK, FK]/ PageId [IDX] (wiki) (Not important for now) -Abstracts : MovieID [PK, FK]/ abstract -Subjects : SubjectID [PK] / RDF Subject ( both from either Dataset.csv or Reverse.csv) / OriginID [FK] -Relationships : RelationshipID [PK]/ RDF Relationship (not the actual relationshi but the value) -Objects : ObjectID [PK]/ RDF Object / OriginID [FK] -Origins : OriginID [PK]/ Origin Name -RDFs : RDF_ID[PK] / MovieID [FK] / SubjectID [FK]/ RelationshipID [FK]/ ObjectID [FK] - -What we will build for the model - -we need RDF list for each movie together with abstract - -: MovieID / RDF_set / abstrct - -""" - diff --git a/Scripts/DatasetMerging/DBMerger.py b/Scripts/DatasetMerging/DBMerger.py new file mode 100644 index 
0000000..4ad3989 --- /dev/null +++ b/Scripts/DatasetMerging/DBMerger.py @@ -0,0 +1,45 @@ +""" +What we have now: Saved AS: + +Wikipedia-summary : PageId / abstract subject,text +Movies : Movie URI "subject" +Dataset : Movie URI / Relationship / Object [RDF] subject,relationship,object +Movies-PageId : Movie URI / PageId (wiki) "subject", "object" +Reverse : Subject / Relationship / Movie URI "subject","relationship","object" + +What we want: +( we will generate MovieID) +Movies : MovieID [PK] / Movie URI +WikiPageIDs : MovieID [PK, FK]/ PageId [IDX] (wiki) (Not important for now) +Abstracts : MovieID [PK, FK]/ abstract +Subjects : SubjectID [PK] / RDF Subject ( both from either Dataset.csv or Reverse.csv) / OriginID [FK] +Relationships : RelationshipID [PK]/ RDF Relationship (not the actual relationship but the value) +Objects : ObjectID [PK]/ RDF Object / OriginID [FK] +Origins : OriginID [PK]/ Origin Name +RDFs : RDF_ID[PK] / MovieID [FK] / SubjectID [FK]/ RelationshipID [FK]/ ObjectID [FK] + +What we will build for the model + +we need RDF list for each movie together with abstract + +: MovieID / RDF_set / abstract + +""" + +import sqlite3 + +# Create a SQL connection to our SQLite database +con = sqlite3.connect("data/portal_mammals.sqlite") + +cur = con.cursor() + +# Return all results of query (SQL string literals take single quotes; double quotes name identifiers) +cur.execute("SELECT plot_id FROM plots WHERE plot_type='Control'") +cur.fetchall() + +# Return first result of query (single-quoted literal, same reason as above) +cur.execute("SELECT species FROM species WHERE taxa='Bird'") +cur.fetchone() + +# Be sure to close the connection +con.close() \ No newline at end of file diff --git a/Scripts/DataCleaning/DataRetrivial.py b/Scripts/DatasetMerging/DataRetrivial.py similarity index 100% rename from Scripts/DataCleaning/DataRetrivial.py rename to Scripts/DatasetMerging/DataRetrivial.py diff --git a/Scripts/DataCleaning/SQL_Queries/db_creation.sql b/Scripts/DatasetMerging/SQL_Queries/db_creation.sql similarity index 100% rename from 
Scripts/DataCleaning/SQL_Queries/db_creation.sql rename to Scripts/DatasetMerging/SQL_Queries/db_creation.sql