From 47197194d5e6ca20a5bd1041d62f7aad72067cd6 Mon Sep 17 00:00:00 2001
From: GassiGiuseppe
Date: Wed, 24 Sep 2025 16:32:09 +0200
Subject: [PATCH] WIP abbrevietion_datawarehouse to create an abbreviation system

---
Reviewer note: the patch body below was revised during review, so the blob
hash on the "index" line no longer matches the added file.

 .../abbrevietion_datawarehouse.py             | 128 ++++++++++++++++++
 1 file changed, 128 insertions(+)
 create mode 100644 Scripts/DataCleaning/abbrevietion_datawarehouse.py

diff --git a/Scripts/DataCleaning/abbrevietion_datawarehouse.py b/Scripts/DataCleaning/abbrevietion_datawarehouse.py
new file mode 100644
index 0000000..bc88cd5
--- /dev/null
+++ b/Scripts/DataCleaning/abbrevietion_datawarehouse.py
@@ -0,0 +1,128 @@
+"""Populate the abbreviation tables of the data warehouse.
+
+Each URI found in Subjects, Objects and Relationships is stored in
+Abbreviations together with its abbreviation from the CSV mapper and then
+linked to its entity through the matching *_Abbreviations table.
+"""
+
+import sqlite3
+import csv
+import pandas as pd
+
+DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
+CSV_MAPPER = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
+mapper = pd.read_csv(CSV_MAPPER)
+mapper_key_list = mapper["uri"].to_list()
+mapper_value_list = mapper["abbreviation"].to_list()
+# Same mapping as a dict, used to look up a single URI.
+_URI_TO_ABBREVIATION = dict(zip(mapper_key_list, mapper_value_list))
+
+CONN = sqlite3.connect(DB_NAME)
+CURS = CONN.cursor()
+
+
+def _try_insert(query, params) -> bool:
+    """Run one INSERT; return False if it raises sqlite3.IntegrityError."""
+    try:
+        CURS.execute(query, params)
+    except sqlite3.IntegrityError:
+        return False
+    return True
+
+
+def insert_abbreviation(uri, abbreviation) -> bool:
+    """Insert a URI/abbreviation pair into Abbreviations."""
+    QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
+    return _try_insert(QUERY, [uri, abbreviation])
+
+
+def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
+    """Link an object to an abbreviation in Objects_Abbreviations."""
+    # NOTE(review): this column was spelled "AbrreviationID"; it now matches
+    # the other two link tables. Confirm against the actual schema.
+    QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
+    return _try_insert(QUERY, [object_id, abbreviation_id])
+
+
+# Original, misspelled name kept as an alias for existing callers.
+inserto_object_abbreviation = insert_object_abbreviation
+
+
+def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
+    """Link a relationship to an abbreviation in Relationships_Abbreviations."""
+    QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
+    return _try_insert(QUERY, [relationship_id, abbreviation_id])
+
+
+def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
+    """Link a subject to an abbreviation in Subjects_Abbreviations."""
+    QUERY = "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
+    return _try_insert(QUERY, [subject_id, abbreviation_id])
+
+
+def select_abbreviation_id(uri) -> int | None:
+    """Return the AbbreviationID stored for uri, or None if there is none."""
+    # "=" instead of LIKE: "_" and "%" are LIKE wildcards and can occur in
+    # URIs, so LIKE could return the id of a different URI.
+    QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI = ?;"
+    CURS.execute(QUERY, [uri])
+    row = CURS.fetchone()
+    # fetchone() gives a 1-tuple holding the id, or None without a match
+    return row[0] if row else None
+
+
+def parser(element):
+    """Replace mapped URIs with their abbreviation; keep other values as is.
+
+    A pandas DataFrame/Series goes through its own replace(); any other
+    value (populate() passes one URI at a time) is looked up in the dict.
+    """
+    # NOTE(review): only whole values are matched. If the CSV lists URI
+    # prefixes rather than full URIs this needs prefix substitution - confirm.
+    if isinstance(element, (pd.DataFrame, pd.Series)):
+        return element.replace(mapper_key_list, mapper_value_list)
+    # str.replace() does not accept lists, hence the dict lookup
+    return _URI_TO_ABBREVIATION.get(element, element)
+
+
+def _populate_links(query, id_column, uri_column, insert_link) -> None:
+    """Abbreviate each URI returned by query and link it to its entity id."""
+    frame = pd.read_sql_query(query, CONN)
+    # tolist() yields plain Python scalars for sqlite3 to bind
+    pairs = zip(frame[id_column].tolist(), frame[uri_column].tolist())
+    for entity_id, uri in pairs:
+        insert_abbreviation(uri, parser(uri))
+        abbreviation_id = select_abbreviation_id(uri)
+        if abbreviation_id is None:
+            # the URI could not be stored, so there is nothing to link
+            continue
+        insert_link(entity_id, abbreviation_id)
+
+
+def populate():
+    """Fill Abbreviations and the three link tables from the stored URIs."""
+    _populate_links(
+        "SELECT * FROM Subjects;", "SubjectID", "SubjectURI",
+        insert_subject_abbreviation,
+    )
+    _populate_links(
+        "SELECT * FROM Objects;", "ObjectID", "ObjectURI",
+        insert_object_abbreviation,
+    )
+    _populate_links(
+        "SELECT * FROM Relationships;", "RelationshipID", "RelationshipURI",
+        insert_relationship_abbreviation,
+    )
+
+
+def main() -> None:
+    """Run populate(), commit on success and always close the connection."""
+    try:
+        populate()
+        CONN.commit()
+    finally:
+        CONN.close()
+
+
+if __name__ == "__main__":
+    main()