WIP abbrevietion_datawarehouse to create an abbreviation system
This commit is contained in:
parent
3e30489f86
commit
47197194d5
105
Scripts/DataCleaning/abbrevietion_datawarehouse.py
Normal file
105
Scripts/DataCleaning/abbrevietion_datawarehouse.py
Normal file
@ -0,0 +1,105 @@
|
||||
import sqlite3
|
||||
import csv
|
||||
import pandas as pd
|
||||
|
||||
# Locations of the SQLite data warehouse and of the URI -> abbreviation CSV.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
CSV_MAPPER = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
# MAPPER_HANDLER = open(CSV_MAPPER,"r",newline='', encoding="utf-8")

# Load the mapping table once at import time and split it into two parallel
# lists: entry i of the key list is replaced by entry i of the value list.
mapper = pd.read_csv(CSV_MAPPER)
mapper_key_list, mapper_value_list = (
    mapper[column].to_list() for column in ("uri", "abbreviation")
)

# Module-wide connection and cursor used by the query helpers in this file.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
|
||||
|
||||
def insert_abbreviation(uri, abbreviation) -> bool:
    """Insert one (URI, abbreviation) pair into the ``Abbreviations`` table.

    The statement is executed on the module-level cursor ``CURS``; this
    function does not commit.

    Args:
        uri: Value stored in the ``URI`` column.
        abbreviation: Value stored in the ``Abbreviation`` column.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with an ``IntegrityError``.
    """
    statement = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
    try:
        CURS.execute(statement, (uri, abbreviation))
    except sqlite3.IntegrityError:
        # Constraint violation (presumably a duplicate URI -- TODO confirm
        # against the table schema).
        return False
    return True
|
||||
|
||||
def inserto_object_abbreviation(object_id, abbreviation_id) -> bool:
    """Link an object to an abbreviation in ``Objects_Abbreviations``.

    The statement is executed on the module-level cursor ``CURS``; this
    function does not commit.

    Args:
        object_id: Value stored in the ``ObjectID`` column.
        abbreviation_id: ID of the abbreviation to associate with the object.

    Returns:
        ``True`` when the link row was inserted, ``False`` when SQLite
        rejected it with an ``IntegrityError``.
    """
    # NOTE(review): the column is spelled "AbrreviationID" here, while the
    # sibling link tables use "AbbreviationID" -- confirm against the schema.
    statement = "INSERT INTO Objects_Abbreviations(ObjectID, AbrreviationID) VALUES (?,?);"
    inserted = True
    try:
        CURS.execute(statement, (object_id, abbreviation_id))
    except sqlite3.IntegrityError:
        inserted = False
    return inserted
|
||||
|
||||
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
    """Link a relationship to an abbreviation in ``Relationships_Abbreviations``.

    The statement is executed on the module-level cursor ``CURS``; this
    function does not commit.

    Args:
        relationship_id: Value stored in the ``RelationshipID`` column.
        abbreviation_id: Value stored in the ``AbbreviationID`` column.

    Returns:
        ``True`` when the link row was inserted, ``False`` when SQLite
        rejected it with an ``IntegrityError``.
    """
    values = (relationship_id, abbreviation_id)
    try:
        CURS.execute(
            "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);",
            values,
        )
    except sqlite3.IntegrityError:
        # The row violates a table constraint; report it instead of raising.
        return False
    else:
        return True
|
||||
|
||||
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
    """Link a subject to an abbreviation in ``Subjects_Abbreviations``.

    The statement is executed on the module-level cursor ``CURS``; this
    function does not commit.

    Args:
        subject_id: Value stored in the ``SubjectID`` column.
        abbreviation_id: Value stored in the ``AbbreviationID`` column.

    Returns:
        ``True`` when the link row was inserted, ``False`` when SQLite
        rejected it with an ``IntegrityError``.
    """
    insert_sql = "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
    link = (subject_id, abbreviation_id)
    try:
        CURS.execute(insert_sql, link)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||
|
||||
def select_abbreviation_id(uri) -> int | None:
    """Return the ``AbbreviationID`` stored for ``uri``, if any.

    The query runs on the module-level cursor ``CURS``.

    Args:
        uri: URI to look up in the ``Abbreviations`` table.

    Returns:
        The ``AbbreviationID`` of the first matching row, or ``None`` when no
        row has exactly this URI.
    """
    # Exact comparison on purpose: with LIKE, the "_" and "%" characters that
    # routinely appear in URIs act as wildcards (and ASCII letters match
    # case-insensitively), so a lookup could return the ID of another URI.
    query = "SELECT AbbreviationID FROM Abbreviations WHERE URI = ?;"
    CURS.execute(query, (uri,))
    row = CURS.fetchone()
    if row is None:
        return None

    # fetchone() yields a 1-tuple; the ID is its only element.
    return row[0]
|
||||
|
||||
def parser(element: pd.DataFrame | pd.Series | str):
    """Replace URIs with their abbreviation using the CSV mapping.

    The mapping comes from the module-level parallel lists
    ``mapper_key_list`` (URIs) and ``mapper_value_list`` (abbreviations).

    Args:
        element: A pandas ``DataFrame``/``Series`` whose values are replaced
            through ``replace``, or a single value (e.g. one URI string).

    Returns:
        For pandas input, the object returned by ``element.replace``. For a
        single value, the abbreviation mapped to it, or the value itself when
        it is not a key of the mapping.
    """
    # NOTE(review): matching is on the whole value (as in pandas' list-based
    # replace). If the CSV lists URI prefixes rather than complete URIs,
    # values will come back unchanged -- confirm the intended semantics.
    if isinstance(element, (pd.DataFrame, pd.Series)):
        return element.replace(mapper_key_list, mapper_value_list)

    # Scalars have no list-based replace (str.replace would raise TypeError),
    # so look the value up directly in the parallel lists.
    try:
        return mapper_value_list[mapper_key_list.index(element)]
    except ValueError:
        return element
|
||||
# # map by csv
|
||||
|
||||
|
||||
|
||||
def _link_abbreviations(frame, id_column, uri_column, link) -> None:
    """Store the abbreviation of every URI in ``frame`` and link it to its row ID."""
    # One lookup for the whole column instead of one per row; parser() is
    # handed a pandas Series, which supports list-based replace.
    abbreviations = parser(frame[uri_column])

    # tolist() yields plain Python values, which sqlite3 can bind directly.
    rows = zip(
        frame[id_column].tolist(),
        frame[uri_column].tolist(),
        abbreviations.tolist(),
    )
    for entity_id, uri, abbreviation in rows:
        # The result is ignored on purpose: whether the pair was just
        # inserted or rejected, its ID is looked up right after.
        insert_abbreviation(uri, abbreviation)
        abbreviation_id = select_abbreviation_id(uri)
        link(entity_id, abbreviation_id)


def populate():
    """Fill the abbreviation tables for subjects, objects and relationships.

    For each of the ``Subjects``, ``Objects`` and ``Relationships`` tables,
    every URI is abbreviated through ``parser``, the (URI, abbreviation) pair
    is inserted into ``Abbreviations``, and the row is linked to the
    abbreviation through the link table of its own kind. All statements run
    on the module-level connection ``CONN`` / cursor ``CURS``.
    """
    # (query, ID column, URI column, link function) for each entity kind.
    # Each kind uses its own link function so objects and relationships are
    # not written to the subjects link table.
    sources = (
        ('SELECT * FROM Subjects;', "SubjectID", "SubjectURI",
         insert_subject_abbreviation),
        ('SELECT * FROM Objects;', "ObjectID", "ObjectURI",
         inserto_object_abbreviation),
        ('SELECT * FROM Relationships;', "RelationshipID", "RelationshipURI",
         insert_relationship_abbreviation),
    )
    for query, id_column, uri_column, link in sources:
        frame = pd.read_sql_query(query, CONN)
        _link_abbreviations(frame, id_column, uri_column, link)
|
||||
|
||||
|
||||
# NOTE(review): nothing in the visible code calls populate() before this
# point -- confirm where it is meant to be invoked.
try:
    # Persist whatever was executed on the shared connection.
    CONN.commit()
finally:
    # Release the database handle even when the commit fails.
    CONN.close()
|
||||
|
||||
# MAPPER_HANDLER.close()
|
||||
Loading…
x
Reference in New Issue
Block a user