WIP abbrevietion_datawarehouse to create an abbreviation system
This commit is contained in:
parent
3e30489f86
commit
47197194d5
105
Scripts/DataCleaning/abbrevietion_datawarehouse.py
Normal file
105
Scripts/DataCleaning/abbrevietion_datawarehouse.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
import sqlite3
|
||||||
|
import csv
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Locations of the SQLite data warehouse and of the URI -> abbreviation CSV.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
CSV_MAPPER = "./Assets/Dataset/1-hop/uri-abbreviations.csv"

# Load the CSV once and keep its "uri" and "abbreviation" columns as two
# parallel lists (same position = same mapping entry).
mapper = pd.read_csv(CSV_MAPPER)
mapper_key_list = mapper["uri"].tolist()
mapper_value_list = mapper["abbreviation"].tolist()

# Single connection and cursor shared by the functions defined below.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
|
||||||
|
|
||||||
|
def insert_abbreviation(uri, abbreviation) -> bool:
    """Insert one URI/abbreviation pair into the ``Abbreviations`` table.

    Args:
        uri: Value bound to the ``URI`` column.
        abbreviation: Value bound to the ``Abbreviation`` column.

    Returns:
        ``True`` when the INSERT was executed, ``False`` when SQLite raised
        ``IntegrityError`` for it. Nothing is committed here.
    """
    statement = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
    try:
        CURS.execute(statement, (uri, abbreviation))
    except sqlite3.IntegrityError:
        # The table's constraints rejected the row: report it, do not raise.
        return False
    return True
|
||||||
|
|
||||||
|
def inserto_object_abbreviation(object_id, abbreviation_id) -> bool:
    """Link an object to an abbreviation in ``Objects_Abbreviations``.

    Args:
        object_id: Value bound to the ``ObjectID`` column.
        abbreviation_id: Value bound to the second column of the statement.

    Returns:
        ``True`` when the INSERT was executed, ``False`` when SQLite raised
        ``IntegrityError`` for it. Nothing is committed here.
    """
    # NOTE(review): the column is spelled "AbrreviationID" here while the
    # sibling link tables use "AbbreviationID" -- confirm against the schema.
    statement = "INSERT INTO Objects_Abbreviations(ObjectID, AbrreviationID) VALUES (?,?);"
    try:
        CURS.execute(statement, (object_id, abbreviation_id))
    except sqlite3.IntegrityError:
        # The table's constraints rejected the row: report it, do not raise.
        return False
    return True
|
||||||
|
|
||||||
|
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
    """Link a relationship to an abbreviation in ``Relationships_Abbreviations``.

    Args:
        relationship_id: Value bound to the ``RelationshipID`` column.
        abbreviation_id: Value bound to the ``AbbreviationID`` column.

    Returns:
        ``True`` when the INSERT was executed, ``False`` when SQLite raised
        ``IntegrityError`` for it. Nothing is committed here.
    """
    statement = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(statement, (relationship_id, abbreviation_id))
    except sqlite3.IntegrityError:
        # The table's constraints rejected the row: report it, do not raise.
        return False
    return True
|
||||||
|
|
||||||
|
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
    """Link a subject to an abbreviation in ``Subjects_Abbreviations``.

    Args:
        subject_id: Value bound to the ``SubjectID`` column.
        abbreviation_id: Value bound to the ``AbbreviationID`` column.

    Returns:
        ``True`` when the INSERT was executed, ``False`` when SQLite raised
        ``IntegrityError`` for it. Nothing is committed here.
    """
    statement = "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(statement, (subject_id, abbreviation_id))
    except sqlite3.IntegrityError:
        # The table's constraints rejected the row: report it, do not raise.
        return False
    return True
|
||||||
|
|
||||||
|
def select_abbreviation_id(uri) -> int | None:
    """Return the ``AbbreviationID`` stored in ``Abbreviations`` for ``uri``.

    Args:
        uri: URI to look up; it is compared for equality with the ``URI``
            column.

    Returns:
        The first column of the first matching row, or ``None`` when no row
        matches.
    """
    # Compare with "=" rather than LIKE: LIKE treats "_" and "%" inside the
    # URI as wildcards and ignores ASCII case, so it could hand back the id
    # that belongs to a different URI.
    query = "SELECT AbbreviationID FROM Abbreviations WHERE URI = ?;"
    CURS.execute(query, (uri,))
    row = CURS.fetchone()
    if row is None:
        return None

    # fetchone() yields a one-element row; the id is its first field.
    return row[0]
|
||||||
|
|
||||||
|
def parser(element: pd.DataFrame):
    """Return ``element`` with mapped URIs replaced by their abbreviations.

    Delegates to ``element.replace``: the values to look for come from the
    module-level ``mapper_key_list`` and their replacements, position by
    position, from ``mapper_value_list``.

    Args:
        element: pandas object whose values should be abbreviated.

    Returns:
        Whatever ``element.replace`` returns for those two lists.
    """
    return element.replace(to_replace=mapper_key_list, value=mapper_value_list)
|
||||||
|
# # map by csv
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _link_abbreviations(frame, uri_column, id_column, link) -> None:
    """Store the abbreviation of every row of ``frame`` and link it.

    Args:
        frame: DataFrame read from one entity table; it gains an
            ``Abbreviation`` column as a side effect.
        uri_column: Name of the column holding the entity URI.
        id_column: Name of the column holding the entity id.
        link: Function called as ``link(entity_id, abbreviation_id)`` to
            write the row of the matching link table.
    """
    # Abbreviate the whole URI column in one call. parser() relies on the
    # pandas ``replace`` API; handing it the plain ``str`` of a single row
    # raises TypeError because ``str.replace`` does not accept lists.
    frame["Abbreviation"] = parser(frame[uri_column])

    for _, row in frame.iterrows():
        uri = row[uri_column]
        # A False result (row rejected) is fine: the id is looked up next
        # regardless of whether this call inserted anything.
        insert_abbreviation(uri, row["Abbreviation"])
        abbreviation_id = select_abbreviation_id(uri)
        if abbreviation_id is None:
            # No Abbreviations row to point at, so there is nothing to link.
            continue
        link(row[id_column], abbreviation_id)


def populate():
    """Fill the abbreviation tables for subjects, objects and relationships.

    Reads the ``Subjects``, ``Objects`` and ``Relationships`` tables through
    ``CONN``, abbreviates each URI with ``parser``, stores the pair in
    ``Abbreviations`` and links the entity to it through the link table of
    its own kind. Rows are written through the module-level cursor; this
    function does not commit the transaction itself.
    """
    subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)
    objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)
    relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)

    # Each entity kind goes to its own link table (objects and relationships
    # were previously written to the subjects table by mistake).
    _link_abbreviations(
        subjects, "SubjectURI", "SubjectID", insert_subject_abbreviation
    )
    _link_abbreviations(
        objects, "ObjectURI", "ObjectID", inserto_object_abbreviation
    )
    _link_abbreviations(
        relationships,
        "RelationshipURI",
        "RelationshipID",
        insert_relationship_abbreviation,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): nothing visible in this file calls populate() before this
# point -- confirm that is intended for this work-in-progress script.
try:
    # Persist whatever was written through the shared cursor.
    CONN.commit()
finally:
    # Release the database handle even when the commit itself fails.
    CONN.close()
|
||||||
|
|
||||||
|
# MAPPER_HANDLER.close()
|
||||||
Loading…
x
Reference in New Issue
Block a user