Merged abbreviation_datawarehouse into datawarehouse

This commit is contained in:
Christian Risi 2025-09-24 19:29:43 +02:00
parent 9a5d633b5e
commit 4315d70109
2 changed files with 334 additions and 181 deletions

View File

@ -1,105 +0,0 @@
import sqlite3
import csv
import pandas as pd
# Location of the SQLite data warehouse and of the URI -> abbreviation CSV.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
CSV_MAPPER = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
# MAPPER_HANDLER = open(CSV_MAPPER,"r",newline='', encoding="utf-8")
# The mapping CSV is loaded eagerly at import time; its "uri" and
# "abbreviation" columns are kept as two parallel lists (same index = same pair).
mapper = pd.read_csv(CSV_MAPPER)
mapper_key_list = mapper["uri"].to_list()
mapper_value_list = mapper["abbreviation"].to_list()
# One shared connection/cursor used by every helper below.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
def insert_abbreviation(uri, abbreviation) -> bool:
    """Store one URI/abbreviation pair in the ``Abbreviations`` table.

    The statement runs on the module-level ``CURS`` cursor.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError`` (a constraint rejected the row).
    """
    sql = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
    try:
        CURS.execute(sql, (uri, abbreviation))
    except sqlite3.IntegrityError:
        return False
    return True
def inserto_object_abbreviation(object_id, abbreviation_id) -> bool:
    """Link an object to an abbreviation in ``Objects_Abbreviations``.

    The statement runs on the module-level ``CURS`` cursor. The function
    name keeps its historical spelling ("inserto") so existing callers
    continue to work.

    Args:
        object_id: value for the ``ObjectID`` column.
        abbreviation_id: value for the ``AbbreviationID`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    # The column was previously misspelled "AbrreviationID"; the sibling
    # link tables all use "AbbreviationID".
    sql = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(sql, (object_id, abbreviation_id))
    except sqlite3.IntegrityError:
        return False
    return True
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
    """Link a relationship to an abbreviation in ``Relationships_Abbreviations``.

    The statement runs on the module-level ``CURS`` cursor.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
    params = (relationship_id, abbreviation_id)
    try:
        CURS.execute(sql, params)
    except sqlite3.IntegrityError:
        return False
    else:
        return True
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
    """Link a subject to an abbreviation in ``Subjects_Abbreviations``.

    The statement runs on the module-level ``CURS`` cursor.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(sql, (subject_id, abbreviation_id))
    except sqlite3.IntegrityError:
        return False
    return True
def select_abbreviation_id(uri) -> int | None:
    """Look up the ``AbbreviationID`` of the first row whose URI matches ``uri``.

    ``uri`` is bound to a SQL ``LIKE`` comparison, so any ``%`` or ``_`` it
    contains acts as a wildcard. The query runs on the module-level
    ``CURS`` cursor.

    Returns:
        The id of the first matching row, or ``None`` when nothing matches.
    """
    CURS.execute("SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;", (uri,))
    row = CURS.fetchone()
    # fetchone() yields a 1-tuple holding the id, or None when no row matched
    return row[0] if row else None
def parser(element: pd.DataFrame):
    """Swap every known URI found in ``element`` for its abbreviation.

    Delegates to ``element.replace`` using the module-level parallel lists
    ``mapper_key_list`` (values to look for) and ``mapper_value_list``
    (replacement at the same position) and returns whatever it returns.
    """
    return element.replace(to_replace=mapper_key_list, value=mapper_value_list)
# # map by csv
def populate():
    """Register an abbreviation for every subject, object and relationship.

    Reads the ``Subjects``, ``Objects`` and ``Relationships`` tables through
    ``CONN``, derives an abbreviation for each URI with ``parser`` and, for
    every row, stores the (URI, abbreviation) pair and links the entity to
    it in the matching ``*_Abbreviations`` table.

    Fixes over the previous version:
      * objects and relationships are linked through their own insert
        helpers (they used to be written with the subject helper);
      * ``parser`` is applied to a DataFrame column, as its signature
        expects, instead of to a single ``str`` (``str.replace`` does not
        accept lists and raised ``TypeError``);
      * rows whose abbreviation id cannot be resolved are skipped rather
        than linked to ``None``.

    Nothing is committed here; the caller owns the transaction.
    """
    targets = (
        ("Subjects", "SubjectURI", "SubjectID", insert_subject_abbreviation),
        ("Objects", "ObjectURI", "ObjectID", inserto_object_abbreviation),
        (
            "Relationships",
            "RelationshipURI",
            "RelationshipID",
            insert_relationship_abbreviation,
        ),
    )

    for table, uri_column, id_column, link in targets:
        frame = pd.read_sql_query(f"SELECT * FROM {table};", CONN)

        # Abbreviate the whole URI column at once; parser() works on
        # DataFrames, so hand it a one-column frame.
        frame["Abbreviation"] = parser(frame[[uri_column]])[uri_column]

        for _, row in frame.iterrows():
            uri = row[uri_column]

            insert_abbreviation(uri, row["Abbreviation"])
            abbreviation_id = select_abbreviation_id(uri)
            if abbreviation_id is None:
                # nothing to link to; avoid writing a NULL reference
                continue

            link(row[id_column], abbreviation_id)
# Persist whatever was written through CURS, then release the connection.
CONN.commit()
CONN.close()
# MAPPER_HANDLER.close()

View File

@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
# Paths of the CSV inputs consumed by the parse* functions below.
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
# Read-only UTF-8 handles, opened at import time and closed at the end of
# the script. newline="" leaves line-ending handling to the csv module.
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")
# NOTE(review): the five assignments below rebind handlers already opened
# above; they look like the pre-reformat version of the same lines shown by
# the diff view -- confirm only one set exists in the real file.
MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")
# One shared connection/cursor; most helpers receive CURS explicitly.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
@ -30,7 +33,8 @@ CURS = CONN.cursor()
# MARK: SQL Definitions
# Insert MovieURI
def insertOrigin(curs : sqlite3.Cursor ) -> bool:
def insertOrigin(curs: sqlite3.Cursor) -> bool:
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
try:
@ -39,6 +43,7 @@ def insertOrigin(curs : sqlite3.Cursor ) -> bool:
except sqlite3.IntegrityError:
return False
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
@ -51,11 +56,12 @@ def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
# in this case the real id is the first element of the tuple
return originId[0]
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    """Add one movie URI to the ``Movies`` table.

    Args:
        curs: cursor the INSERT is executed on.
        movieUri: value for the ``MovieURI`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(sql, (movieUri,))
    except sqlite3.IntegrityError:
        return False
    return True
@ -77,12 +83,13 @@ def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Associate a movie with a Wikipedia page id in ``WikiPageIDs``.

    Args:
        curs: cursor the INSERT is executed on.
        movieId: value for the ``MovieID`` column.
        pageId: value for the ``PageID`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
        curs.execute(sql, (movieId, pageId))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
@ -94,38 +101,43 @@ def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
# in this case the real id is the first element of the tuple
return movieId[0]
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store the Wikipedia abstract of a movie in ``WikipediaAbstracts``.

    Args:
        curs: cursor the INSERT is executed on.
        movieId: value for the ``MovieID`` column.
        abstract: value for the ``Abstract`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(sql, (movieId, abstract))
    except sqlite3.IntegrityError:
        return False
    return True
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Add a subject URI, tagged with its origin, to the ``Subjects`` table.

    Args:
        curs: cursor the INSERT is executed on.
        subjectURI: value for the ``SubjectURI`` column.
        originID: value for the ``OriginID`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    params = (subjectURI, originID)
    try:
        curs.execute(sql, params)
    except sqlite3.IntegrityError:
        return False
    return True
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Add a relationship (predicate) URI to the ``Relationships`` table.

    Args:
        curs: cursor the INSERT is executed on.
        relationshipURI: value for the ``RelationshipURI`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(sql, (relationshipURI,))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Add an object URI, tagged with its origin, to the objects table.

    Args:
        curs: cursor the INSERT is executed on.
        objectURI: value for the ``ObjectURI`` column.
        originID: value for the ``OriginID`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(sql, (objectURI, originID))
    except sqlite3.IntegrityError:
        return False
    return True
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
@ -138,6 +150,7 @@ def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
# in this case the real id is the first element of the tuple
return subjectId[0]
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
@ -150,6 +163,7 @@ def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | No
# in this case the real id is the first element of the tuple
return relationshipId[0]
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
@ -162,20 +176,71 @@ def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
# in this case the real id is the first element of the tuple
return objectId[0]
def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
) -> bool:
    """Record one RDF triple, attached to a movie, in the ``RDFs`` table.

    Args:
        curs: cursor the INSERT is executed on.
        movieId: value for the ``MovieID`` column.
        subjectId: value for the ``SubjectID`` column.
        relationshipId: value for the ``RelationshipID`` column.
        objectId: value for the ``ObjectID`` column.

    Returns:
        ``True`` when the INSERT executed, ``False`` when SQLite raised
        ``sqlite3.IntegrityError``.
    """
    sql = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    params = (movieId, subjectId, relationshipId, objectId)
    try:
        curs.execute(sql, params)
    except sqlite3.IntegrityError:
        return False
    return True
# UGLY: correct method to add cursor
def insert_abbreviation(uri, abbreviation) -> bool:
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
try:
CURS.execute(QUERY, [uri, abbreviation])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
try:
CURS.execute(QUERY, [object_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
try:
CURS.execute(QUERY, [relationship_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
QUERY = (
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
)
try:
CURS.execute(QUERY, [subject_id, abbreviation_id])
return True
except sqlite3.IntegrityError:
return False
# UGLY: correct method to add cursor
def select_abbreviation_id(uri) -> int | None:
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
CURS.execute(QUERY, [uri])
abbreviation_id = CURS.fetchone()
if not abbreviation_id:
return None
# in this case the real id is the first element of the tuple
return abbreviation_id[0]
# MARK: Parsing
def parseMovies():
@ -208,7 +273,6 @@ def parseAbstract():
ABSTRACT = row["text"]
MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
if MOVIE_ID is None:
print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
continue
@ -216,10 +280,24 @@ def parseAbstract():
insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
def parseAbbreviations():
    """Load every row of the URI-abbreviation CSV into ``Abbreviations``.

    Rows are read from ``URI_ABBR_CSV_HANDLER`` with ``csv.DictReader``; the
    "uri" and "abbreviation" columns of each row are passed to
    ``insert_abbreviation``, whose boolean result is ignored.
    """
    for record in csv.DictReader(URI_ABBR_CSV_HANDLER):
        insert_abbreviation(record["uri"], record["abbreviation"])
def parseRDF_Reverse():
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
if REVERSE_ORIGIN_ID is None:
return
total = 0
for row in REVERSE_CSV_READER:
@ -227,7 +305,7 @@ def parseRDF_Reverse():
RELATIONSHIP = row["relationship"]
OBJECT = row["object"]
print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
insertSubject(CURS,SUBJECT,REVERSE_ORIGIN_ID)
insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
insertRelationship(CURS, RELATIONSHIP)
insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
@ -236,7 +314,6 @@ def parseRDF_Reverse():
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
MOVIE_ID = selectMovieId(CURS, OBJECT)
skip = False
# guard
@ -259,17 +336,19 @@ def parseRDF_Reverse():
if skip:
continue
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
total += 1
print(total)
def parseRDF_Dataset():
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
if DATASET_ORIGIN_ID is None:
return
total = 0
rdf_idx = 0
@ -284,7 +363,7 @@ def parseRDF_Dataset():
if rdf_idx % 100000 == 0:
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
insertSubject(CURS,SUBJECT,DATASET_ORIGIN_ID)
insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
insertRelationship(CURS, RELATIONSHIP)
insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
@ -293,7 +372,6 @@ def parseRDF_Dataset():
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
MOVIE_ID = selectMovieId(CURS, SUBJECT)
skip = False
# guard
@ -316,31 +394,211 @@ def parseRDF_Dataset():
if skip:
continue
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
total += 1
print(total)
def _link_longest_prefix(sections, shortest, entity_id, insert_link) -> bool:
    """Link an entity to the abbreviation matching its longest URI prefix.

    Args:
        sections: the entity URI split on "/".
        shortest: smallest number of leading sections still tried as prefix.
        entity_id: id of the subject / relationship / object being linked.
        insert_link: ``insert_*_abbreviation`` helper that writes the link.

    Returns:
        ``True`` only when a prefix matched and the link row was inserted.
    """
    # URIs with four sections or fewer are never abbreviated.
    if len(sections) <= 4:
        return False

    # Try at most the first 7 sections, shrinking one section at a time.
    for size in range(min(len(sections), 7), shortest - 1, -1):
        pattern = "/".join(sections[:size]) + "%"
        abbreviation_id = select_abbreviation_id(pattern)
        if abbreviation_id is not None:
            # Stop at the longest match even if the link already exists, so
            # a repeated entity is never attached to a shorter prefix too.
            return insert_link(entity_id, abbreviation_id)

    return False


def _link_rows(reader, progress_every: int = 0) -> int:
    """Link the subject, relationship and object of each CSV row.

    Args:
        reader: iterable of rows with "subject", "relationship" and
            "object" keys (a ``csv.DictReader``).
        progress_every: when non-zero, print the current triple every that
            many rows.

    Returns:
        Number of link rows actually inserted.
    """
    total = 0

    for rdf_idx, row in enumerate(reader, start=1):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        if progress_every and rdf_idx % progress_every == 0:
            print(f"RDF number {rdf_idx}:\n\t{subject} - {relationship} - {obj}")

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id, then skip the row
        skip = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            skip = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            skip = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            skip = True
        if skip:
            continue

        # Relationships are tried down to 3 leading sections, subjects and
        # objects down to 4 (same bounds as the original loops).
        links = (
            (subject.split("/"), 4, subject_id, insert_subject_abbreviation),
            (relationship.split("/"), 3, relationship_id, insert_relationship_abbreviation),
            (obj.split("/"), 4, object_id, insert_object_abbreviation),
        )
        for sections, shortest, entity_id, insert_link in links:
            if _link_longest_prefix(sections, shortest, entity_id, insert_link):
                total += 1

    return total


def parseAbbr_Reverse():
    """Link every entity found in reverse.csv to its URI abbreviation.

    Does nothing when the 'reverse.csv' origin is not registered; otherwise
    prints the number of link rows inserted.
    """
    reader = csv.DictReader(REVERSE_CSV_HANDLER)
    if selectOrigin(CURS, "reverse.csv") is None:
        return

    print(_link_rows(reader))


def parseAbbr_Dataset():
    """Link every entity found in dataset.csv to its URI abbreviation.

    Does nothing when the 'dataset.csv' origin is not registered. Prints the
    current triple every 100000 rows and, at the end, the number of link
    rows inserted.
    """
    reader = csv.DictReader(DATASET_CSV_HANDLER)
    if selectOrigin(CURS, "dataset.csv") is None:
        return

    print(_link_rows(reader, progress_every=100000))
# MARK: Actual Code
# Pipeline stages. Each stage is enabled by uncommenting its call; in this
# revision only parseAbbr_Dataset() runs. The commented-out calls are kept
# in the order they are listed in the file.
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseAbbreviations()
# parseRDF_Reverse()
# parseRDF_Dataset()
# parseAbbr_Reverse()
parseAbbr_Dataset()
# Persist the work done through CURS, then release the database and every
# CSV handle opened at import time.
CONN.commit()
CONN.close()
MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
URI_ABBR_CSV_HANDLER.close()
"""