Built datawarehouse.py, which populates the dataset
This commit is contained in:
parent
ac1ed42c49
commit
64f9b41378
373
Scripts/DatasetMerging/datawarehouse.py
Normal file
373
Scripts/DatasetMerging/datawarehouse.py
Normal file
@ -0,0 +1,373 @@
|
||||
import sqlite3
|
||||
import csv
|
||||
|
||||
#####################################################################
|
||||
# This file builds DatawareHouse/dataset.db from 1-hop csv files #
|
||||
# Its schema is in ./SQL_Queries/db_creation.sql #
|
||||
# The SQL query used to populate ids is in ./SQL_Queries/query.sql #
|
||||
#####################################################################
|
||||
|
||||
|
||||
# --- Global configuration ---
# Path of the SQLite database file this script writes to.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
# Paths of the 1-hop CSV inputs read by the parse* functions below.
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"

# The CSV inputs are opened as soon as this module runs and are shared, as
# module-level handles, by the parse* functions; they are closed explicitly at
# the bottom of the file. newline='' leaves line-ending handling to the csv
# module.
MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")

# Single connection and cursor used by every parse* function in this script.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
|
||||
|
||||
# MARK: SQL Definitions
|
||||
# Insert MovieURI
|
||||
|
||||
def insertOrigin(curs : sqlite3.Cursor ) -> bool:
    """Register the two source files ('dataset.csv', 'reverse.csv') in Origins.

    Args:
        curs: cursor on the target database.

    Returns:
        True when the INSERT ran, False when SQLite raised IntegrityError
        (presumably because the rows already exist; the schema is not shown
        here).
    """
    sql = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(sql)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||
|
||||
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
|
||||
|
||||
curs.execute(QUERY, [originName])
|
||||
originId = curs.fetchone()
|
||||
if not originId:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return originId[0]
|
||||
|
||||
def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool:
    """Add ``movieUri`` to the Movies table.

    Args:
        curs: cursor on the target database.
        movieUri: value stored in Movies.MovieURI.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(sql, (movieUri,))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
|
||||
|
||||
curs.execute(QUERY, [movieUri])
|
||||
movieId = curs.fetchone()
|
||||
if not movieId:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return movieId[0]
|
||||
|
||||
|
||||
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Store the (MovieID, PageID) pair in WikiPageIDs.

    Args:
        curs: cursor on the target database.
        movieId: value stored in WikiPageIDs.MovieID.
        pageId: value stored in WikiPageIDs.PageID.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    values = (movieId, pageId)
    try:
        curs.execute(sql, values)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||
|
||||
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
|
||||
|
||||
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
|
||||
|
||||
curs.execute(QUERY, [pageId])
|
||||
movieId = curs.fetchone()
|
||||
if not movieId:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return movieId[0]
|
||||
|
||||
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store ``abstract`` for ``movieId`` in WikipediaAbstracts.

    Args:
        curs: cursor on the target database.
        movieId: value stored in WikipediaAbstracts.MovieID.
        abstract: value stored in WikipediaAbstracts.Abstract.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(sql, (movieId, abstract))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
|
||||
|
||||
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Add ``subjectURI`` to Subjects, tagged with the origin it came from.

    Args:
        curs: cursor on the target database.
        subjectURI: value stored in Subjects.SubjectURI.
        originID: value stored in Subjects.OriginID.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    values = (subjectURI, originID)
    try:
        curs.execute(sql, values)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||
|
||||
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Add ``relationshipURI`` to the Relationships table.

    Args:
        curs: cursor on the target database.
        relationshipURI: value stored in Relationships.RelationshipURI.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(sql, (relationshipURI,))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||
|
||||
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Add ``objectURI`` to the objects table, tagged with its origin.

    Args:
        curs: cursor on the target database.
        objectURI: value stored in the ObjectURI column.
        originID: value stored in the OriginID column.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(sql, (objectURI, originID))
    except sqlite3.IntegrityError:
        return False
    else:
        return True
|
||||
|
||||
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
|
||||
|
||||
curs.execute(QUERY, [subjectURI])
|
||||
subjectId = curs.fetchone()
|
||||
if not subjectId:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return subjectId[0]
|
||||
|
||||
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
|
||||
|
||||
curs.execute(QUERY, [relationshipURI])
|
||||
relationshipId = curs.fetchone()
|
||||
if not relationshipId:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return relationshipId[0]
|
||||
|
||||
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
||||
|
||||
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
|
||||
|
||||
curs.execute(QUERY, [objectURI])
|
||||
objectId = curs.fetchone()
|
||||
if not objectId:
|
||||
return None
|
||||
|
||||
# in this case the real id is the first element of the tuple
|
||||
return objectId[0]
|
||||
|
||||
def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int
) -> bool:
    """Store one triple in RDFs as a row of previously resolved ids.

    Args:
        curs: cursor on the target database.
        movieId: value stored in RDFs.MovieID.
        subjectId: value stored in RDFs.SubjectID.
        relationshipId: value stored in RDFs.RelationshipID.
        objectId: value stored in RDFs.ObjectID.

    Returns:
        True when the row was inserted, False when SQLite raised
        IntegrityError for it.
    """
    sql = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    values = (movieId, subjectId, relationshipId, objectId)
    try:
        curs.execute(sql, values)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||
|
||||
# MARK: Parsing
|
||||
def parseMovies():
    """Load every movie URI from the movies CSV into the Movies table.

    Reads from the module-level MOVIES_CSV_HANDLER and writes through the
    module-level CURS. The first row is discarded (presumably a header) and
    the first column of each remaining row is passed to insertMovie(), whose
    True/False result is ignored.
    """
    reader = csv.reader(MOVIES_CSV_HANDLER)

    # Discard the first row; the default keeps an empty file from raising
    # StopIteration.
    next(reader, None)

    for row in reader:
        # csv.reader yields an empty list for a blank line; row[0] would
        # raise IndexError on it.
        if not row:
            continue
        insertMovie(CURS, row[0])
|
||||
|
||||
|
||||
def parseWikiPageId():
    """Link each movie to its Wikipedia page id using the page-id CSV.

    Reads "subject" (movie URI) and "object" (page id, converted to int) from
    each row of the module-level PAGEID_CSV_HANDLER. Rows whose URI has no
    MovieID are reported on stdout and skipped; the rest are passed to
    insertWikiPageId() through the module-level CURS.
    """
    for record in csv.DictReader(PAGEID_CSV_HANDLER):
        movieUri = record["subject"]
        pageId = int(record["object"])

        movieId = selectMovieId(CURS, movieUri)
        if movieId is not None:
            insertWikiPageId(CURS, movieId, pageId)
        else:
            print(f"The MovieUri: {movieUri} has not a MovieId ")
|
||||
|
||||
|
||||
def parseAbstract():
    """Attach Wikipedia abstracts to movies using the summary CSV.

    Reads "subject" (page id, converted to int) and "text" from each row of
    the module-level SUMMARY_CSV_HANDLER. Rows whose page id maps to no
    MovieID are reported on stdout and skipped; the rest are passed to
    insertWikiAbstract() through the module-level CURS.
    """
    for record in csv.DictReader(SUMMARY_CSV_HANDLER):
        pageId = int(record["subject"])
        abstract = record["text"]

        movieId = selectMovieIdFromWikiPageId(CURS, pageId)
        if movieId is not None:
            insertWikiAbstract(CURS, movieId, abstract)
        else:
            print(f"The WikiPageId: {pageId} has not a MovieId ")
|
||||
|
||||
|
||||
def _storeTriple(
    curs: sqlite3.Cursor,
    subject: str,
    relationship: str,
    obj: str,
    originId: int,
    movieUri: str,
) -> bool:
    """Insert one triple's terms, resolve their ids and add the RDFs row.

    Args:
        curs: cursor on the target database.
        subject: subject URI of the triple.
        relationship: relationship URI of the triple.
        obj: object URI of the triple.
        originId: OriginID recorded on the subject and object rows.
        movieUri: URI used to look up the MovieID the triple belongs to.

    Returns:
        True when a new RDFs row was inserted; False when any id could not
        be resolved (each missing one is reported on stdout) or when
        insertRDF() returned False.
    """
    # The results of these inserts are ignored; the ids are resolved by the
    # lookups that follow.
    insertSubject(curs, subject, originId)
    insertRelationship(curs, relationship)
    insertObject(curs, obj, originId)

    subjectId = selectSubjectId(curs, subject)
    objectId = selectObjectId(curs, obj)
    relationshipId = selectRelationshipId(curs, relationship)
    movieId = selectMovieId(curs, movieUri)

    # Report every missing id before giving up, not only the first one.
    missing = False
    if subjectId is None:
        print(f"No SubjectId for {subject}")
        missing = True
    if objectId is None:
        print(f"No ObjectId for {obj}")
        missing = True
    if relationshipId is None:
        print(f"No RelationshipId for {relationship}")
        missing = True
    if movieId is None:
        print(f"No MovieId for {movieUri}")
        missing = True
    if missing:
        return False

    return insertRDF(curs, movieId, subjectId, relationshipId, objectId)


def parseRDF_Reverse():
    """Load the triples of the reverse CSV into the database.

    Reads "subject", "relationship" and "object" from each row of the
    module-level REVERSE_CSV_HANDLER and writes through the module-level
    CURS. The MovieID is looked up from the triple's object. Every triple is
    echoed to stdout, and the number of RDFs rows actually inserted is
    printed at the end.

    If the 'reverse.csv' origin is not registered (insertOrigin() creates
    it), a message is printed and nothing is loaded, so that no row is
    written with a missing OriginID.
    """
    reader = csv.DictReader(REVERSE_CSV_HANDLER)
    originId = selectOrigin(CURS, 'reverse.csv')
    if originId is None:
        print("No OriginId for reverse.csv")
        return

    total = 0
    for row in reader:
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]
        print(f"RDF triplets:\n\t{subject} - {relationship} - {obj}")

        # In this file the movie is the object of the triple.
        if _storeTriple(CURS, subject, relationship, obj, originId, movieUri=obj):
            total += 1

    print(total)


def parseRDF_Dataset():
    """Load the triples of the dataset CSV into the database.

    Reads "subject", "relationship" and "object" from each row of the
    module-level DATASET_CSV_HANDLER and writes through the module-level
    CURS. The MovieID is looked up from the triple's subject. Every
    100000th triple is echoed to stdout as a progress marker, and the number
    of RDFs rows actually inserted is printed at the end.

    If the 'dataset.csv' origin is not registered (insertOrigin() creates
    it), a message is printed and nothing is loaded, so that no row is
    written with a missing OriginID.
    """
    reader = csv.DictReader(DATASET_CSV_HANDLER)
    originId = selectOrigin(CURS, 'dataset.csv')
    if originId is None:
        print("No OriginId for dataset.csv")
        return

    total = 0
    for rdfIdx, row in enumerate(reader, start=1):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        if rdfIdx % 100000 == 0:
            print(f"RDF number {rdfIdx}:\n\t{subject} - {relationship} - {obj}")

        # In this file the movie is the subject of the triple.
        if _storeTriple(CURS, subject, relationship, obj, originId, movieUri=subject):
            total += 1

    print(total)
|
||||
|
||||
|
||||
# MARK: Actual Code
# Pipeline steps, all currently commented out. Their order matters:
# parseWikiPageId() looks up MovieIDs written by parseMovies(),
# parseAbstract() looks up the page ids written by parseWikiPageId(), and the
# parseRDF_* steps read the Origins rows written by insertOrigin().
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseRDF_Reverse()
# parseRDF_Dataset()

# Persist whatever the enabled steps wrote, then release the connection.
CONN.commit()
CONN.close()

# Close the CSV inputs that were opened at the top of the file.
MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
|
||||
|
||||
|
||||
"""
|
||||
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
|
||||
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
|
||||
"""
|
||||
|
||||
"""
|
||||
The WikiPageId: 10068850 has not a MovieId
|
||||
The WikiPageId: 55069615 has not a MovieId
|
||||
The WikiPageId: 49510056 has not a MovieId
|
||||
The WikiPageId: 4049786 has not a MovieId
|
||||
The WikiPageId: 55510238 has not a MovieId
|
||||
The WikiPageId: 31239628 has not a MovieId
|
||||
The WikiPageId: 34757217 has not a MovieId
|
||||
The WikiPageId: 64311757 has not a MovieId
|
||||
The WikiPageId: 8326198 has not a MovieId
|
||||
The WikiPageId: 42162164 has not a MovieId
|
||||
The WikiPageId: 18502369 has not a MovieId
|
||||
The WikiPageId: 58092358 has not a MovieId
|
||||
The WikiPageId: 40710250 has not a MovieId
|
||||
"""
|
||||
Loading…
x
Reference in New Issue
Block a user