Merged abbreviation_datawarehouse into datawarehouse
This commit is contained in:
parent
9a5d633b5e
commit
4315d70109
@ -1,105 +0,0 @@
|
|||||||
import sqlite3
|
|
||||||
import csv
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
# --- Paths ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
CSV_MAPPER = "./Assets/Dataset/1-hop/uri-abbreviations.csv"

# URI -> abbreviation table. The two columns are kept as parallel lists
# because parser() hands them straight to pandas' replace().
mapper = pd.read_csv(CSV_MAPPER)
mapper_key_list, mapper_value_list = (
    mapper["uri"].to_list(),
    mapper["abbreviation"].to_list(),
)

# Connection and cursor shared by every helper in this module.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
|
|
||||||
|
|
||||||
def insert_abbreviation(uri, abbreviation) -> bool:
    """Insert one (URI, Abbreviation) row into ``Abbreviations`` via the global cursor.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
    try:
        CURS.execute(statement, (uri, abbreviation))
    except sqlite3.IntegrityError:
        return False
    return True
|
|
||||||
|
|
||||||
def inserto_object_abbreviation(object_id, abbreviation_id) -> bool:
    """Link an object to an abbreviation in ``Objects_Abbreviations``.

    The column name was previously misspelled ``AbrreviationID``; it now
    matches the ``AbbreviationID`` spelling used by the sibling link tables.

    Args:
        object_id: value stored in the ``ObjectID`` column.
        abbreviation_id: value stored in the ``AbbreviationID`` column.

    Returns:
        ``True`` when the link was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = (
        "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
    )
    try:
        CURS.execute(statement, [object_id, abbreviation_id])
    except sqlite3.IntegrityError:
        return False
    return True


# Correctly spelled alias; the original (misspelled) name stays for callers.
insert_object_abbreviation = inserto_object_abbreviation
|
|
||||||
|
|
||||||
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
    """Link a relationship to an abbreviation in ``Relationships_Abbreviations``.

    Returns:
        ``True`` when the link was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(statement, (relationship_id, abbreviation_id))
    except sqlite3.IntegrityError:
        return False
    return True
|
|
||||||
|
|
||||||
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
    """Link a subject to an abbreviation in ``Subjects_Abbreviations``.

    Returns:
        ``True`` when the link was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
    try:
        CURS.execute(statement, (subject_id, abbreviation_id))
    except sqlite3.IntegrityError:
        return False
    return True
|
|
||||||
|
|
||||||
def select_abbreviation_id(uri) -> int | None:
    """Return the ``AbbreviationID`` of the row whose ``URI`` equals ``uri``.

    The lookup uses ``=`` rather than ``LIKE``: ``uri`` is a complete URI, and
    with ``LIKE`` any ``%`` or ``_`` it contains acts as a wildcard (and ASCII
    case is ignored), so a different row could be returned.

    Args:
        uri: the exact URI to look up.

    Returns:
        The id of the matching row, or ``None`` when no row matches.
    """
    query = "SELECT AbbreviationID FROM Abbreviations WHERE URI = ?;"
    CURS.execute(query, [uri])
    row = CURS.fetchone()
    if not row:
        return None

    # fetchone() yields a 1-tuple; the id is its only element
    return row[0]
|
|
||||||
|
|
||||||
def parser(element: pd.DataFrame):
    """Return ``element`` with every mapped URI swapped for its abbreviation.

    The substitution is delegated to pandas' list-based ``replace``:
    ``mapper_key_list[i]`` is replaced by ``mapper_value_list[i]``.

    NOTE(review): ``element`` must be a pandas object; a plain ``str`` has a
    different ``replace`` signature and would raise ``TypeError`` here.
    """
    return element.replace(to_replace=mapper_key_list, value=mapper_value_list)
|
|
||||||
# # map by csv
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def populate():
    """Fill ``Abbreviations`` and the three link tables from the stored URIs.

    For every row of ``Subjects``, ``Objects`` and ``Relationships`` the URI is
    abbreviated with ``parser``, the (URI, abbreviation) pair is inserted, and
    the entity is linked to the resulting abbreviation id.

    Fixes over the previous version:
      * objects and relationships are linked through their own link tables
        instead of ``Subjects_Abbreviations``;
      * ``parser`` receives a whole column (pandas Series) rather than a single
        ``str``, which raised ``TypeError``;
      * rows whose abbreviation id cannot be found are skipped instead of
        inserting a link with a ``None`` id.
    """

    def link_all(frame, uri_column, id_column, insert_link):
        """Insert the abbreviation of each row of ``frame`` and link it."""
        triples = zip(frame[uri_column], frame[id_column], frame["Abbreviation"])
        for uri, entity_id, abbreviation in triples:
            # a False return is deliberately ignored: the select below
            # still finds the id of an already stored URI
            insert_abbreviation(uri, abbreviation)
            abbreviation_id = select_abbreviation_id(uri)
            if abbreviation_id is None:
                continue
            insert_link(int(entity_id), abbreviation_id)

    subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)
    objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)
    relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)

    # abbreviate each URI column in one pass
    subjects["Abbreviation"] = parser(subjects["SubjectURI"])
    objects["Abbreviation"] = parser(objects["ObjectURI"])
    relationships["Abbreviation"] = parser(relationships["RelationshipURI"])

    link_all(subjects, "SubjectURI", "SubjectID", insert_subject_abbreviation)
    link_all(objects, "ObjectURI", "ObjectID", inserto_object_abbreviation)
    link_all(
        relationships,
        "RelationshipURI",
        "RelationshipID",
        insert_relationship_abbreviation,
    )
|
|
||||||
|
|
||||||
|
|
||||||
CONN.commit()
|
|
||||||
CONN.close()
|
|
||||||
|
|
||||||
# MAPPER_HANDLER.close()
|
|
||||||
@ -8,7 +8,7 @@ import csv
|
|||||||
#####################################################################
|
#####################################################################
|
||||||
|
|
||||||
# sometimes you may need to build a new db file, here a little snippet for you
|
# sometimes you may need to build a new db file, here a little snippet for you
|
||||||
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
|
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql
|
||||||
|
|
||||||
# --- Global configuration ---
|
# --- Global configuration ---
|
||||||
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
|
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
|
||||||
@ -17,12 +17,15 @@ PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
|
|||||||
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
|
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
|
||||||
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
|
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
|
||||||
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
|
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"
|
||||||
|
URI_CSV = "./Assets/Dataset/1-hop/uri-abbreviations.csv"
|
||||||
|
|
||||||
|
MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
|
||||||
|
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
|
||||||
|
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
|
||||||
|
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
|
||||||
|
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")
|
||||||
|
URI_ABBR_CSV_HANDLER = open(URI_CSV, "r", newline="", encoding="utf-8")
|
||||||
|
|
||||||
MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
|
|
||||||
PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
|
|
||||||
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
|
|
||||||
DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
|
|
||||||
REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")
|
|
||||||
|
|
||||||
CONN = sqlite3.connect(DB_NAME)
|
CONN = sqlite3.connect(DB_NAME)
|
||||||
CURS = CONN.cursor()
|
CURS = CONN.cursor()
|
||||||
@ -30,7 +33,8 @@ CURS = CONN.cursor()
|
|||||||
# MARK: SQL Definitions
|
# MARK: SQL Definitions
|
||||||
# Insert MovieURI
|
# Insert MovieURI
|
||||||
|
|
||||||
def insertOrigin(curs : sqlite3.Cursor ) -> bool:
|
|
||||||
|
def insertOrigin(curs: sqlite3.Cursor) -> bool:
|
||||||
|
|
||||||
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
|
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
|
||||||
try:
|
try:
|
||||||
@ -38,24 +42,26 @@ def insertOrigin(curs : sqlite3.Cursor ) -> bool:
|
|||||||
return True
|
return True
|
||||||
except sqlite3.IntegrityError:
|
except sqlite3.IntegrityError:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
|
||||||
|
|
||||||
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
|
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
|
||||||
|
|
||||||
curs.execute(QUERY, [originName])
|
curs.execute(QUERY, [originName])
|
||||||
originId = curs.fetchone()
|
originId = curs.fetchone()
|
||||||
if not originId:
|
if not originId:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# in this case the real id is the first element of the tuple
|
# in this case the real id is the first element of the tuple
|
||||||
return originId[0]
|
return originId[0]
|
||||||
|
|
||||||
def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    """Insert ``movieUri`` into ``Movies``.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(statement, (movieUri,))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
@ -64,12 +70,12 @@ def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool:
|
|||||||
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
|
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
|
||||||
|
|
||||||
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
|
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
|
||||||
|
|
||||||
curs.execute(QUERY, [movieUri])
|
curs.execute(QUERY, [movieUri])
|
||||||
movieId = curs.fetchone()
|
movieId = curs.fetchone()
|
||||||
if not movieId:
|
if not movieId:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# in this case the real id is the first element of the tuple
|
# in this case the real id is the first element of the tuple
|
||||||
return movieId[0]
|
return movieId[0]
|
||||||
|
|
||||||
@ -77,105 +83,164 @@ def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
|
|||||||
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    """Insert a (MovieID, PageID) row into ``WikiPageIDs``.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
        curs.execute(statement, (movieId, pageId))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
|
|
||||||
|
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
|
||||||
|
|
||||||
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
|
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
|
||||||
|
|
||||||
curs.execute(QUERY, [pageId])
|
curs.execute(QUERY, [pageId])
|
||||||
movieId = curs.fetchone()
|
movieId = curs.fetchone()
|
||||||
if not movieId:
|
if not movieId:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# in this case the real id is the first element of the tuple
|
# in this case the real id is the first element of the tuple
|
||||||
return movieId[0]
|
return movieId[0]
|
||||||
|
|
||||||
|
|
||||||
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Insert a (MovieID, Abstract) row into ``WikipediaAbstracts``.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(statement, (movieId, abstract))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Insert a (SubjectURI, OriginID) row into ``Subjects``.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(statement, (subjectURI, originID))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Insert ``relationshipURI`` into ``Relationships``.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(statement, (relationshipURI,))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Insert an (ObjectURI, OriginID) row into the objects table.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(statement, (objectURI, originID))
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
|
||||||
|
|
||||||
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
|
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
|
||||||
|
|
||||||
curs.execute(QUERY, [subjectURI])
|
curs.execute(QUERY, [subjectURI])
|
||||||
subjectId = curs.fetchone()
|
subjectId = curs.fetchone()
|
||||||
if not subjectId:
|
if not subjectId:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# in this case the real id is the first element of the tuple
|
# in this case the real id is the first element of the tuple
|
||||||
return subjectId[0]
|
return subjectId[0]
|
||||||
|
|
||||||
|
|
||||||
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
|
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
|
||||||
|
|
||||||
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
|
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
|
||||||
|
|
||||||
curs.execute(QUERY, [relationshipURI])
|
curs.execute(QUERY, [relationshipURI])
|
||||||
relationshipId = curs.fetchone()
|
relationshipId = curs.fetchone()
|
||||||
if not relationshipId:
|
if not relationshipId:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# in this case the real id is the first element of the tuple
|
# in this case the real id is the first element of the tuple
|
||||||
return relationshipId[0]
|
return relationshipId[0]
|
||||||
|
|
||||||
|
|
||||||
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
|
||||||
|
|
||||||
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
|
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
|
||||||
|
|
||||||
curs.execute(QUERY, [objectURI])
|
curs.execute(QUERY, [objectURI])
|
||||||
objectId = curs.fetchone()
|
objectId = curs.fetchone()
|
||||||
if not objectId:
|
if not objectId:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# in this case the real id is the first element of the tuple
|
# in this case the real id is the first element of the tuple
|
||||||
return objectId[0]
|
return objectId[0]
|
||||||
|
|
||||||
|
|
||||||
def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
) -> bool:
    """Insert one (MovieID, SubjectID, RelationshipID, ObjectID) row into ``RDFs``.

    Returns:
        ``True`` when the row was inserted, ``False`` when SQLite rejected it
        with ``sqlite3.IntegrityError``.
    """
    statement = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    values = (movieId, subjectId, relationshipId, objectId)
    try:
        curs.execute(statement, values)
    except sqlite3.IntegrityError:
        return False
    return True
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_abbreviation(uri, abbreviation) -> bool:
|
||||||
|
QUERY = "INSERT INTO Abbreviations(URI, Abbreviation) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [uri, abbreviation])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_object_abbreviation(object_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = "INSERT INTO Objects_Abbreviations(ObjectID, AbbreviationID) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [object_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_relationship_abbreviation(relationship_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = "INSERT INTO Relationships_Abbreviations(RelationshipID, AbbreviationID) VALUES (?,?);"
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [relationship_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def insert_subject_abbreviation(subject_id, abbreviation_id) -> bool:
|
||||||
|
QUERY = (
|
||||||
|
"INSERT INTO Subjects_Abbreviations(SubjectID, AbbreviationID) VALUES (?,?);"
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
CURS.execute(QUERY, [subject_id, abbreviation_id])
|
||||||
|
return True
|
||||||
|
except sqlite3.IntegrityError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# UGLY: correct method to add cursor
|
||||||
|
def select_abbreviation_id(uri) -> int | None:
|
||||||
|
QUERY = "SELECT AbbreviationID FROM Abbreviations WHERE URI LIKE ?;"
|
||||||
|
CURS.execute(QUERY, [uri])
|
||||||
|
abbreviation_id = CURS.fetchone()
|
||||||
|
if not abbreviation_id:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# in this case the real id is the first element of the tuple
|
||||||
|
return abbreviation_id[0]
|
||||||
|
|
||||||
|
|
||||||
# MARK: Parsing
|
# MARK: Parsing
|
||||||
def parseMovies():
|
def parseMovies():
|
||||||
|
|
||||||
@ -203,12 +268,11 @@ def parseWikiPageId():
|
|||||||
def parseAbstract():
|
def parseAbstract():
|
||||||
CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
|
CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
|
||||||
for row in CSV_READER:
|
for row in CSV_READER:
|
||||||
|
|
||||||
WIKI_PAGE_ID = int(row["subject"])
|
WIKI_PAGE_ID = int(row["subject"])
|
||||||
ABSTRACT = row["text"]
|
ABSTRACT = row["text"]
|
||||||
MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
|
MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
|
||||||
|
|
||||||
|
|
||||||
if MOVIE_ID is None:
|
if MOVIE_ID is None:
|
||||||
print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
|
print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
|
||||||
continue
|
continue
|
||||||
@ -216,10 +280,24 @@ def parseAbstract():
|
|||||||
insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
|
insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbreviations():
    """Insert every (uri, abbreviation) pair of the abbreviation CSV into the database."""
    for record in csv.DictReader(URI_ABBR_CSV_HANDLER):
        insert_abbreviation(record["uri"], record["abbreviation"])
|
||||||
|
|
||||||
|
|
||||||
def parseRDF_Reverse():
|
def parseRDF_Reverse():
|
||||||
|
|
||||||
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
|
REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
|
||||||
REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
|
REVERSE_ORIGIN_ID = selectOrigin(CURS, "reverse.csv")
|
||||||
|
|
||||||
|
if REVERSE_ORIGIN_ID is None:
|
||||||
|
return
|
||||||
|
|
||||||
total = 0
|
total = 0
|
||||||
|
|
||||||
for row in REVERSE_CSV_READER:
|
for row in REVERSE_CSV_READER:
|
||||||
@ -227,7 +305,7 @@ def parseRDF_Reverse():
|
|||||||
RELATIONSHIP = row["relationship"]
|
RELATIONSHIP = row["relationship"]
|
||||||
OBJECT = row["object"]
|
OBJECT = row["object"]
|
||||||
print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
|
print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
|
||||||
insertSubject(CURS,SUBJECT,REVERSE_ORIGIN_ID)
|
insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
|
||||||
insertRelationship(CURS, RELATIONSHIP)
|
insertRelationship(CURS, RELATIONSHIP)
|
||||||
insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
|
insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
|
||||||
|
|
||||||
@ -236,7 +314,6 @@ def parseRDF_Reverse():
|
|||||||
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
|
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
|
||||||
MOVIE_ID = selectMovieId(CURS, OBJECT)
|
MOVIE_ID = selectMovieId(CURS, OBJECT)
|
||||||
|
|
||||||
|
|
||||||
skip = False
|
skip = False
|
||||||
|
|
||||||
# guard
|
# guard
|
||||||
@ -259,17 +336,19 @@ def parseRDF_Reverse():
|
|||||||
if skip:
|
if skip:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
|
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
|
||||||
total += 1
|
total += 1
|
||||||
|
|
||||||
print(total)
|
print(total)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parseRDF_Dataset():
|
def parseRDF_Dataset():
|
||||||
|
|
||||||
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
|
DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
|
||||||
DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
|
DATASET_ORIGIN_ID = selectOrigin(CURS, "dataset.csv")
|
||||||
|
|
||||||
|
if DATASET_ORIGIN_ID is None:
|
||||||
|
return
|
||||||
|
|
||||||
total = 0
|
total = 0
|
||||||
rdf_idx = 0
|
rdf_idx = 0
|
||||||
@ -284,7 +363,7 @@ def parseRDF_Dataset():
|
|||||||
if rdf_idx % 100000 == 0:
|
if rdf_idx % 100000 == 0:
|
||||||
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
|
print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
|
||||||
|
|
||||||
insertSubject(CURS,SUBJECT,DATASET_ORIGIN_ID)
|
insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
|
||||||
insertRelationship(CURS, RELATIONSHIP)
|
insertRelationship(CURS, RELATIONSHIP)
|
||||||
insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
|
insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
|
||||||
|
|
||||||
@ -293,7 +372,6 @@ def parseRDF_Dataset():
|
|||||||
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
|
RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
|
||||||
MOVIE_ID = selectMovieId(CURS, SUBJECT)
|
MOVIE_ID = selectMovieId(CURS, SUBJECT)
|
||||||
|
|
||||||
|
|
||||||
skip = False
|
skip = False
|
||||||
|
|
||||||
# guard
|
# guard
|
||||||
@ -316,24 +394,203 @@ def parseRDF_Dataset():
|
|||||||
if skip:
|
if skip:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
|
if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID): # type: ignore
|
||||||
total += 1
|
total += 1
|
||||||
|
|
||||||
print(total)
|
print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbr_Reverse():
    """Link each subject, relationship and object of reverse.csv to an abbreviation.

    For every triple whose three ids are already stored, each URI is split on
    ``/``. URIs with more than 4 sections are searched from the longest prefix
    (at most 7 sections) down to the shortest allowed one (4 sections for
    subjects and objects, 3 for relationships); the first abbreviation found is
    linked to the entity. The number of newly inserted links is printed at the
    end.

    Returns immediately when the ``reverse.csv`` origin is not registered.
    """

    def link_longest_prefix(uri, floor, entity_id, insert_link) -> bool:
        """Link ``entity_id`` to the abbreviation of the longest matching prefix."""
        sections = uri.split("/")
        if len(sections) <= 4:
            return False
        for size in range(min(len(sections), 7), floor, -1):
            pattern = "/".join(sections[:size]) + "%"
            abbreviation_id = select_abbreviation_id(pattern)
            if abbreviation_id is not None:
                # stop at the first (longest) hit, whether or not the link is new
                return insert_link(entity_id, abbreviation_id)
        return False

    # Rewind the shared handle so this step also works after another parser
    # has already consumed the file.
    REVERSE_CSV_HANDLER.seek(0)
    reader = csv.DictReader(REVERSE_CSV_HANDLER)

    if selectOrigin(CURS, "reverse.csv") is None:
        return

    total = 0

    for row in reader:
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id before skipping the triple
        skip = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            skip = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            skip = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            skip = True
        if skip:
            continue

        if link_longest_prefix(subject, 3, subject_id, insert_subject_abbreviation):
            total += 1
        if link_longest_prefix(
            relationship, 2, relationship_id, insert_relationship_abbreviation
        ):
            total += 1
        if link_longest_prefix(obj, 3, object_id, insert_object_abbreviation):
            total += 1

    print(total)
|
||||||
|
|
||||||
|
|
||||||
|
def parseAbbr_Dataset():
    """Link each subject, relationship and object of dataset.csv to an abbreviation.

    For every triple whose three ids are already stored, each URI is split on
    ``/``. URIs with more than 4 sections are searched from the longest prefix
    (at most 7 sections) down to the shortest allowed one (4 sections for
    subjects and objects, 3 for relationships); the first abbreviation found is
    linked to the entity. Progress is printed every 100000 rows and the number
    of newly inserted links is printed at the end.

    Returns immediately when the ``dataset.csv`` origin is not registered.
    """

    def link_longest_prefix(uri, floor, entity_id, insert_link) -> bool:
        """Link ``entity_id`` to the abbreviation of the longest matching prefix."""
        sections = uri.split("/")
        if len(sections) <= 4:
            return False
        for size in range(min(len(sections), 7), floor, -1):
            pattern = "/".join(sections[:size]) + "%"
            abbreviation_id = select_abbreviation_id(pattern)
            if abbreviation_id is not None:
                # stop at the first (longest) hit, whether or not the link is new
                return insert_link(entity_id, abbreviation_id)
        return False

    # Rewind the shared handle so this step also works after another parser
    # has already consumed the file.
    DATASET_CSV_HANDLER.seek(0)
    reader = csv.DictReader(DATASET_CSV_HANDLER)

    if selectOrigin(CURS, "dataset.csv") is None:
        return

    total = 0

    for rdf_idx, row in enumerate(reader, start=1):
        subject = row["subject"]
        relationship = row["relationship"]
        obj = row["object"]

        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{subject} - {relationship} - {obj}")

        subject_id = selectSubjectId(CURS, subject)
        object_id = selectObjectId(CURS, obj)
        relationship_id = selectRelationshipId(CURS, relationship)

        # guard: report every missing id before skipping the triple
        skip = False
        if subject_id is None:
            print(f"No SubjectId for {subject}")
            skip = True
        if object_id is None:
            print(f"No ObjectId for {obj}")
            skip = True
        if relationship_id is None:
            print(f"No RelationshipId for {relationship}")
            skip = True
        if skip:
            continue

        if link_longest_prefix(subject, 3, subject_id, insert_subject_abbreviation):
            total += 1
        if link_longest_prefix(
            relationship, 2, relationship_id, insert_relationship_abbreviation
        ):
            total += 1
        if link_longest_prefix(obj, 3, object_id, insert_object_abbreviation):
            total += 1

    print(total)
|
||||||
|
|
||||||
|
|
||||||
# MARK: Actual Code
|
# MARK: Actual Code
|
||||||
# parseMovies()
|
# parseMovies()
|
||||||
# parseWikiPageId()
|
# parseWikiPageId()
|
||||||
# parseAbstract()
|
# parseAbstract()
|
||||||
# insertOrigin(CURS)
|
# insertOrigin(CURS)
|
||||||
|
# parseAbbreviations()
|
||||||
# parseRDF_Reverse()
|
# parseRDF_Reverse()
|
||||||
# parseRDF_Dataset()
|
# parseRDF_Dataset()
|
||||||
|
# parseAbbr_Reverse()
|
||||||
|
parseAbbr_Dataset()
|
||||||
|
|
||||||
|
|
||||||
CONN.commit()
|
CONN.commit()
|
||||||
CONN.close()
|
CONN.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
MOVIES_CSV_HANDLER.close()
|
MOVIES_CSV_HANDLER.close()
|
||||||
@ -341,35 +598,36 @@ PAGEID_CSV_HANDLER.close()
|
|||||||
SUMMARY_CSV_HANDLER.close()
|
SUMMARY_CSV_HANDLER.close()
|
||||||
DATASET_CSV_HANDLER.close()
|
DATASET_CSV_HANDLER.close()
|
||||||
REVERSE_CSV_HANDLER.close()
|
REVERSE_CSV_HANDLER.close()
|
||||||
|
URI_ABBR_CSV_HANDLER.close()
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
|
||||||
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
|
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
|
||||||
"""
|
"""
|
||||||
|
|
||||||
"""
|
"""
|
||||||
The WikiPageId: 10068850 has not a MovieId
|
The WikiPageId: 10068850 has not a MovieId
|
||||||
The WikiPageId: 55069615 has not a MovieId
|
The WikiPageId: 55069615 has not a MovieId
|
||||||
The WikiPageId: 49510056 has not a MovieId
|
The WikiPageId: 49510056 has not a MovieId
|
||||||
The WikiPageId: 4049786 has not a MovieId
|
The WikiPageId: 4049786 has not a MovieId
|
||||||
The WikiPageId: 55510238 has not a MovieId
|
The WikiPageId: 55510238 has not a MovieId
|
||||||
The WikiPageId: 31239628 has not a MovieId
|
The WikiPageId: 31239628 has not a MovieId
|
||||||
The WikiPageId: 34757217 has not a MovieId
|
The WikiPageId: 34757217 has not a MovieId
|
||||||
The WikiPageId: 64311757 has not a MovieId
|
The WikiPageId: 64311757 has not a MovieId
|
||||||
The WikiPageId: 8326198 has not a MovieId
|
The WikiPageId: 8326198 has not a MovieId
|
||||||
The WikiPageId: 42162164 has not a MovieId
|
The WikiPageId: 42162164 has not a MovieId
|
||||||
The WikiPageId: 18502369 has not a MovieId
|
The WikiPageId: 18502369 has not a MovieId
|
||||||
The WikiPageId: 58092358 has not a MovieId
|
The WikiPageId: 58092358 has not a MovieId
|
||||||
The WikiPageId: 40710250 has not a MovieId
|
The WikiPageId: 40710250 has not a MovieId
|
||||||
"""
|
"""
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user