Moved fetchdata.py to reflect working tree
old - ${Proj}/Scripts/fetchdata.py
new - ${Proj}/Scripts/DataGathering/fetchdata.py
99  Scripts/DataGathering/fetchdata.py  Normal file
@@ -0,0 +1,99 @@
from math import floor
from time import sleep

import SPARQLWrapper
import requests

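# DBpedia SPARQL endpoint and paging configuration; INITIAL_OFFSET restarts
# the fetch partway through instead of from offset 0.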
BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
OFFSET = LIMIT
INITIAL_OFFSET = 15200000
MAX_PAGES = int(1E9)

# Missing page 13220000

FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"

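# Every (?subject, ?relationship, ?object) triple whose subject is a dbo:Film
# with a foaf:primaryTopic backlink, excluding wiki housekeeping predicates.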
QUERY = """
|
||||
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||
PREFIX dbp: <http://dbpedia.org/property/>
|
||||
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||
|
||||
SELECT ?subject, ?relationship, ?object
|
||||
WHERE {
|
||||
?subject ?relationship ?object .
|
||||
?subject rdf:type dbo:Film .
|
||||
?a foaf:primaryTopic ?subject
|
||||
FILTER (?relationship NOT IN (
|
||||
dbo:wikiPageRedirects,
|
||||
dbo:wikiPageExternalLink,
|
||||
dbo:wikiPageWikiLink
|
||||
))
|
||||
}"""
|
||||
|
||||
|
||||
def main():
    exit = False
    page = int(floor(INITIAL_OFFSET / LIMIT)) - 1

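    # Fetch one page per iteration until the endpoint returns an empty page.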
    while not exit:
        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(OFFSET + (page * LIMIT))
        sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
        sparql.setReturnFormat(TYPE)

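        # Page the base query by appending LIMIT and OFFSET clauses.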
        CURRENT_PAGE_QUERY = "\n".join([
            QUERY,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")
        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

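            # CSV results arrive as bytes; keep the header row only for the
            # first page so it is written to the dataset just once.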
            if isinstance(res, bytes):
                initial_offset = 0
                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(text)

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


main()