Added script to fetch data from DBPedia
This commit is contained in:
parent
db87295890
commit
7c04309cc1
79
Script/fetchdata.py
Normal file
79
Script/fetchdata.py
Normal file
@ -0,0 +1,79 @@
|
|||||||
|
from time import sleep
|
||||||
|
import SPARQLWrapper
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# SPARQL endpoint queried by the script and the response format requested
# from it.
BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV

# Pause, in seconds, between two consecutive page requests (passed to sleep()).
TIMEOUT_SECONDS = 10

# Paging: every request asks for LIMIT rows and page N starts at OFFSET * N.
# NOTE(review): OFFSET equals LIMIT, so if the endpoint caps responses below
# LIMIT rows, the rows between the cap and LIMIT would be skipped on every
# page -- confirm against the endpoint's configured maximum.
LIMIT = 1_000_000
OFFSET = LIMIT
INITIAL_OFFSET = 0  # first page index requested
MAX_PAGES = 100  # exclusive upper bound on the page index

# CSV file that the fetched pages are appended to.
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
|
||||||
|
|
||||||
|
# Base SPARQL query: every (subject, relationship, object) triple whose object
# is a dbo:Film, excluding plain wiki page links. LIMIT/OFFSET clauses are
# appended to this text for each page.
# The rdf: prefix is declared explicitly and the SELECT variables are
# whitespace-separated so the query is valid standard SPARQL and does not
# depend on endpoint-specific leniency.
QUERY = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject ?relationship ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def _build_page_query(page):
    """Return QUERY followed by the LIMIT/OFFSET clauses selecting `page`."""
    return "\n".join([
        QUERY,
        f"LIMIT {LIMIT}",
        f"OFFSET {OFFSET * page}",
    ])


def _split_csv_page(raw):
    """Decode a CSV response into ``(header_line, data_text)``.

    ``data_text`` is everything after the first line; when non-empty it is
    guaranteed to end with a newline so consecutive pages never get glued
    together on one line.
    """
    header, _, rows = raw.decode("utf-8", "ignore").partition("\n")
    if rows and not rows.endswith("\n"):
        rows += "\n"
    return header, rows


def main():
    """Download the query results page by page and append them to FILE_URI.

    Pages ``INITIAL_OFFSET`` .. ``MAX_PAGES - 1`` are requested in order, with
    a pause of ``TIMEOUT_SECONDS`` between requests. The CSV header line is
    written only when the output file is still empty, so the file holds a
    single header even if the first page fails or the script is re-run.
    Fetching stops early at the first page that contains no data rows.

    A failure on one page is reported and the loop moves on to the next page
    (best effort); nothing is raised for per-page errors.
    """
    from pathlib import Path

    target = Path(FILE_URI)
    # Without the directory every write would fail and each fetched page
    # would be lost inside the per-page error handler below.
    target.parent.mkdir(parents=True, exist_ok=True)

    # The endpoint and the return format never change, so configure once.
    sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
    sparql.setReturnFormat(TYPE)

    for page in range(INITIAL_OFFSET, MAX_PAGES):

        print(f"Starting to get page {page}")

        page_query = _build_page_query(page)
        print(f"\nCurrent Query:\n{page_query}\n")
        sparql.setQuery(page_query)

        try:
            res = sparql.queryAndConvert()

            if not isinstance(res, bytes):
                # Only raw bytes can be decoded as CSV text here; anything
                # else is skipped instead of writing an empty chunk.
                print(
                    f"Skipping page {page}: unexpected response type "
                    f"{type(res).__name__}"
                )
            else:
                header, rows = _split_csv_page(res)

                if not rows.strip():
                    # Header only: the result set is exhausted.
                    print(f"Page {page} has no rows, stopping")
                    break

                needs_header = (
                    not target.exists() or target.stat().st_size == 0
                )

                with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                    print(f"Writing page {page} on {FILE_URI}")
                    if needs_header:
                        dataset.write(header + "\n")
                    dataset.write(rows)

        except Exception as ex:
            # Top-level best-effort boundary: report and try the next page.
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        sleep(TIMEOUT_SECONDS)
|
||||||
|
|
||||||
|
# Run the download only when this file is executed as a script, so that
# importing the module does not trigger network requests and file writes.
if __name__ == "__main__":
    main()
|
||||||
Loading…
x
Reference in New Issue
Block a user