"""Download 1-hop triples about DBpedia films, page by page, into a CSV file.

Each page is fetched from the public DBpedia SPARQL endpoint with
LIMIT/OFFSET paging and appended to ``FILE_URI``.
"""

from math import floor
from time import sleep

import SPARQLWrapper
import requests

BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
# Pause between two consecutive requests, in seconds.
TIMEOUT_SECONDS = 1.5
# Number of rows requested per page.
LIMIT = int(1E4)
# Base offset added to ``page * LIMIT``; page ``p`` therefore starts at row
# ``(p + 1) * LIMIT`` and page ``-1`` starts at row 0.
OFFSET = LIMIT
# Row offset at which this run (re)starts downloading.
INITIAL_OFFSET = 15200000
# Upper bound on the page counter.
MAX_PAGES = int(1E9)

# Missing page 13220000

FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"

# The prefix IRIs are the standard DBpedia / FOAF / RDF namespaces; a PREFIX
# declaration without an IRI is not valid SPARQL.
# NOTE(review): there is no ORDER BY, so LIMIT/OFFSET paging relies on the
# endpoint returning rows in a stable order across requests -- confirm.
QUERY = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink
    ))
}"""


def _build_page_query(offset):
    """Return ``QUERY`` restricted to ``LIMIT`` rows starting at ``offset``."""
    return "\n".join([
        QUERY,
        f"LIMIT {LIMIT}",
        f"OFFSET {offset}",
    ])


def _extract_rows(payload, keep_header):
    """Decode one CSV response and optionally drop its header line.

    Args:
        payload: Value returned by ``queryAndConvert()``; anything that is
            not ``bytes`` is treated as an empty page.
        keep_header: When false, everything up to and including the first
            newline is removed.

    Returns:
        The decoded text to append to the dataset, or ``""`` when the page
        holds nothing to write.
    """
    if not isinstance(payload, bytes):
        return ""

    decoded = payload.decode("utf-8", "ignore")
    if keep_header:
        return decoded

    # Same result as splitting on "\n" and re-joining every line but the
    # first, without building the intermediate list.
    _, _, body = decoded.partition("\n")
    return body


def main():
    """Fetch pages from ``INITIAL_OFFSET`` onwards and append them to the dataset.

    The loop stops at the first empty page or once the page counter reaches
    ``MAX_PAGES - 1``.  A page that raises is reported and skipped
    (best effort); it is not retried.
    """
    # The client configuration never changes, so build it once.
    sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
    sparql.setReturnFormat(TYPE)

    page = INITIAL_OFFSET // LIMIT - 1

    while True:
        print(f"Starting to get page {page}")

        current_offset = OFFSET + page * LIMIT
        current_page_query = _build_page_query(current_offset)
        print(f"\nCurrent Query:\n{current_page_query}\n")
        sparql.setQuery(current_page_query)

        try:
            res = sparql.queryAndConvert()

            # Only the very first rows of the result set keep the CSV header;
            # every later page is appended below it.
            text = _extract_rows(res, keep_header=current_offset == 0)
            if not text:
                break

            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(text)

        except Exception as ex:
            # Deliberate best effort: report the failure and move on.
            print(f"Something went wrong during page {page}:\n\t{ex}")

        page += 1
        # ">=" also stops a run whose start page is already past the cap.
        if page >= MAX_PAGES - 1:
            break

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        sleep(TIMEOUT_SECONDS)


if __name__ == "__main__":
    main()