"""Page through a DBpedia SPARQL query and append each CSV page to a file."""

from time import sleep

import SPARQLWrapper
import requests

BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 10  # pause between two consecutive page requests
LIMIT = 1_000_000  # rows requested per page
OFFSET = LIMIT  # row offset added for each further page
INITIAL_OFFSET = 0  # index of the first *page* requested (not a row offset)
MAX_PAGES = 100  # exclusive upper bound on the page index
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"

# NOTE(review): the three PREFIX declarations and the FILTER comparison below
# appear to have lost their <IRI> operands; per the SPARQL grammar a PREFIX
# needs an IRI after the prefix name, so the text presumably has to be
# restored before the endpoint will accept it. It is kept exactly as found
# because the missing FILTER IRI cannot be recovered from this file.
QUERY = (
    " PREFIX dbo: PREFIX dbp: PREFIX dbr:"
    " SELECT ?subject, ?relationship, ?object"
    " WHERE {"
    " ?subject ?relationship ?object ."
    " ?object rdf:type dbo:Film ."
    " FILTER (?relationship != )"
    " }"
)


def _build_page_query(page: int) -> str:
    """Return QUERY with the LIMIT/OFFSET clauses selecting *page*."""
    row_offset = OFFSET * page
    return "\n".join([
        QUERY,
        f"LIMIT {LIMIT}",
        f"OFFSET {row_offset}",
    ])


def _extract_rows(res: object, page: int) -> str:
    """Turn one raw endpoint response into the text to append to the file.

    Anything that is not ``bytes`` yields an empty string. For every page
    except page 0 the first line of the response is dropped, so only page 0
    contributes its first (header) line to the output file.
    """
    if not isinstance(res, bytes):
        return ""

    first_kept_line = 0 if page == 0 else 1
    lines = res.decode("utf-8", "ignore").split("\n")
    text = "\n".join(lines[first_kept_line:])

    # Pages are appended back to back: make sure a page never ends mid-line,
    # otherwise its last row would fuse with the first row of the next page.
    if text and not text.endswith("\n"):
        text += "\n"
    return text


def main():
    """Fetch pages INITIAL_OFFSET..MAX_PAGES-1 and append them to FILE_URI.

    Each page is handled on a best-effort basis: any exception raised while
    querying, decoding or writing is printed and the loop moves on to the
    next page. The script sleeps TIMEOUT_SECONDS after every page, whether it
    succeeded or not.
    """
    # The client configuration is identical for every page; build it once.
    sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
    sparql.setReturnFormat(TYPE)

    for page in range(INITIAL_OFFSET, MAX_PAGES):
        print(f"Starting to get page {page}")

        page_query = _build_page_query(page)
        print(f"\nCurrent Query:\n{page_query}\n")
        sparql.setQuery(page_query)

        try:
            text = _extract_rows(sparql.queryAndConvert(), page)

            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(text)

        except Exception as ex:
            # Per-page boundary: report the failure and keep paging.
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        sleep(TIMEOUT_SECONDS)


main()