diff --git a/Script/fetchdata.py b/Script/fetchdata.py
new file mode 100644
index 0000000..086af7a
--- /dev/null
+++ b/Script/fetchdata.py
@@ -0,0 +1,75 @@
+from time import sleep
+
+import SPARQLWrapper
+
+
+# DBpedia public SPARQL endpoint and paging parameters.
+BASE_URL = "https://dbpedia.org/sparql"
+RETURN_FORMAT = SPARQLWrapper.CSV
+TIMEOUT_SECONDS = 10
+LIMIT = int(1E6)    # rows requested per page
+OFFSET = LIMIT      # offset step between consecutive pages
+INITIAL_PAGE = 0
+MAX_PAGES = int(1E2)
+
+FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
+
+QUERY = """
+PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
+PREFIX dbo: <http://dbpedia.org/ontology/>
+PREFIX dbp: <http://dbpedia.org/property/>
+PREFIX dbr: <http://dbpedia.org/resource/>
+
+SELECT ?subject ?relationship ?object
+WHERE {
+    ?subject ?relationship ?object .
+    ?object rdf:type dbo:Film .
+    FILTER (?relationship != )
+}"""
+
+
+def main():
+
+    sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
+    sparql.setReturnFormat(RETURN_FORMAT)
+
+    for page in range(INITIAL_PAGE, MAX_PAGES):
+
+        print(f"Starting to get page {page}")
+
+        current_offset = OFFSET * page
+
+        # Append the paging clauses to the base query for this page.
+        current_page_query = "\n".join([
+            QUERY,
+            f"LIMIT {LIMIT}",
+            f"OFFSET {current_offset}",
+        ])
+
+        print(f"\nCurrent Query:\n{current_page_query}\n")
+
+        sparql.setQuery(current_page_query)
+
+        try:
+            res = sparql.queryAndConvert()
+
+            if isinstance(res, bytes):
+                # Keep the CSV header only for the first page so the
+                # appended file ends up with a single header row.
+                lines = res.decode("utf-8", "ignore").split("\n")
+                first_line = 0 if page == 0 else 1
+                text = "\n".join(lines[first_line:])
+
+                with open(FILE_URI, "a+", encoding="utf-8") as dataset:
+                    print(f"Writing page {page} to {FILE_URI}")
+                    dataset.write(text)
+
+        except Exception as ex:
+            print(f"Something went wrong during page {page}:\n\t{ex}")
+
+        print(f"Sleeping for {TIMEOUT_SECONDS} seconds")
+        sleep(TIMEOUT_SECONDS)
+
+
+if __name__ == "__main__":
+    main()
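
For reference, a minimal sketch of how the appended CSV could be sanity-checked after a run. The output path and the subject/relationship/object columns are taken from the script above; the row-count and preview logic are purely illustrative and not part of the patch.

    import csv

    # Path copied from FILE_URI in fetchdata.py above; assumes the script
    # has already been run and produced a non-empty file.
    DATASET_PATH = "./Assets/Dataset/1-hop/dataset.csv"

    with open(DATASET_PATH, newline="", encoding="utf-8") as handle:
        reader = csv.reader(handle)
        header = next(reader)                    # subject, relationship, object
        rows = [row for row in reader if row]    # skip blank lines between pages

    print(f"Columns: {header}")
    print(f"Triples fetched so far: {len(rows)}")
    for subject, relationship, obj in rows[:5]:  # preview a few triples
        print(subject, relationship, obj)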