"""Fetch paginated results from a SPARQL endpoint and append them to a local CSV file."""

import argparse
import sys
from time import sleep

import SPARQLWrapper


class ProgramData:
    """Bundles the CLI configuration for one fetching run."""

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        self.local_url = local_url
        self.query_url = query_url
        self.sparql_url = sparql_url
        self.output_type = output_type
        self.initial_offset = initial_offset
        self.timeout = timeout
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def query(self):
        # Re-read the base query from disk on each access.
        with open(self.query_url, "r") as file:
            return file.read()


DBPEDIA_URL = "https://dbpedia.org/sparql"
OUTPUT_TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = 10_000
INITIAL_OFFSET = 0
MAX_PAGES = 1_000_000_000


def gather_cli_args(args: list[str]) -> ProgramData:
    # TODO: Add argument for type
    parser = argparse.ArgumentParser(description="SPARQL data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str,
                        help="path of the local file the results are appended to")
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str,
                        help="path of the file holding the base SPARQL query")
    parser.add_argument("--url", type=str, default=DBPEDIA_URL,
                        help="SPARQL endpoint to query")
    parser.add_argument("--limit", type=int, default=LIMIT,
                        help="page size (rows per request)")
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS,
                        help="seconds to sleep between requests")
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET,
                        help="row offset to resume from")
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES,
                        help="maximum number of pages to fetch")
    parser.add_argument("--verbose", "-v", action="count", default=0)
    parsed_args, _ = parser.parse_known_args(args)
    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        OUTPUT_TYPE,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose,
    )


def fetch_data(data: ProgramData) -> None:
    # Resume from the page that contains the initial offset; paging always
    # starts on a whole multiple of the page size, so any remainder is dropped.
    first_page = data.initial_offset // data.limit
    page = first_page
    done = False
    while not done:
        print(f"Starting to get page {page}")
        current_offset = page * data.limit
        sparql = SPARQLWrapper.SPARQLWrapper(data.sparql_url)
        sparql.setReturnFormat(data.output_type)
        # Append the paging clauses to the base query read from the query file.
        current_page_query = "\n".join([
            data.query,
            f"LIMIT {data.limit}",
            f"OFFSET {current_offset}",
        ])
        if data.verbosity_level > 0:
            print(f"\nCurrent query:\n{current_page_query}\n")
        sparql.setQuery(current_page_query)
        try:
            res = sparql.queryAndConvert()
            text = ""
            if isinstance(res, bytes):
                # Keep the CSV header row only on the first page written.
                first_line = 0 if page == first_page else 1
                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[first_line:])
            if text == "":
                # An empty page means the result set is exhausted.
                done = True
                continue
            with open(data.local_url, "a", encoding="utf-8") as dataset:
                print(f"Writing page {page} to {data.local_url}")
                dataset.write(text)
        except Exception as ex:
            # Log the failure and move on to the next page.
            print(f"Something went wrong during page {page}:\n\t{ex}")
        page += 1
        if page - first_page >= data.max_pages:
            done = True
        print(f"Sleeping for {data.timeout}s")
        sleep(data.timeout)


if __name__ == "__main__":
    # sys.argv[0] is the script name, not an argument.
    fetch_data(gather_cli_args(sys.argv[1:]))
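
# A sketch of how the script might be invoked, assuming it is saved as
# "fetcher.py" and that "people.rq" / "people.csv" are hypothetical input and
# output paths. The base query should carry no LIMIT/OFFSET of its own
# (fetch_data appends them per page), and an ORDER BY is advisable so that
# OFFSET-based paging walks the results in a stable order:
#
#   $ python fetcher.py --output people.csv --query people.rq \
#         --url https://dbpedia.org/sparql --limit 10000 --max-pages 5 -v
#
# where people.rq could contain, for instance (using prefixes such as dbo:
# and rdfs: that the DBpedia endpoint predefines):
#
#   SELECT DISTINCT ?person ?name WHERE {
#       ?person a dbo:Person ;
#               rdfs:label ?name .
#       FILTER (lang(?name) = "en")
#   }
#   ORDER BY ?person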