diff --git a/Scripts/DataGathering/fetchdata.py b/Scripts/DataGathering/fetchdata.py
index e1cbf7a..ca72a36 100644
--- a/Scripts/DataGathering/fetchdata.py
+++ b/Scripts/DataGathering/fetchdata.py
@@ -1,56 +1,100 @@
+import argparse
 from math import floor
+import sys
 from time import sleep
 
 import SPARQLWrapper
-import requests
-BASE_URL = "https://dbpedia.org/sparql"
+
+
+class ProgramData:
+
+    def __init__(
+        self,
+        local_url,
+        query_url,
+        sparql_url,
+        output_type,
+        initial_offset,
+        timeout,
+        limit,
+        max_pages,
+        verbosity_level,
+    ) -> None:
+
+        self.local_url = local_url
+        self.query_url = query_url
+        self.sparql_url = sparql_url
+        self.output_type = output_type
+        self.initial_offset = initial_offset
+        self.timeout = timeout
+        self.limit = limit
+        self.max_pages = max_pages
+        self.verbosity_level = verbosity_level
+
+    @property
+    def offset(self):
+        return self.limit
+
+    @property
+    def query(self):
+
+        with open(self.query_url, "r") as file:
+            return file.read()
+
+
+DBPEDIA_URL = "https://dbpedia.org/sparql"
 TYPE = SPARQLWrapper.CSV
 TIMEOUT_SECONDS = 1.5
 LIMIT = int(1E4)
-OFFSET = LIMIT
-INITIAL_OFFSET = 15200000
+INITIAL_OFFSET = 0
 MAX_PAGES = int(1E9)
-# Missing page 13220000
-FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
+
+
+def gather_cli_args(args: list[str]) -> ProgramData:
 
-QUERY = """
-PREFIX dbo: <http://dbpedia.org/ontology/>
-PREFIX dbp: <http://dbpedia.org/property/>
-PREFIX dbr: <http://dbpedia.org/resource/>
-PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+    # TODO: Add argument for type
+    PARSER = argparse.ArgumentParser("sparql data fetcher")
+    PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
+    PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
+    PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
+    PARSER.add_argument("--limit", type=int, default=LIMIT)
+    PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
+    PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
+    PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
+    PARSER.add_argument("--verbose", "-v", action="count", default=0)
 
-SELECT ?subject, ?relationship, ?object
-WHERE {
-    ?subject ?relationship ?object .
-    ?subject rdf:type dbo:Film .
-    ?a foaf:primaryTopic ?subject
-    FILTER (?relationship NOT IN (
-        dbo:wikiPageRedirects,
-        dbo:wikiPageExternalLink,
-        dbo:wikiPageWikiLink
-    ))
-}"""
+    parsed_args, _ = PARSER.parse_known_args(args)
+
+    return ProgramData(
+        parsed_args.file_path,
+        parsed_args.query_file,
+        parsed_args.url,
+        SPARQLWrapper.CSV,
+        parsed_args.offset,
+        parsed_args.timeout,
+        parsed_args.limit,
+        parsed_args.max_pages,
+        parsed_args.verbose
+    )
+    # type: ignore
 
 
-def main():
+def fetch_data(DATA: ProgramData):
+    # Take the page correction into account
+    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
     exit = False
-    page = int(floor(INITIAL_OFFSET / LIMIT)) -1
     while not exit:
         print(f"Starting to get page {page}")
-        CURRENT_OFFSET = int(OFFSET + (page * LIMIT))
-        sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
-        sparql.setReturnFormat(TYPE)
+        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
+        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
+        sparql.setReturnFormat(DATA.output_type)
         CURRENT_PAGE_QUERY = "\n".join([
-            QUERY,
-            f"LIMIT {LIMIT}",
+            DATA.query,
+            f"LIMIT {DATA.limit}",
             f"OFFSET {CURRENT_OFFSET}"
         ])
@@ -77,9 +121,9 @@ def main():
             exit = True
             continue
 
-        with open(FILE_URI, "a+", encoding="utf-8") as dataset:
+        with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
 
-            print(f"Writing page {page} on {FILE_URI}")
+            print(f"Writing page {page} to {DATA.local_url}")
             dataset.write(
                 text
             )
@@ -96,4 +140,7 @@ def main():
-        sleep(TIMEOUT_SECONDS)
+        sleep(DATA.timeout)
 
 
-main()
+
+if __name__ == "__main__":
+    DATA = gather_cli_args(sys.argv[1:])
+    fetch_data(DATA)
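
Reviewer notes (not part of the patch):

With this change the inline QUERY constant is gone and the SPARQL text is read from the file passed via --query-file. A minimal sketch of such a file, lifted from the removed constant (the path below is hypothetical; any SELECT query should work as long as LIMIT/OFFSET clauses can be appended after it):

    # ./Scripts/DataGathering/film_triples.sparql  (hypothetical path)
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>
    PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>

    SELECT ?subject, ?relationship, ?object
    WHERE {
        ?subject ?relationship ?object .
        ?subject rdf:type dbo:Film .
        ?a foaf:primaryTopic ?subject
        FILTER (?relationship NOT IN (
            dbo:wikiPageRedirects,
            dbo:wikiPageExternalLink,
            dbo:wikiPageWikiLink
        ))
    }

Example invocation under those assumptions (the script appends each fetched page to the output file):

    python Scripts/DataGathering/fetchdata.py \
        --output ./Assets/Dataset/1-hop/dataset.csv \
        --query ./Scripts/DataGathering/film_triples.sparql \
        --limit 10000 --offset 15200000 -v

On the offset arithmetic: ProgramData.offset returns limit (the old OFFSET = LIMIT constant), and fetch_data starts at page = floor(initial_offset / limit) - 1, so the first request uses CURRENT_OFFSET = limit + page * limit = floor(initial_offset / limit) * limit. With --offset 15200000 and --limit 10000 that gives page 1519 and CURRENT_OFFSET 15200000, i.e. fetching resumes exactly at the requested offset.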