diff --git a/Scripts/DataGathering/fetchdata.py b/Scripts/DataGathering/fetchdata.py
index e1cbf7a..ca72a36 100644
--- a/Scripts/DataGathering/fetchdata.py
+++ b/Scripts/DataGathering/fetchdata.py
@@ -1,56 +1,100 @@
+import argparse
from math import floor
+import sys
from time import sleep
import SPARQLWrapper
-import requests
-BASE_URL = "https://dbpedia.org/sparql"
+
+
+class ProgramData:
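+    """Runtime configuration for the fetcher, assembled from CLI arguments."""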
+
+ def __init__(
+ self,
+ local_url,
+ query_url,
+ sparql_url,
+ output_type,
+ initial_offset,
+ timeout,
+ limit,
+ max_pages,
+ verbosity_level,
+    ) -> None:
+ self.local_url = local_url
+ self.query_url = query_url
+ self.sparql_url = sparql_url
+ self.output_type = output_type
+ self.initial_offset = initial_offset
+ self.timeout = timeout
+ self.limit = limit
+ self.max_pages = max_pages
+ self.verbosity_level = verbosity_level
+
+ @property
+ def offset(self):
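+        # Base paging offset: fetch_data starts `page` one below the initial
+        # page and requests `offset + page * limit`, so returning `limit`
+        # reproduces the old module-level `OFFSET = LIMIT` behaviour.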
+ return self.limit
+
+ @property
+    def query(self):
+        # Re-read the SPARQL query text from disk on each access.
+        with open(self.query_url, "r") as file:
+            return file.read()
+
+
+DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
-OFFSET = LIMIT
-INITIAL_OFFSET = 15200000
+INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)
-# Missing page 13220000
-FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
+
+
+def gather_cli_args(args: list[str]) -> ProgramData:
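+    """Parse CLI flags (program name excluded) into a ProgramData instance."""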
-QUERY = """
-PREFIX dbo: <http://dbpedia.org/ontology/>
-PREFIX dbp: <http://dbpedia.org/property/>
-PREFIX dbr: <http://dbpedia.org/resource/>
-PREFIX foaf: <http://xmlns.com/foaf/0.1/>
+ # TODO: Add argument for type
+    PARSER = argparse.ArgumentParser(description="SPARQL data fetcher")
+ PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
+ PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
+ PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
+ PARSER.add_argument("--limit", type=int, default=LIMIT)
+ PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
+ PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
+ PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
+ PARSER.add_argument("--verbose", "-v", action="count", default=0)
-SELECT ?subject, ?relationship, ?object
-WHERE {
- ?subject ?relationship ?object .
- ?subject rdf:type dbo:Film .
- ?a foaf:primaryTopic ?subject
- FILTER (?relationship NOT IN (
- dbo:wikiPageRedirects,
- dbo:wikiPageExternalLink,
- dbo:wikiPageWikiLink
- ))
-}"""
+ parsed_args, _ = PARSER.parse_known_args(args)
+
+ return ProgramData(
+ parsed_args.file_path,
+ parsed_args.query_file,
+ parsed_args.url,
+        TYPE,
+ parsed_args.offset,
+ parsed_args.timeout,
+ parsed_args.limit,
+ parsed_args.max_pages,
+ parsed_args.verbose
+ )
-def main():
+def fetch_data(DATA: ProgramData):
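+    """Page through the endpoint, appending each CSV page to the output file."""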
+    # Start one page back: the offset property contributes one extra limit,
+    # so the first request lands at floor(initial_offset / limit) * limit.
+    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
exit = False
- page = int(floor(INITIAL_OFFSET / LIMIT)) -1
while not exit:
print(f"Starting to get page {page}")
- CURRENT_OFFSET = int(OFFSET + (page * LIMIT))
- sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
+ CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
+ sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
-        sparql.setReturnFormat(TYPE)
+        sparql.setReturnFormat(DATA.output_type)
CURRENT_PAGE_QUERY = "\n".join([
- QUERY,
+ DATA.query,
f"LIMIT {LIMIT}",
f"OFFSET {CURRENT_OFFSET}"
])
@@ -77,9 +121,9 @@ def main():
exit = True
continue
- with open(FILE_URI, "a+", encoding="utf-8") as dataset:
+ with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
- print(f"Writing page {page} on {FILE_URI}")
+ print(f"Writing page {page} on {DATA.local_url}")
dataset.write(
text
)
@@ -96,4 +140,7 @@ def main():
-        sleep(TIMEOUT_SECONDS)
+        sleep(DATA.timeout)
-main()
+
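+# Example invocation (paths and query file name are illustrative):
+#   python fetchdata.py -o ./Assets/Dataset/1-hop/dataset.csv \
+#       -q ./queries/films.sparql --offset 15200000 -v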
+if __name__ == "__main__":
+    DATA = gather_cli_args(sys.argv[1:])
+ fetch_data(DATA)