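"""Fetch paginated results from a SPARQL endpoint (DBpedia by default)
and append each page of CSV output to a local file."""
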
import argparse
import sys
from time import sleep

import SPARQLWrapper


class ProgramData:

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        self.local_url = local_url
        self.query_url = query_url
        self.sparql_url = sparql_url
        self.output_type = output_type
        self.initial_offset = initial_offset
        self.timeout = timeout
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        # Effective starting offset: the requested initial offset rounded
        # down to a multiple of the page size, so pages stay aligned.
        return (self.initial_offset // self.limit) * self.limit

    @property
    def query(self):
        # Re-read the query file on every access so edits made between
        # pages are picked up.
        with open(self.query_url, "r") as file:
            return file.read()
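
# A minimal query file for --query (hypothetical example). The paging
# loop appends LIMIT/OFFSET clauses, so the file itself must not
# contain them:
#
#   SELECT DISTINCT ?person ?name WHERE {
#       ?person a dbo:Person ;
#               rdfs:label ?name .
#   }
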
DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)
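
# Note: the 10,000-row default limit is meant to stay under the result
# cap that public endpoints such as DBpedia typically enforce per query;
# MAX_PAGES is just a practically unreachable upper bound.

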
def gather_cli_args(args: list[str]) -> ProgramData:
    # TODO: Add a CLI argument for the output type
    parser = argparse.ArgumentParser("sparql data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str)
    parser.add_argument("--url", type=str, default=DBPEDIA_URL)
    parser.add_argument("--limit", type=int, default=LIMIT)
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES)
    parser.add_argument("--verbose", "-v", action="count", default=0)

    # Ignore unrecognised arguments instead of failing on them.
    parsed_args, _ = parser.parse_known_args(args)

    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        TYPE,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose,
    )
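

# Example invocation (script and file names are hypothetical):
#
#   python fetcher.py --output people.csv --query people.rq \
#       --limit 5000 --timeout 2 --max-pages 10 -vv

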
def fetch_data(data: ProgramData):
    # Page index of the first page to fetch; data.offset is already
    # aligned to a multiple of the page size.
    page = data.offset // data.limit
    done = False

    while not done:
        print(f"Starting to get page {page}")

        current_offset = page * data.limit
        sparql = SPARQLWrapper.SPARQLWrapper(data.sparql_url)
        sparql.setReturnFormat(data.output_type)

        # Append the paging clauses for this page to the user's query.
        current_page_query = "\n".join([
            data.query,
            f"LIMIT {data.limit}",
            f"OFFSET {current_offset}",
        ])
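
        # With the default limit of 10,000, page 2 of the query would be
        # sent with the clauses (sketch):
        #
        #   LIMIT 10000
        #   OFFSET 20000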

        print(f"\nCurrent Query:\n{current_page_query}\n")

        sparql.setQuery(current_page_query)

        try:
            res = sparql.queryAndConvert()
            text = ""

            if isinstance(res, bytes):
                # Keep the CSV header row only for the first page so it
                # is not repeated in the middle of the output file.
                first_line = 0 if page == 0 else 1
                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[first_line:])

            # An empty page means the endpoint ran out of results.
            if text == "":
                done = True
                continue

            with open(data.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {data.local_url}")
                dataset.write(text)

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {data.timeout}")

        page += 1

        if page == data.max_pages:
            done = True

        # Be polite to the endpoint between pages.
        sleep(data.timeout)


if __name__ == "__main__":
    DATA = gather_cli_args(sys.argv[1:])
    fetch_data(DATA)
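
# Note: --offset is rounded down to a page boundary and the output file
# is opened in append mode, so resuming a run may re-fetch and re-append
# rows from the page containing that offset.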