Updated fetchdata so it can be used from the terminal
Changes:
- fetchdata can now be invoked as a CLI command

Missing:
- documentation
parent b74b7ac4f0
commit e32444df75
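Based on the argument parser introduced in the diff below, a typical invocation would look something like this (fetchdata.py and query.sparql are assumed names; only the output and query-file arguments are required, everything else has a default):

    python fetchdata.py -o ./Assets/Dataset/1-hop/dataset.csv -q ./query.sparql --limit 10000 -v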
@@ -1,56 +1,100 @@
+import argparse
 from math import floor
+import sys
 from time import sleep

 import SPARQLWrapper
 import requests


-BASE_URL = "https://dbpedia.org/sparql"
+class ProgramData:
+
+    def __init__(
+        self,
+        local_url,
+        query_url,
+        sparql_url,
+        output_type,
+        initial_offset,
+        timeout,
+        limit,
+        max_pages,
+        verbosity_level,
+    ) -> None:
+        self.local_url = local_url
+        self.query_url = query_url
+        self.sparql_url = sparql_url
+        self.output_type = output_type
+        self.initial_offset = initial_offset
+        self.timeout = timeout
+        self.limit = limit
+        self.max_pages = max_pages
+        self.verbosity_level = verbosity_level
+
+    @property
+    def offset(self):
+        return self.limit
+
+    @property
+    def query(self):
+        with open(self.query_url, "r") as file:
+            return file.read()
+
+
 DBPEDIA_URL = "https://dbpedia.org/sparql"
 TYPE = SPARQLWrapper.CSV
 TIMEOUT_SECONDS = 1.5
 LIMIT = int(1E4)
 OFFSET = LIMIT
-INITIAL_OFFSET = 15200000
+INITIAL_OFFSET = 0
 MAX_PAGES = int(1E9)

 # Missing page 13220000

-FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
-
-QUERY = """
-PREFIX dbo: <http://dbpedia.org/ontology/>
-PREFIX dbp: <http://dbpedia.org/property/>
-PREFIX dbr: <http://dbpedia.org/resource/>
-PREFIX foaf: <http://xmlns.com/foaf/0.1/>
-
-SELECT ?subject, ?relationship, ?object
-WHERE {
-    ?subject ?relationship ?object .
-    ?subject rdf:type dbo:Film .
-    ?a foaf:primaryTopic ?subject
-    FILTER (?relationship NOT IN (
-        dbo:wikiPageRedirects,
-        dbo:wikiPageExternalLink,
-        dbo:wikiPageWikiLink
-    ))
-}"""
+def gather_cli_args(args: list[str]) -> ProgramData:
+    # TODO: Add argument for type
+    PARSER = argparse.ArgumentParser("sparql data fetcher")
+    PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
+    PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
+    PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
+    PARSER.add_argument("--limit", type=int, default=LIMIT)
+    PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
+    PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
+    PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
+    PARSER.add_argument("--verbose", "-v", action="count", default=0)
+
+    parsed_args, _ = PARSER.parse_known_args(args)
+
+    return ProgramData(
+        parsed_args.file_path,
+        parsed_args.query_file,
+        parsed_args.url,
+        SPARQLWrapper.CSV,
+        parsed_args.offset,
+        parsed_args.timeout,
+        parsed_args.limit,
+        parsed_args.max_pages,
+        parsed_args.verbose
+    )
+    # type: ignore


-def main():
+def fetch_data(DATA: ProgramData):

+    # Take correction of page into account
+    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
     exit = False
-    page = int(floor(INITIAL_OFFSET / LIMIT)) -1

     while not exit:

         print(f"Starting to get page {page}")

-        CURRENT_OFFSET = int(OFFSET + (page * LIMIT))
-        sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
+        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
+        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)

         sparql.setReturnFormat(TYPE)

         CURRENT_PAGE_QUERY = "\n".join([
-            QUERY,
+            DATA.query,
             f"LIMIT {LIMIT}",
             f"OFFSET {CURRENT_OFFSET}"
         ])
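A note on the page arithmetic in the hunk above: ProgramData.offset simply mirrors limit, and fetch_data starts at page = floor(initial_offset / limit) - 1, so the first CURRENT_OFFSET works out to initial_offset itself. For example, with the defaults limit = 10000 and initial_offset = 0: page = -1 and CURRENT_OFFSET = 10000 + (-1 * 10000) = 0. That "- 1" is the page correction the new comment refers to, compensating for offset already contributing one page worth of rows.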
@@ -77,9 +121,9 @@ def main():
             exit = True
             continue

-        with open(FILE_URI, "a+", encoding="utf-8") as dataset:
+        with open(DATA.local_url, "a+", encoding="utf-8") as dataset:

-            print(f"Writing page {page} on {FILE_URI}")
+            print(f"Writing page {page} on {DATA.local_url}")
             dataset.write(
                 text
             )
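One observation on the hunk above (not something this commit changes): the dataset file is opened in "a+" mode, so every page is appended to a single CSV. Since a SPARQL CSV response begins with a header row naming the selected variables, appending raw responses will most likely repeat that header between pages.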
@@ -96,4 +140,7 @@ def main():

         sleep(TIMEOUT_SECONDS)

-main()
+
+if __name__ == "__main__":
+    DATA = gather_cli_args(sys.argv)
+    fetch_data(DATA)
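A detail worth noting about the new entry point: gather_cli_args receives the full sys.argv, script name included, and relies on parse_known_args to silently discard anything the parser does not recognize. A minimal sketch of driving the same code from Python rather than the shell (fetchdata is an assumed module name and query.sparql a placeholder file):

    import fetchdata

    # Build ProgramData exactly as the CLI would, from argv-style strings.
    data = fetchdata.gather_cli_args([
        "--output", "./dataset.csv",
        "--query", "./query.sparql",
        "--limit", "100",
    ])
    print(data.limit, data.offset)  # offset mirrors limit via the property
    fetchdata.fetch_data(data)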