Updated fetchdata so it can be used from the terminal
Changes: - it can now be invoked as if it were a CLI command. Missing: - documentation.
This commit is contained in:
parent
b74b7ac4f0
commit
e32444df75
@ -1,56 +1,100 @@
|
|||||||
|
import argparse
|
||||||
from math import floor
|
from math import floor
|
||||||
|
import sys
|
||||||
from time import sleep
|
from time import sleep
|
||||||
import SPARQLWrapper
|
import SPARQLWrapper
|
||||||
import requests
|
|
||||||
|
|
||||||
|
|
||||||
BASE_URL = "https://dbpedia.org/sparql"
|
class ProgramData:
    """Runtime configuration for the SPARQL data fetcher.

    Bundles every value gathered from the command line so the fetch loop
    can run from this one object instead of module-level constants.
    """

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        # Path of the output file results are appended to.
        self.local_url = local_url
        # Path of the file holding the SPARQL query text.
        self.query_url = query_url
        # Endpoint URL the query is sent to.
        self.sparql_url = sparql_url
        # SPARQLWrapper return format (e.g. SPARQLWrapper.CSV).
        self.output_type = output_type
        # Row offset to resume fetching from.
        self.initial_offset = initial_offset
        # Seconds slept between successive page requests.
        self.timeout = timeout
        # Rows fetched per page.
        self.limit = limit
        # Upper bound on the number of pages fetched.
        self.max_pages = max_pages
        # Count of -v flags passed on the command line.
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        """Base offset of the paging scheme.

        NOTE(review): this intentionally equals ``limit`` (one page), mirroring
        the former ``OFFSET = LIMIT`` module constant — it is NOT
        ``initial_offset``. Confirm before "fixing".
        """
        return self.limit

    @property
    def query(self):
        """Read and return the SPARQL query text from ``query_url``.

        The file is re-read on every access so edits to the query file are
        picked up between pages.
        """
        # Explicit UTF-8: the dataset writer already uses encoding="utf-8",
        # and relying on the platform default encoding is a portability bug.
        with open(self.query_url, "r", encoding="utf-8") as file:
            return file.read()
|
||||||
|
|
||||||
|
|
||||||
|
# Defaults for the fetcher; each of these can be overridden from the
# command line (see gather_cli_args).
DBPEDIA_URL = "https://dbpedia.org/sparql"

# Return format requested from the endpoint.
TYPE = SPARQLWrapper.CSV

# Pause between successive page requests, in seconds.
TIMEOUT_SECONDS = 1.5

# Rows fetched per page.
LIMIT = 10_000

# Row offset to start from (0 = beginning of the result set).
INITIAL_OFFSET = 0

# Hard cap on the number of pages fetched.
MAX_PAGES = 1_000_000_000
|
def gather_cli_args(args: list[str]) -> ProgramData:
    """Parse command-line arguments into a ``ProgramData`` instance.

    Unknown arguments are ignored (``parse_known_args``), so passing the
    full ``sys.argv`` — program name included — is safe.
    """
    # TODO: Add argument for type
    parser = argparse.ArgumentParser("sparql data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str)
    parser.add_argument("--url", type=str, default=DBPEDIA_URL)
    parser.add_argument("--limit", type=int, default=LIMIT)
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES)
    parser.add_argument("--verbose", "-v", action="count", default=0)

    known, _unknown = parser.parse_known_args(args)

    return ProgramData(
        known.file_path,
        known.query_file,
        known.url,
        SPARQLWrapper.CSV,  # output format is fixed to CSV for now
        known.offset,
        known.timeout,
        known.limit,
        known.max_pages,
        known.verbose,
    )
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def fetch_data(DATA: ProgramData):
|
||||||
|
|
||||||
|
# Take correction of page into account
|
||||||
|
page = int(floor(DATA.initial_offset / DATA.limit)) - 1
|
||||||
exit = False
|
exit = False
|
||||||
page = int(floor(INITIAL_OFFSET / LIMIT)) -1
|
|
||||||
|
|
||||||
while not exit:
|
while not exit:
|
||||||
|
|
||||||
print(f"Starting to get page {page}")
|
print(f"Starting to get page {page}")
|
||||||
|
|
||||||
CURRENT_OFFSET = int(OFFSET + (page * LIMIT))
|
CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
|
||||||
sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
|
sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
|
||||||
|
|
||||||
sparql.setReturnFormat(TYPE)
|
sparql.setReturnFormat(TYPE)
|
||||||
|
|
||||||
CURRENT_PAGE_QUERY = "\n".join([
|
CURRENT_PAGE_QUERY = "\n".join([
|
||||||
QUERY,
|
DATA.query,
|
||||||
f"LIMIT {LIMIT}",
|
f"LIMIT {LIMIT}",
|
||||||
f"OFFSET {CURRENT_OFFSET}"
|
f"OFFSET {CURRENT_OFFSET}"
|
||||||
])
|
])
|
||||||
@ -77,9 +121,9 @@ def main():
|
|||||||
exit = True
|
exit = True
|
||||||
continue
|
continue
|
||||||
|
|
||||||
with open(FILE_URI, "a+", encoding="utf-8") as dataset:
|
with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
|
||||||
|
|
||||||
print(f"Writing page {page} on {FILE_URI}")
|
print(f"Writing page {page} on {DATA.local_url}")
|
||||||
dataset.write(
|
dataset.write(
|
||||||
text
|
text
|
||||||
)
|
)
|
||||||
@ -96,4 +140,7 @@ def main():
|
|||||||
|
|
||||||
sleep(TIMEOUT_SECONDS)
|
sleep(TIMEOUT_SECONDS)
|
||||||
|
|
||||||
main()
|
|
||||||
|
if __name__ == "__main__":
    # Build the configuration from the command line, then run the fetch loop.
    program_data = gather_cli_args(sys.argv)
    fetch_data(program_data)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user