Updated fetchdata so it can be run from the terminal

Changes:
  - it can now be invoked like a CLI command

Missing:
  - documentation
This commit is contained in:
Christian Risi 2025-09-19 12:35:15 +02:00
parent b74b7ac4f0
commit e32444df75

View File

@ -1,56 +1,100 @@
import argparse
from math import floor from math import floor
import sys
from time import sleep from time import sleep
import SPARQLWrapper import SPARQLWrapper
import requests
BASE_URL = "https://dbpedia.org/sparql" class ProgramData:
def __init__(
self,
local_url,
query_url,
sparql_url,
output_type,
initial_offset,
timeout,
limit,
max_pages,
verbosity_level,
) -> None:
self.local_url = local_url
self.query_url = query_url
self.sparql_url = sparql_url
self.output_type = output_type
self.initial_offset = initial_offset
self.timeout = timeout
self.limit = limit
self.max_pages = max_pages
self.verbosity_level = verbosity_level
@property
def offset(self):
return self.limit
@property
def query(self):
with open(self.query_url, "r") as file:
return file.read()
DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5 TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4) LIMIT = int(1E4)
OFFSET = LIMIT INITIAL_OFFSET = 0
INITIAL_OFFSET = 15200000
MAX_PAGES = int(1E9) MAX_PAGES = int(1E9)
# Missing page 13220000
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv" def gather_cli_args(args: list[str]) -> ProgramData:
QUERY = """ # TODO: Add argument for type
PREFIX dbo: <http://dbpedia.org/ontology/> PARSER = argparse.ArgumentParser("sparql data fetcher")
PREFIX dbp: <http://dbpedia.org/property/> PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
PREFIX dbr: <http://dbpedia.org/resource/> PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
PREFIX foaf: <http://xmlns.com/foaf/0.1/> PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
PARSER.add_argument("--limit", type=int, default=LIMIT)
PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
PARSER.add_argument("--verbose", "-v", action="count", default=0)
SELECT ?subject, ?relationship, ?object parsed_args, _ = PARSER.parse_known_args(args)
WHERE {
?subject ?relationship ?object . return ProgramData(
?subject rdf:type dbo:Film . parsed_args.file_path,
?a foaf:primaryTopic ?subject parsed_args.query_file,
FILTER (?relationship NOT IN ( parsed_args.url,
dbo:wikiPageRedirects, SPARQLWrapper.CSV,
dbo:wikiPageExternalLink, parsed_args.offset,
dbo:wikiPageWikiLink parsed_args.timeout,
)) parsed_args.limit,
}""" parsed_args.max_pages,
parsed_args.verbose
)
# type: ignore
def main(): def fetch_data(DATA: ProgramData):
# Take correction of page into account
page = int(floor(DATA.initial_offset / DATA.limit)) - 1
exit = False exit = False
page = int(floor(INITIAL_OFFSET / LIMIT)) -1
while not exit: while not exit:
print(f"Starting to get page {page}") print(f"Starting to get page {page}")
CURRENT_OFFSET = int(OFFSET + (page * LIMIT)) CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL) sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
sparql.setReturnFormat(TYPE) sparql.setReturnFormat(TYPE)
CURRENT_PAGE_QUERY = "\n".join([ CURRENT_PAGE_QUERY = "\n".join([
QUERY, DATA.query,
f"LIMIT {LIMIT}", f"LIMIT {LIMIT}",
f"OFFSET {CURRENT_OFFSET}" f"OFFSET {CURRENT_OFFSET}"
]) ])
@ -77,9 +121,9 @@ def main():
exit = True exit = True
continue continue
with open(FILE_URI, "a+", encoding="utf-8") as dataset: with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
print(f"Writing page {page} on {FILE_URI}") print(f"Writing page {page} on {DATA.local_url}")
dataset.write( dataset.write(
text text
) )
@ -96,4 +140,7 @@ def main():
sleep(TIMEOUT_SECONDS) sleep(TIMEOUT_SECONDS)
main()
if __name__ == "__main__":
DATA = gather_cli_args(sys.argv)
fetch_data(DATA)