# NanoSocrates/Script/fetchdata.py

from math import floor
from time import sleep
import SPARQLWrapper
import requests
# SPARQL endpoint every page request is sent to.
BASE_URL = "https://dbpedia.org/sparql"
# Result format requested from the endpoint.
TYPE = SPARQLWrapper.CSV
# Pause between two consecutive page requests, in seconds.
TIMEOUT_SECONDS = 1.5
# Number of rows requested per page (used as the SPARQL LIMIT).
LIMIT = int(1E4)
# Base value added to `page * LIMIT` when computing a page's SPARQL OFFSET.
OFFSET = LIMIT
# Result offset the download starts (or resumes) from.
INITIAL_OFFSET = 15200000
# Upper bound on the page counter used by the download loop.
MAX_PAGES = int(1E9)
# Missing page 13220000
# CSV file the downloaded pages are appended to.
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
# One-hop triples whose subject is a dbo:Film that is the primary topic of
# some page, excluding pure wiki-link bookkeeping relationships.
# The rdf: prefix is declared explicitly and the projection list is
# whitespace-separated, so the query does not depend on endpoint-specific
# leniency.
QUERY = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject ?relationship ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink
    ))
}"""
def _fetch_page_text(sparql, page, keep_header, attempts):
    """Run the query currently set on *sparql* and return the rows as text.

    The request is retried up to *attempts* times, sleeping TIMEOUT_SECONDS
    between tries. Returns None when every attempt raised, and "" when the
    endpoint returned something other than bytes or no data rows.
    """
    for attempt in range(1, attempts + 1):
        try:
            res = sparql.queryAndConvert()
        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")
            print(f"Attempt {attempt} of {attempts} failed")
            if attempt < attempts:
                sleep(TIMEOUT_SECONDS)
            continue

        if not isinstance(res, bytes):
            return ""
        lines = res.decode("utf-8", "ignore").split("\n")
        # The first line of every CSV response is the column header.
        return "\n".join(lines if keep_header else lines[1:])

    return None


def main(max_attempts: int = 3) -> None:
    """Download the query results page by page and append them to FILE_URI.

    Paging starts at INITIAL_OFFSET and advances LIMIT rows at a time. The
    loop ends when a page comes back empty or the page counter reaches
    MAX_PAGES - 1. The CSV header row is kept only for the page at offset 0,
    so the output file contains at most one header, at its very top.

    Args:
        max_attempts: How many times a page request is tried before the page
            is given up on. Values below 1 are treated as 1.

    A page that still fails after all attempts is skipped; the offsets of all
    skipped pages are printed when the loop ends so they can be re-fetched.
    """
    attempts = max(1, max_attempts)

    # The client only depends on constants, so build it once.
    sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
    sparql.setReturnFormat(TYPE)

    skipped_offsets = []
    # OFFSET + page * LIMIT must equal INITIAL_OFFSET on the first iteration,
    # hence the -1 (OFFSET is one LIMIT ahead of page 0).
    page = int(floor(INITIAL_OFFSET / LIMIT)) - 1

    while True:
        print(f"Starting to get page {page}")
        current_offset = int(OFFSET + (page * LIMIT))
        current_page_query = "\n".join([
            QUERY,
            f"LIMIT {LIMIT}",
            f"OFFSET {current_offset}",
        ])
        print(f"\nCurrent Query:\n{current_page_query}\n")
        sparql.setQuery(current_page_query)

        # Key the header decision on the real result offset, not on the page
        # counter: the counter is shifted by one relative to the offset.
        text = _fetch_page_text(
            sparql, page, keep_header=(current_offset == 0), attempts=attempts
        )

        if text is None:
            skipped_offsets.append(current_offset)
            print(f"Giving up on page {page} (offset {current_offset})")
        elif text == "":
            # No rows left: the whole result set has been consumed.
            break
        else:
            try:
                with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                    print(f"Writing page {page} on {FILE_URI}")
                    dataset.write(text)
            except OSError as ex:
                skipped_offsets.append(current_offset)
                print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        page += 1
        # >= rather than == so a start page beyond the cap still terminates.
        stop = page >= MAX_PAGES - 1
        sleep(TIMEOUT_SECONDS)
        if stop:
            break

    if skipped_offsets:
        print(f"Pages skipped (offsets): {skipped_offsets}")
# Top-level call: the download starts as soon as this file is executed
# (or imported, since the call is not guarded).
main()