Updated file

This commit is contained in:
Christian Risi 2025-09-18 12:03:09 +02:00
parent ca6143ea3c
commit 9838e287a4

View File

@ -5,11 +5,11 @@ import requests
BASE_URL = "https://dbpedia.org/sparql" BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = int(10) TIMEOUT_SECONDS = 1.5
LIMIT = int(1E6) LIMIT = int(1E4)
OFFSET = LIMIT OFFSET = LIMIT
INITIAL_OFFSET = 0 INITIAL_OFFSET = 0
MAX_PAGES = int(1E2) MAX_PAGES = int(1E9)
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv" FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
@ -17,19 +17,27 @@ QUERY = """
PREFIX dbo: <http://dbpedia.org/ontology/> PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/> PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/> PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object SELECT ?subject, ?relationship, ?object
WHERE { WHERE {
?subject ?relationship ?object . ?subject ?relationship ?object .
?object rdf:type dbo:Film . ?subject rdf:type dbo:Film .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>) ?a foaf:primaryTopic ?subject
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink
))
}""" }"""
def main(): def main():
for page in range(INITIAL_OFFSET, MAX_PAGES): exit = False
page = INITIAL_OFFSET
while not exit:
print(f"Starting to get page {page}") print(f"Starting to get page {page}")
@ -62,10 +70,13 @@ def main():
lines = res.decode("utf-8", "ignore").split("\n") lines = res.decode("utf-8", "ignore").split("\n")
text = "\n".join(lines[initial_offset:]) text = "\n".join(lines[initial_offset:])
if text == "":
exit = True
continue
with open(FILE_URI, "a+", encoding="utf-8") as dataset: with open(FILE_URI, "a+", encoding="utf-8") as dataset:
print(f"Writing page {page} on {FILE_URI}") print(f"Writing page {page} on {FILE_URI}")
dataset.write( dataset.write(
text text
) )
@ -74,6 +85,12 @@ def main():
print(f"Something went wrong during page {page}:\n\t{ex}") print(f"Something went wrong during page {page}:\n\t{ex}")
print(f"Sleeping for {TIMEOUT_SECONDS}") print(f"Sleeping for {TIMEOUT_SECONDS}")
page += 1
if page == MAX_PAGES - 1:
exit = True
sleep(TIMEOUT_SECONDS) sleep(TIMEOUT_SECONDS)
main() main()