From 9838e287a4e56c873a02b14d95fe2c84cfe0b853 Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Thu, 18 Sep 2025 12:03:09 +0200 Subject: [PATCH] Updated file --- Script/fetchdata.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/Script/fetchdata.py b/Script/fetchdata.py index 086af7a..7573751 100644 --- a/Script/fetchdata.py +++ b/Script/fetchdata.py @@ -5,11 +5,11 @@ import requests BASE_URL = "https://dbpedia.org/sparql" TYPE = SPARQLWrapper.CSV -TIMEOUT_SECONDS = int(10) -LIMIT = int(1E6) +TIMEOUT_SECONDS = 1.5 +LIMIT = int(1E4) OFFSET = LIMIT INITIAL_OFFSET = 0 -MAX_PAGES = int(1E2) +MAX_PAGES = int(1E9) FILE_URI = "./Assets/Dataset/1-hop/dataset.csv" @@ -17,19 +17,27 @@ QUERY = """ PREFIX dbo: PREFIX dbp: PREFIX dbr: +PREFIX foaf: SELECT ?subject, ?relationship, ?object WHERE { ?subject ?relationship ?object . - ?object rdf:type dbo:Film . - FILTER (?relationship != ) + ?subject rdf:type dbo:Film . + ?a foaf:primaryTopic ?subject + FILTER (?relationship NOT IN ( + dbo:wikiPageRedirects, + dbo:wikiPageExternalLink, + dbo:wikiPageWikiLink + )) }""" - def main(): - for page in range(INITIAL_OFFSET, MAX_PAGES): + exit = False + page = INITIAL_OFFSET + + while not exit: print(f"Starting to get page {page}") @@ -62,10 +70,13 @@ def main(): lines = res.decode("utf-8", "ignore").split("\n") text = "\n".join(lines[initial_offset:]) + if text == "": + exit = True + continue + with open(FILE_URI, "a+", encoding="utf-8") as dataset: print(f"Writing page {page} on {FILE_URI}") - dataset.write( text ) @@ -74,6 +85,12 @@ def main(): print(f"Something went wrong during page {page}:\n\t{ex}") print(f"Sleeping for {TIMEOUT_SECONDS}") + + page += 1 + + if page == MAX_PAGES - 1: + exit = True + sleep(TIMEOUT_SECONDS) main()