From 9838e287a4e56c873a02b14d95fe2c84cfe0b853 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:03:09 +0200
Subject: [PATCH] fetchdata: query Film subjects, exclude wiki-link predicates, stop paging on empty result
---
Script/fetchdata.py | 33 +++++++++++++++++++++++++--------
1 file changed, 25 insertions(+), 8 deletions(-)
diff --git a/Script/fetchdata.py b/Script/fetchdata.py
index 086af7a..7573751 100644
--- a/Script/fetchdata.py
+++ b/Script/fetchdata.py
@@ -5,11 +5,11 @@ import requests
BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
-TIMEOUT_SECONDS = int(10)
-LIMIT = int(1E6)
+TIMEOUT_SECONDS = 1.5
+LIMIT = int(1E4)
OFFSET = LIMIT
INITIAL_OFFSET = 0
-MAX_PAGES = int(1E2)
+MAX_PAGES = int(1E9)
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
@@ -17,19 +17,27 @@ QUERY = """
PREFIX dbo:
PREFIX dbp:
PREFIX dbr:
+PREFIX foaf:
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
- ?object rdf:type dbo:Film .
- FILTER (?relationship != )
+ ?subject rdf:type dbo:Film .
+ ?a foaf:primaryTopic ?subject
+ FILTER (?relationship NOT IN (
+ dbo:wikiPageRedirects,
+ dbo:wikiPageExternalLink,
+ dbo:wikiPageWikiLink
+ ))
}"""
-
def main():
- for page in range(INITIAL_OFFSET, MAX_PAGES):
+ exit = False
+ page = INITIAL_OFFSET
+
+ while not exit:
print(f"Starting to get page {page}")
@@ -62,10 +70,13 @@ def main():
lines = res.decode("utf-8", "ignore").split("\n")
text = "\n".join(lines[initial_offset:])
+ if text == "":
+ exit = True
+ continue
+
with open(FILE_URI, "a+", encoding="utf-8") as dataset:
print(f"Writing page {page} on {FILE_URI}")
-
dataset.write(
text
)
@@ -74,6 +85,12 @@ def main():
print(f"Something went wrong during page {page}:\n\t{ex}")
print(f"Sleeping for {TIMEOUT_SECONDS}")
+
+ page += 1
+
+ if page == MAX_PAGES - 1:
+ exit = True
+
sleep(TIMEOUT_SECONDS)
main()