From 9838e287a4e56c873a02b14d95fe2c84cfe0b853 Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Thu, 18 Sep 2025 12:03:09 +0200
Subject: [PATCH] fetchdata: query Film subjects, stop paging on empty page

---
 Script/fetchdata.py | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/Script/fetchdata.py b/Script/fetchdata.py
index 086af7a..7573751 100644
--- a/Script/fetchdata.py
+++ b/Script/fetchdata.py
@@ -5,11 +5,11 @@ import requests
 
 BASE_URL = "https://dbpedia.org/sparql"
 TYPE = SPARQLWrapper.CSV
-TIMEOUT_SECONDS = int(10)
-LIMIT = int(1E6)
+TIMEOUT_SECONDS = 1.5
+LIMIT = int(1E4)
 OFFSET = LIMIT
 INITIAL_OFFSET = 0
-MAX_PAGES = int(1E2)
+MAX_PAGES = int(1E9)
 
 FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
 
@@ -17,19 +17,27 @@ QUERY = """
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
+PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 
 SELECT ?subject, ?relationship, ?object
 WHERE {
   ?subject ?relationship ?object .
-  ?object rdf:type dbo:Film .
-  FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
+  ?subject rdf:type dbo:Film .
+  ?a foaf:primaryTopic ?subject
+  FILTER (?relationship NOT IN (
+    dbo:wikiPageRedirects,
+    dbo:wikiPageExternalLink,
+    dbo:wikiPageWikiLink
+  ))
 }"""
 
 
-
 def main():
 
-    for page in range(INITIAL_OFFSET, MAX_PAGES):
+    exit = False
+    page = INITIAL_OFFSET
+
+    while not exit:
 
         print(f"Starting to get page {page}")
 
@@ -62,10 +70,13 @@ def main():
                 lines = res.decode("utf-8", "ignore").split("\n")
                 text = "\n".join(lines[initial_offset:])
 
+            if text == "":
+                exit = True
+                continue
+
             with open(FILE_URI, "a+", encoding="utf-8") as dataset:
 
                 print(f"Writing page {page} on {FILE_URI}")
-
                 dataset.write(
                     text
                 )
@@ -74,6 +85,12 @@ def main():
             print(f"Something went wrong during page {page}:\n\t{ex}")
 
         print(f"Sleeping for {TIMEOUT_SECONDS}")
+
+        page += 1
+
+        if page == MAX_PAGES - 1:
+            exit = True
+
         sleep(TIMEOUT_SECONDS)
 
 main()