Update Script/fetchdata.py: fetch Film subjects in smaller pages and stop on an empty page

2025-09-18 12:03:09 +02:00
parent ca6143ea3c
commit 9838e287a4
1 changed file with 25 additions and 8 deletions
--- a/Script/fetchdata.py
+++ b/Script/fetchdata.py
@@ -5,11 +5,11 @@ import requests
 BASE_URL = "https://dbpedia.org/sparql"
 TYPE = SPARQLWrapper.CSV
-TIMEOUT_SECONDS = int(10)
+TIMEOUT_SECONDS = 1.5
-LIMIT = int(1E6)
+LIMIT = int(1E4)
 OFFSET = LIMIT
 INITIAL_OFFSET = 0
-MAX_PAGES = int(1E2)
+MAX_PAGES = int(1E9)
 FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
@@ -17,19 +17,27 @@ QUERY = """
 PREFIX dbo:  <http://dbpedia.org/ontology/>
 PREFIX dbp:  <http://dbpedia.org/property/>
 PREFIX dbr:  <http://dbpedia.org/resource/>
 PREFIX foaf: <http://xmlns.com/foaf/0.1/>
 SELECT ?subject, ?relationship, ?object
 WHERE {
  ?subject ?relationship ?object .
-  ?object rdf:type dbo:Film .
+  ?subject rdf:type dbo:Film .
-  FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
+  ?a foaf:primaryTopic ?subject
  FILTER (?relationship NOT IN (
    dbo:wikiPageRedirects,
    dbo:wikiPageExternalLink,
    dbo:wikiPageWikiLink
  ))
 }"""
 def main():
-    for page in range(INITIAL_OFFSET, MAX_PAGES):
+    exit = False
    page = INITIAL_OFFSET
    while not exit:
        print(f"Starting to get page {page}")
@@ -62,10 +70,13 @@ def main():
                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])
            if text == "":
                exit = True
                continue
            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(
                    text
                )
@@ -74,6 +85,12 @@ def main():
            print(f"Something went wrong during page {page}:\n\t{ex}")
        print(f"Sleeping for {TIMEOUT_SECONDS}")
        page += 1
        if page == MAX_PAGES - 1:
            exit = True
        sleep(TIMEOUT_SECONDS)
 main()