Moved fetchdata.py to reflect working tree
old - ${Proj}/Scripts/fetchdata.py
new - ${Proj}/Scripts/DataGathering/fetchdata.py
99  Scripts/DataGathering/fetchdata.py  Normal file
@@ -0,0 +1,99 @@
from math import floor
from time import sleep

import SPARQLWrapper
import requests

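# DBpedia SPARQL endpoint and paging configuration; INITIAL_OFFSET restarts
# the fetch partway through instead of from offset 0.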
BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
OFFSET = LIMIT
INITIAL_OFFSET = 15200000
MAX_PAGES = int(1E9)

# Missing page 13220000

FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"

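# Every (?subject, ?relationship, ?object) triple whose subject is a dbo:Film
# with a foaf:primaryTopic backlink, excluding wiki housekeeping predicates.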
QUERY = """
|
||||
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||
PREFIX dbp: <http://dbpedia.org/property/>
|
||||
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||
|
||||
SELECT ?subject, ?relationship, ?object
|
||||
WHERE {
|
||||
?subject ?relationship ?object .
|
||||
?subject rdf:type dbo:Film .
|
||||
?a foaf:primaryTopic ?subject
|
||||
FILTER (?relationship NOT IN (
|
||||
dbo:wikiPageRedirects,
|
||||
dbo:wikiPageExternalLink,
|
||||
dbo:wikiPageWikiLink
|
||||
))
|
||||
}"""
|
||||
|
||||
|
||||
def main():
    exit = False
    page = int(floor(INITIAL_OFFSET / LIMIT)) - 1

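    # Fetch one page per iteration until the endpoint returns an empty page.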
    while not exit:
        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(OFFSET + (page * LIMIT))
        sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
        sparql.setReturnFormat(TYPE)

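        # Page the base query by appending LIMIT and OFFSET clauses.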
        CURRENT_PAGE_QUERY = "\n".join([
            QUERY,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")
        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

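            # CSV results arrive as bytes; keep the header row only for the
            # first page so it is written to the dataset just once.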
            if isinstance(res, bytes):
                initial_offset = 0
                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(text)

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


main()