Updated file
This commit is contained in:
parent
ca6143ea3c
commit
9838e287a4
@ -5,11 +5,11 @@ import requests
|
|||||||
|
|
||||||
BASE_URL = "https://dbpedia.org/sparql"
|
BASE_URL = "https://dbpedia.org/sparql"
|
||||||
TYPE = SPARQLWrapper.CSV
|
TYPE = SPARQLWrapper.CSV
|
||||||
TIMEOUT_SECONDS = int(10)
|
TIMEOUT_SECONDS = 1.5
|
||||||
LIMIT = int(1E6)
|
LIMIT = int(1E4)
|
||||||
OFFSET = LIMIT
|
OFFSET = LIMIT
|
||||||
INITIAL_OFFSET = 0
|
INITIAL_OFFSET = 0
|
||||||
MAX_PAGES = int(1E2)
|
MAX_PAGES = int(1E9)
|
||||||
|
|
||||||
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
|
FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"
|
||||||
|
|
||||||
@ -17,19 +17,27 @@ QUERY = """
|
|||||||
PREFIX dbo: <http://dbpedia.org/ontology/>
|
PREFIX dbo: <http://dbpedia.org/ontology/>
|
||||||
PREFIX dbp: <http://dbpedia.org/property/>
|
PREFIX dbp: <http://dbpedia.org/property/>
|
||||||
PREFIX dbr: <http://dbpedia.org/resource/>
|
PREFIX dbr: <http://dbpedia.org/resource/>
|
||||||
|
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
|
||||||
|
|
||||||
SELECT ?subject, ?relationship, ?object
|
SELECT ?subject, ?relationship, ?object
|
||||||
WHERE {
|
WHERE {
|
||||||
?subject ?relationship ?object .
|
?subject ?relationship ?object .
|
||||||
?object rdf:type dbo:Film .
|
?subject rdf:type dbo:Film .
|
||||||
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
|
?a foaf:primaryTopic ?subject
|
||||||
|
FILTER (?relationship NOT IN (
|
||||||
|
dbo:wikiPageRedirects,
|
||||||
|
dbo:wikiPageExternalLink,
|
||||||
|
dbo:wikiPageWikiLink
|
||||||
|
))
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
for page in range(INITIAL_OFFSET, MAX_PAGES):
|
exit = False
|
||||||
|
page = INITIAL_OFFSET
|
||||||
|
|
||||||
|
while not exit:
|
||||||
|
|
||||||
print(f"Starting to get page {page}")
|
print(f"Starting to get page {page}")
|
||||||
|
|
||||||
@ -62,10 +70,13 @@ def main():
|
|||||||
lines = res.decode("utf-8", "ignore").split("\n")
|
lines = res.decode("utf-8", "ignore").split("\n")
|
||||||
text = "\n".join(lines[initial_offset:])
|
text = "\n".join(lines[initial_offset:])
|
||||||
|
|
||||||
|
if text == "":
|
||||||
|
exit = True
|
||||||
|
continue
|
||||||
|
|
||||||
with open(FILE_URI, "a+", encoding="utf-8") as dataset:
|
with open(FILE_URI, "a+", encoding="utf-8") as dataset:
|
||||||
|
|
||||||
print(f"Writing page {page} on {FILE_URI}")
|
print(f"Writing page {page} on {FILE_URI}")
|
||||||
|
|
||||||
dataset.write(
|
dataset.write(
|
||||||
text
|
text
|
||||||
)
|
)
|
||||||
@ -74,6 +85,12 @@ def main():
|
|||||||
print(f"Something went wrong during page {page}:\n\t{ex}")
|
print(f"Something went wrong during page {page}:\n\t{ex}")
|
||||||
|
|
||||||
print(f"Sleeping for {TIMEOUT_SECONDS}")
|
print(f"Sleeping for {TIMEOUT_SECONDS}")
|
||||||
|
|
||||||
|
page += 1
|
||||||
|
|
||||||
|
if page == MAX_PAGES - 1:
|
||||||
|
exit = True
|
||||||
|
|
||||||
sleep(TIMEOUT_SECONDS)
|
sleep(TIMEOUT_SECONDS)
|
||||||
|
|
||||||
main()
|
main()
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user