Renamed dir from Script to Scripts

Christian Risi
2025-09-19 08:31:00 +02:00
parent c415b175a0
commit ce3d4bf6c5
3 changed files with 0 additions and 0 deletions


@@ -0,0 +1,14 @@
import pandas as pd
# Load the CSV
df = pd.read_csv("./Assets/Dataset/1-hop/reverse.csv")
# Extract the last part of the URL in 'relationship'
df["relationship_short"] = df["relationship"].apply(lambda x: x.split("/")[-1])
# Count occurrences of each unique last part
relationship_counts = df["relationship_short"].value_counts()
# Print the counts
for rel, count in relationship_counts.items():
    print(f"{rel}: {count}")


@@ -0,0 +1,60 @@
import pandas as pd
import wikipediaapi
import csv
import time
import re

# Initialize Wikipedia API with proper user agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
)

input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'

# Function to get clean full text from Wikipedia URL
def get_clean_text(wiki_url):
    try:
        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
        page = wiki_wiki.page(page_title)
        if page.exists():
            text = page.text
            # Remove section headers like == History ==
            text = re.sub(r'==.*?==', '', text)
            # Collapse multiple spaces and newlines into single space
            text = re.sub(r'\s+', ' ', text).strip()
            return text
    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
    return ''

# Initialize output CSV
with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
    writer.writeheader()

# Process input CSV in chunks
chunksize = 1
for chunk in pd.read_csv(input_csv, chunksize=chunksize):
    # Clean column names in case of quotes
    chunk.columns = chunk.columns.str.replace('"', '').str.strip()

    rows_to_write = []
    for _, row in chunk.iterrows():
        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
        dbpedia_url = row['object']     # old object (DBpedia URL)

        clean_text = get_clean_text(wikipedia_url)
        rows_to_write.append({
            'subject': dbpedia_url,
            'text': clean_text
        })
        time.sleep(0.1)  # polite delay

    # Append to output CSV
    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
        writer.writerows(rows_to_write)

    print(f"Processed {len(chunk)} rows, appended to {output_csv}")

Scripts/fetchdata.py Normal file

@@ -0,0 +1,99 @@
from math import floor
from time import sleep

import SPARQLWrapper
import requests

BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
OFFSET = LIMIT
INITIAL_OFFSET = 15200000
MAX_PAGES = int(1E9)

# Missing page 13220000

FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"

QUERY = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject .
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink
    ))
}"""


def main():
    exit = False
    page = int(floor(INITIAL_OFFSET / LIMIT)) - 1

    while not exit:
        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(OFFSET + (page * LIMIT))

        sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
        sparql.setReturnFormat(TYPE)

        CURRENT_PAGE_QUERY = "\n".join([
            QUERY,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()

            text = ""
            if type(res) == bytes:
                # Skip the CSV header line on every page after the first
                initial_offset = 0
                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(text)

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


main()
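For reference, the resume arithmetic in main() checks out against the constants above (a worked example, not new behaviour):

LIMIT = int(1E4)              # 10000 rows per page
OFFSET = LIMIT                # 10000
INITIAL_OFFSET = 15200000

page = int(INITIAL_OFFSET / LIMIT) - 1     # 1519
current_offset = OFFSET + page * LIMIT     # 10000 + 15190000 = 15200000
assert current_offset == INITIAL_OFFSET    # crawl resumes exactly at INITIAL_OFFSET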