Created script to gather wikipedia abstracts
This commit is contained in:
parent e39bad8348
commit f89dffff75
95  Scripts/DataGathering/wikipedia_gathering.py  Normal file
@@ -0,0 +1,95 @@
import pandas as pd

import csv
import time
import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"


sess = requests.Session()

# Number of page IDs requested per API call
CHUNK = 20


# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):

    API_URL = "https://en.wikipedia.org/w/api.php"
    # Wikipedia's API etiquette asks for a descriptive User-Agent
    headers = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

    # exintro=1 returns only the lead section, explaintext=1 strips the HTML markup
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    return abstracts


# Append one chunk of abstracts to the output CSV
def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})


# Initialize output CSV
with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
    writer.writeheader()


# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        # Skip the header row
        if index == 0:
            continue

        # Save movies in map
        movie_ids.append(line[1])

        if index % CHUNK == 0:
            # Flush movies
            flush(movie_ids)
            movie_ids = []