Created script to gather Wikipedia abstracts
This commit is contained in:
parent e39bad8348
commit f89dffff75
95  Scripts/DataGathering/wikipedia_gathering.py  Normal file
@@ -0,0 +1,95 @@
import pandas as pd

import csv
import time
import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"

sess = requests.Session()

# Number of page IDs requested per API call
CHUNK = 20


# Fetch the plain-text intro extract (abstract) for each Wikipedia page ID in the batch
def get_clean_text(pageIDS: list[str]):

    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    return abstracts


# Append the abstracts fetched for one chunk of page IDs to the output CSV
def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})


# Initialize output CSV with the header row
with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
    writer.writeheader()

# Stream the input CSV and send page IDs to the API in chunks of CHUNK
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        # Skip the header row
        if index == 0:
            continue

        # Collect the page ID (second column)
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush the current chunk of page IDs to the output CSV
            flush(movie_ids)
            movie_ids = []

    # Flush any leftover page IDs from the final, partially filled chunk
    if movie_ids:
        flush(movie_ids)
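
The script builds the extracts query as a raw URL string; for reference, the same TextExtracts request can also be made through requests' params argument, which handles the URL encoding. A minimal sketch follows, using a hypothetical placeholder page ID rather than one taken from movie-pageid.csv:

import requests

API_URL = "https://en.wikipedia.org/w/api.php"
headers = {"User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"}

# Same parameters as the script: intro-only, plain-text extracts, JSON output
params = {
    "action": "query",
    "pageids": "11600",  # placeholder page ID for illustration only
    "prop": "extracts",
    "exintro": 1,
    "explaintext": 1,
    "format": "json",
}

res = requests.get(API_URL, params=params, headers=headers)
pages = res.json().get("query", {}).get("pages", {})
for page_id, page in pages.items():
    print(page_id, page.get("extract", "")[:80])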