Created script to gather Wikipedia abstracts

2025-09-19 19:01:38 +02:00
parent e39bad8348
commit f89dffff75
1 changed file with 95 additions and 0 deletions
--- a/Scripts/DataGathering/wikipedia_gathering.py
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@@ -0,0 +1,95 @@
 import pandas as pd
 import csv
 import time
 import requests
 # Input CSV of page IDs: the first row is skipped as a header and the page ID
 # is read from the second column of every following row.
 input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
 # Output CSV with the columns "subject" (page ID) and "text" (abstract).
 output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"
 # Single HTTP session shared by every Wikipedia API request.
 sess = requests.Session()
 # Number of page IDs sent per API request.
 # NOTE(review): presumably chosen to fit the extracts API per-request limit — confirm.
 CHUNK = 20
 # Fetch the plain-text intro extracts (abstracts) for a batch of Wikipedia page IDs
 def get_clean_text(pageIDS: list[str]):
     """Return the intro extract of each requested Wikipedia page.

     Sends a single MediaWiki ``action=query`` request (``prop=extracts``,
     ``exintro=1``, ``explaintext=1``, ``format=json``) covering all given
     IDs, using the module-level ``sess`` session, and prints one status
     line per ID plus a per-chunk summary.

     Args:
         pageIDS: Page IDs as strings. They are joined with "|" into the
             ``pageids`` parameter and used as keys into the JSON response.

     Returns:
         Dict mapping page ID -> extract, with surrounding whitespace
         stripped and every newline replaced by a single space so that
         adjacent paragraphs do not get glued together. IDs whose page is
         absent from the response, or whose extract is empty, are omitted.
         An empty ``pageIDS`` returns ``{}`` without any network access.

     NOTE(review): the extracts API caps how many extracts one request
     returns (believed to be 20 — confirm), so keep batches within that.
     """
     if not pageIDS:
         # Nothing to look up: skip the pointless request with empty pageids.
         return {}

     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
         "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
     }
     # Let requests build and escape the query string (notably the "|").
     params = {
         "action": "query",
         "pageids": "|".join(pageIDS),
         "prop": "extracts",
         "exintro": 1,
         "explaintext": 1,
         "format": "json",
     }
     # A timeout keeps one stalled connection from hanging the whole run.
     res = sess.get(API_URL, params=params, headers=headers, timeout=30)
     data = res.json()

     # A response without "query"/"pages" (e.g. an API error payload) leaves
     # this empty, so every requested ID is reported and counted as skipped.
     pages = data.get("query", {}).get("pages", {})

     abstracts = {}
     skipped = 0
     for pageID in pageIDS:
         page = pages.get(pageID)
         if page is None:
             skipped += 1
             print(f"Page MISSING for pageID {pageID}")
             continue

         # "extract" may be absent (e.g. for a missing page) or blank.
         extract = (page.get("extract") or "").strip()
         if not extract:
             skipped += 1
             print(f"Entry MISSING for pageID {pageID}")
             continue

         print(f"Entry FOUND for pageID {pageID}")
         # Keep each abstract on one line; a space preserves word boundaries.
         abstracts[pageID] = extract.replace("\n", " ")

     print(f"Chunk done - Skipped {skipped}")
     return abstracts
 def flush(movie_ids):
     """Fetch abstracts for ``movie_ids`` and append them to the output CSV.

     One row is appended per abstract returned by ``get_clean_text``:
     ``subject`` holds the page ID and ``text`` the abstract. No header row
     is written here.
     """
     fetched = get_clean_text(movie_ids)
     rows = [
         {"subject": page_id, "text": abstract}
         for page_id, abstract in fetched.items()
     ]
     with open(output_csv, "a", newline="", encoding="utf-8") as out_file:
         csv.DictWriter(out_file, fieldnames=["subject", "text"]).writerows(rows)
 # Create (or truncate) the output CSV so that it holds only the header row
 with open(output_csv, mode="w", newline="", encoding="utf-8") as header_file:
     csv.writer(header_file).writerow(["subject", "text"])
 # Stream the input CSV row by row and fetch abstracts in batches of CHUNK IDs
 with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
     reader = csv.reader(f_in)
     # The first row is the header, not data.
     next(reader, None)
     movie_ids = []
     for line in reader:
         # Blank or one-column rows carry no page ID; skip them rather than
         # crash on line[1].
         if len(line) < 2:
             continue
         # The page ID lives in the second column.
         movie_ids.append(line[1])
         if len(movie_ids) >= CHUNK:
             # A full batch is ready: fetch and persist it.
             flush(movie_ids)
             movie_ids = []
     # Flush the final partial batch (fewer than CHUNK IDs); without this the
     # trailing rows of the input would never be fetched.
     if movie_ids:
         flush(movie_ids)
         movie_ids = []