Created script to gather Wikipedia abstracts
This commit is contained in:
parent e39bad8348
commit f89dffff75
95  Scripts/DataGathering/wikipedia_gathering.py  Normal file
@@ -0,0 +1,95 @@
import pandas as pd

import csv
import time
import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"

sess = requests.Session()

# Number of page IDs requested per API call
CHUNK = 20


# Fetch the plain-text intro extract (abstract) for each Wikipedia page ID in the batch
def get_clean_text(pageIDS: list[str]):

    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    return abstracts


# Append the abstracts fetched for one chunk of page IDs to the output CSV
def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})


# Initialize output CSV with the header row
with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
    writer.writeheader()

# Stream the input CSV and send page IDs to the API in chunks of CHUNK
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        # Skip the header row
        if index == 0:
            continue

        # Collect the page ID (second column)
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush the current chunk of page IDs to the output CSV
            flush(movie_ids)
            movie_ids = []

    # Flush any leftover page IDs from the final, partially filled chunk
    if movie_ids:
        flush(movie_ids)
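
The script builds the extracts query as a raw URL string; for reference, the same TextExtracts request can also be made through requests' params argument, which handles the URL encoding. A minimal sketch follows, using a hypothetical placeholder page ID rather than one taken from movie-pageid.csv:

import requests

API_URL = "https://en.wikipedia.org/w/api.php"
headers = {"User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"}

# Same parameters as the script: intro-only, plain-text extracts, JSON output
params = {
    "action": "query",
    "pageids": "11600",  # placeholder page ID for illustration only
    "prop": "extracts",
    "exintro": 1,
    "explaintext": 1,
    "format": "json",
}

res = requests.get(API_URL, params=params, headers=headers)
pages = res.json().get("query", {}).get("pages", {})
for page_id, page in pages.items():
    print(page_id, page.get("extract", "")[:80])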