From 854e5f1d9873130a7e5d5f13c3ffe207b23ec9eb Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sat, 20 Sep 2025 14:32:30 +0200
Subject: [PATCH] Updated file to gather data from wikipedia

---
 Scripts/DataGathering/wikipedia_gathering.py  | 11 +++-
 .../wikipedia_summary_gatherer.py             | 60 -------------------
 2 files changed, 9 insertions(+), 62 deletions(-)
 delete mode 100644 Scripts/DataGathering/wikipedia_summary_gatherer.py

diff --git a/Scripts/DataGathering/wikipedia_gathering.py b/Scripts/DataGathering/wikipedia_gathering.py
index d666885..50625f2 100644
--- a/Scripts/DataGathering/wikipedia_gathering.py
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@@ -21,15 +21,22 @@ def get_clean_text(pageIDS: list[str]):
     start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
-        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
+        "User-Agent": "CoolBot/0.0"
+        ""
+        " (https://example.org/coolbot/; coolbot@example.org)"
     }
     ids = "|".join(pageIDS)
-    
+
     start_fetch = time.time()
     res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
+    end_fetch = time.time()
+    fetch_time = end_fetch - start_fetch
+    print(f"Time elapsed FETCH: {fetch_time} seconds")
+
     data = res.json()
+
     abstracts = {}
     # Make sure 'query' and the page exist
     SKIPPED = 0
diff --git a/Scripts/DataGathering/wikipedia_summary_gatherer.py b/Scripts/DataGathering/wikipedia_summary_gatherer.py
deleted file mode 100644
index c49dd83..0000000
--- a/Scripts/DataGathering/wikipedia_summary_gatherer.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pandas as pd
-import wikipediaapi
-import csv
-import time
-import re
-
-# Initialize Wikipedia API with proper user agent
-wiki_wiki = wikipediaapi.Wikipedia(
-    language='en',
-    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
-)
-
-input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
-output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'
-
-# Function to get clean full text from Wikipedia URL
-def get_clean_text(wiki_url):
-    try:
-        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
-        page = wiki_wiki.page(page_title)
-        if page.exists():
-            text = page.text
-            # Remove section headers like == History ==
-            text = re.sub(r'==.*?==', '', text)
-            # Collapse multiple spaces and newlines into single space
-            text = re.sub(r'\s+', ' ', text).strip()
-            return text
-    except Exception as e:
-        print(f"Error fetching {wiki_url}: {e}")
-    return ''
-
-# Initialize output CSV
-with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-    writer.writeheader()
-
-# Process input CSV in chunks
-chunksize = 1
-for chunk in pd.read_csv(input_csv, chunksize=chunksize):
-    # Clean column names in case of quotes
-    chunk.columns = chunk.columns.str.replace('"', '').str.strip()
-
-    rows_to_write = []
-    for _, row in chunk.iterrows():
-        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
-        dbpedia_url = row['object']     # old object (DBpedia URL)
-
-        clean_text = get_clean_text(wikipedia_url)
-        rows_to_write.append({
-            'subject': dbpedia_url,
-            'text': clean_text
-        })
-        time.sleep(0.1)  # polite delay
-
-    # Append to output CSV
-    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
-        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-        writer.writerows(rows_to_write)
-
-    print(f"Processed {len(chunk)} rows, appended to {output_csv}")
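
For reference, below is a minimal, self-contained sketch of the batched MediaWiki "extracts" request that the updated hunk times. It assumes that sess in wikipedia_gathering.py is a requests.Session (the session setup is outside this hunk), and the page IDs used here are placeholders for illustration only, not values from the project.

import time

import requests

API_URL = "https://en.wikipedia.org/w/api.php"
headers = {
    # Same descriptive User-Agent pattern as the patched script.
    "User-Agent": "CoolBot/0.0"
    " (https://example.org/coolbot/; coolbot@example.org)"
}

sess = requests.Session()          # assumption: sess is a requests.Session
page_ids = ["18630637", "30007"]   # hypothetical page IDs, for illustration only
ids = "|".join(page_ids)

# Time the single batched fetch, mirroring the start_fetch/end_fetch timing added by the patch.
start_fetch = time.time()
res = sess.get(
    headers=headers,
    url=f"{API_URL}?action=query&pageids={ids}"
        "&prop=extracts&exintro=1&explaintext=1&format=json",
)
fetch_time = time.time() - start_fetch
print(f"Time elapsed FETCH: {fetch_time} seconds")

data = res.json()
# With prop=extracts, each page under data["query"]["pages"] carries an
# "extract" field holding the plain-text intro (exintro=1, explaintext=1).
for page_id, page in data.get("query", {}).get("pages", {}).items():
    print(page_id, page.get("extract", "")[:80])

Joining the IDs with "|" keeps this to a single request for the whole batch, so the FETCH timing printed above covers one round trip to the API.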