Updated file to gather data from Wikipedia

2025-09-20 14:32:30 +02:00
parent de8c2afceb
commit 854e5f1d98
2 changed files with 9 additions and 62 deletions
--- a/Scripts/DataGathering/wikipedia_gathering.py
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@@ -21,15 +21,22 @@ def get_clean_text(pageIDS: list[str]):
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
-        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
+        "User-Agent": "CoolBot/0.0"
+        ""
+        " (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

-
+    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
+    end_fetch = time.time()
+    fetch_time = end_fetch - start_fetch
+    print(f"Time elapsed FETCH: {fetch_time} seconds")
+
    data = res.json()

+
    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
--- a/Scripts/DataGathering/wikipedia_summary_gatherer.py
+++ b/Scripts/DataGathering/wikipedia_summary_gatherer.py
@@ -1,60 +0,0 @@
-import pandas as pd
-import wikipediaapi
-import csv
-import time
-import re
-
-# Initialize Wikipedia API with proper user agent
-wiki_wiki = wikipediaapi.Wikipedia(
-    language='en',
-    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
-)
-
-input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
-output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'
-
-# Function to get clean full text from Wikipedia URL
-def get_clean_text(wiki_url):
-    try:
-        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
-        page = wiki_wiki.page(page_title)
-        if page.exists():
-            text = page.text
-            # Remove section headers like == History ==
-            text = re.sub(r'==.*?==', '', text)
-            # Collapse multiple spaces and newlines into single space
-            text = re.sub(r'\s+', ' ', text).strip()
-            return text
-    except Exception as e:
-        print(f"Error fetching {wiki_url}: {e}")
-    return ''
-
-# Initialize output CSV
-with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-    writer.writeheader()
-
-# Process input CSV in chunks
-chunksize = 1
-for chunk in pd.read_csv(input_csv, chunksize=chunksize):
-    # Clean column names in case of quotes
-    chunk.columns = chunk.columns.str.replace('"', '').str.strip()
-    
-    rows_to_write = []
-    for _, row in chunk.iterrows():
-        wikipedia_url = row['subject']    # old subject (Wikipedia URL)
-        dbpedia_url = row['object']       # old object (DBpedia URL)
-        
-        clean_text = get_clean_text(wikipedia_url)
-        rows_to_write.append({
-            'subject': dbpedia_url,
-            'text': clean_text
-        })
-        time.sleep(0.1)  # polite delay
-
-    # Append to output CSV
-    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
-        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-        writer.writerows(rows_to_write)
-
-    print(f"Processed {len(chunk)} rows, appended to {output_csv}")