Updated file to gather data from Wikipedia
This commit is contained in:
parent de8c2afceb
commit 854e5f1d98
@@ -21,15 +21,22 @@ def get_clean_text(pageIDS: list[str]):
     start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
-        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
+        "User-Agent": "CoolBot/0.0"
+                      ""
+                      " (https://example.org/coolbot/; coolbot@example.org)"
     }
 
     ids = "|".join(pageIDS)
 
+    start_fetch = time.time()
     res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
+    end_fetch = time.time()
+    fetch_time = end_fetch - start_fetch
+    print(f"Time elapsed FETCH: {fetch_time} seconds")
+
     data = res.json()
 
 
     abstracts = {}
     # Make sure 'query' and the page exist
     SKIPPED = 0
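The hunk ends just before the response is unpacked. As orientation only, the lines below are a minimal sketch of how the continuation typically looks, assuming data, abstracts and SKIPPED are used as shown above; it is not part of this commit. In the JSON response, data["query"]["pages"] is a dict keyed by page-ID string, and entries without an "extract" field correspond to missing or empty pages.

# Sketch only, not code from this commit. Assumes the hunk above sits inside
# get_clean_text(pageIDS) and that data, abstracts and SKIPPED are as shown.
pages = data.get("query", {}).get("pages", {})  # dict keyed by page-ID string
for page_id, page in pages.items():
    extract = page.get("extract")
    if extract:                      # page exists and returned an intro extract
        abstracts[page_id] = extract
    else:                            # missing page or empty extract
        SKIPPED += 1
print(f"Skipped {SKIPPED} of {len(pageIDS)} page IDs")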
@@ -1,60 +0,0 @@
-import pandas as pd
-import wikipediaapi
-import csv
-import time
-import re
-
-# Initialize Wikipedia API with proper user agent
-wiki_wiki = wikipediaapi.Wikipedia(
-    language='en',
-    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
-)
-
-input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
-output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'
-
-# Function to get clean full text from Wikipedia URL
-def get_clean_text(wiki_url):
-    try:
-        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
-        page = wiki_wiki.page(page_title)
-        if page.exists():
-            text = page.text
-            # Remove section headers like == History ==
-            text = re.sub(r'==.*?==', '', text)
-            # Collapse multiple spaces and newlines into single space
-            text = re.sub(r'\s+', ' ', text).strip()
-            return text
-    except Exception as e:
-        print(f"Error fetching {wiki_url}: {e}")
-    return ''
-
-# Initialize output CSV
-with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-    writer.writeheader()
-
-# Process input CSV in chunks
-chunksize = 1
-for chunk in pd.read_csv(input_csv, chunksize=chunksize):
-    # Clean column names in case of quotes
-    chunk.columns = chunk.columns.str.replace('"', '').str.strip()
-
-    rows_to_write = []
-    for _, row in chunk.iterrows():
-        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
-        dbpedia_url = row['object']     # old object (DBpedia URL)
-
-        clean_text = get_clean_text(wikipedia_url)
-        rows_to_write.append({
-            'subject': dbpedia_url,
-            'text': clean_text
-        })
-        time.sleep(0.1)  # polite delay
-
-    # Append to output CSV
-    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
-        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-        writer.writerows(rows_to_write)
-
-    print(f"Processed {len(chunk)} rows, appended to {output_csv}")
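The deleted script above fetched one Wikipedia URL at a time through wikipediaapi with a 0.1 s delay per page; the updated code replaces it with batched pageids queries against the MediaWiki API. As a hedged illustration of that direction, and not code from this commit, the sketch below builds the same extracts query with requests' params argument instead of f-string concatenation and batches page IDs in groups of up to 50, the usual per-request cap for multi-value API parameters. The name fetch_intro_extracts and its parameters are hypothetical.

# Sketch only, not code from this commit. fetch_intro_extracts is a
# hypothetical helper illustrating batched extracts queries.
import requests

API_URL = "https://en.wikipedia.org/w/api.php"
HEADERS = {"User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"}

def fetch_intro_extracts(page_ids, session, batch_size=50):
    abstracts = {}
    for i in range(0, len(page_ids), batch_size):
        params = {
            "action": "query",
            "pageids": "|".join(page_ids[i:i + batch_size]),  # at most 50 IDs per call
            "prop": "extracts",
            "exintro": 1,
            "explaintext": 1,
            "format": "json",
        }
        res = session.get(API_URL, params=params, headers=HEADERS)
        res.raise_for_status()
        for page in res.json().get("query", {}).get("pages", {}).values():
            if "extract" in page:
                abstracts[str(page["pageid"])] = page["extract"]
    return abstracts

# Example: fetch_intro_extracts(list_of_page_id_strings, requests.Session())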