From 854e5f1d9873130a7e5d5f13c3ffe207b23ec9eb Mon Sep 17 00:00:00 2001
From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com>
Date: Sat, 20 Sep 2025 14:32:30 +0200
Subject: [PATCH] Updated file to gather data from wikipedia

---
 Scripts/DataGathering/wikipedia_gathering.py  | 11 +++-
 .../wikipedia_summary_gatherer.py             | 60 -------------------
 2 files changed, 9 insertions(+), 62 deletions(-)
 delete mode 100644 Scripts/DataGathering/wikipedia_summary_gatherer.py

diff --git a/Scripts/DataGathering/wikipedia_gathering.py b/Scripts/DataGathering/wikipedia_gathering.py
index d666885..50625f2 100644
--- a/Scripts/DataGathering/wikipedia_gathering.py
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@@ -21,15 +21,22 @@ def get_clean_text(pageIDS: list[str]):
     start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
-        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
+        "User-Agent": "CoolBot/0.0"
+        ""
+        " (https://example.org/coolbot/; coolbot@example.org)"
     }
     ids = "|".join(pageIDS)
-    
+
     start_fetch = time.time()
     res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
+    end_fetch = time.time()
+    fetch_time = end_fetch - start_fetch
+    print(f"Time elapsed FETCH: {fetch_time} seconds")
+
     data = res.json()
+
     abstracts = {}
     # Make sure 'query' and the page exist
     SKIPPED = 0
diff --git a/Scripts/DataGathering/wikipedia_summary_gatherer.py b/Scripts/DataGathering/wikipedia_summary_gatherer.py
deleted file mode 100644
index c49dd83..0000000
--- a/Scripts/DataGathering/wikipedia_summary_gatherer.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import pandas as pd
-import wikipediaapi
-import csv
-import time
-import re
-
-# Initialize Wikipedia API with proper user agent
-wiki_wiki = wikipediaapi.Wikipedia(
-    language='en',
-    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
-)
-
-input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
-output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'
-
-# Function to get clean full text from Wikipedia URL
-def get_clean_text(wiki_url):
-    try:
-        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
-        page = wiki_wiki.page(page_title)
-        if page.exists():
-            text = page.text
-            # Remove section headers like == History ==
-            text = re.sub(r'==.*?==', '', text)
-            # Collapse multiple spaces and newlines into single space
-            text = re.sub(r'\s+', ' ', text).strip()
-            return text
-    except Exception as e:
-        print(f"Error fetching {wiki_url}: {e}")
-    return ''
-
-# Initialize output CSV
-with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-    writer.writeheader()
-
-# Process input CSV in chunks
-chunksize = 1
-for chunk in pd.read_csv(input_csv, chunksize=chunksize):
-    # Clean column names in case of quotes
-    chunk.columns = chunk.columns.str.replace('"', '').str.strip()
-
-    rows_to_write = []
-    for _, row in chunk.iterrows():
-        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
-        dbpedia_url = row['object']     # old object (DBpedia URL)
-
-        clean_text = get_clean_text(wikipedia_url)
-        rows_to_write.append({
-            'subject': dbpedia_url,
-            'text': clean_text
-        })
-        time.sleep(0.1)  # polite delay
-
-    # Append to output CSV
-    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
-        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-        writer.writerows(rows_to_write)
-
-    print(f"Processed {len(chunk)} rows, appended to {output_csv}")
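
For reference, below is a minimal, self-contained sketch of the batched MediaWiki "extracts" request that the updated hunk times. It assumes that sess in wikipedia_gathering.py is a requests.Session (the session setup is outside this hunk), and the page IDs used here are placeholders for illustration only, not values from the project.

import time

import requests

API_URL = "https://en.wikipedia.org/w/api.php"
headers = {
    # Same descriptive User-Agent pattern as the patched script.
    "User-Agent": "CoolBot/0.0"
    " (https://example.org/coolbot/; coolbot@example.org)"
}

sess = requests.Session()          # assumption: sess is a requests.Session
page_ids = ["18630637", "30007"]   # hypothetical page IDs, for illustration only
ids = "|".join(page_ids)

# Time the single batched fetch, mirroring the start_fetch/end_fetch timing added by the patch.
start_fetch = time.time()
res = sess.get(
    headers=headers,
    url=f"{API_URL}?action=query&pageids={ids}"
        "&prop=extracts&exintro=1&explaintext=1&format=json",
)
fetch_time = time.time() - start_fetch
print(f"Time elapsed FETCH: {fetch_time} seconds")

data = res.json()
# With prop=extracts, each page under data["query"]["pages"] carries an
# "extract" field holding the plain-text intro (exintro=1, explaintext=1).
for page_id, page in data.get("query", {}).get("pages", {}).items():
    print(page_id, page.get("extract", "")[:80])

Joining the IDs with "|" keeps this to a single request for the whole batch, so the FETCH timing printed above covers one round trip to the API.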