"""Gather cleaned Wikipedia full texts for movie pages.

Reads an input CSV whose rows map a Wikipedia URL (column ``subject``) to a
DBpedia URL (column ``object``), fetches each Wikipedia page's plain text,
strips section headers and collapses whitespace, and writes rows of
(``subject`` = DBpedia URL, ``text`` = cleaned page text) to the output CSV.
"""

import csv
import re
import time
from urllib.parse import unquote

import pandas as pd
import wikipediaapi

# Identify ourselves explicitly, per Wikipedia API etiquette.
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='MovieAbstractScraper/1.0 (gape@example.com)',
)

input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'

# Hoisted out of the per-row hot path: section headers like "== History =="
# and runs of whitespace/newlines.
_HEADER_RE = re.compile(r'==.*?==')
_WHITESPACE_RE = re.compile(r'\s+')


def get_clean_text(wiki_url):
    """Return the cleaned plain text of the Wikipedia page behind *wiki_url*.

    The last path component of the URL is taken as the page title and
    percent-decoded (titles containing e.g. apostrophes arrive as ``%27``).
    Section headers are removed and all whitespace is collapsed to single
    spaces. Returns '' when the page does not exist or fetching fails.
    """
    try:
        page_title = unquote(wiki_url.rsplit('/', 1)[-1])
        page = wiki_wiki.page(page_title)
        if page.exists():
            text = _HEADER_RE.sub('', page.text)
            return _WHITESPACE_RE.sub(' ', text).strip()
    except Exception as e:
        # Deliberate best-effort: log the failure and emit an empty text
        # so one bad page does not abort the whole run.
        print(f"Error fetching {wiki_url}: {e}")
    return ''


def main():
    """Stream the input CSV and append cleaned page texts to the output CSV."""
    with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
        writer.writeheader()

        # chunksize=1 re-parsed the CSV machinery per row; 100 keeps memory
        # bounded while amortizing the parser cost. Output stays open for
        # the whole run instead of being reopened per chunk.
        for chunk in pd.read_csv(input_csv, chunksize=100):
            # Strip stray quotes/whitespace from header names.
            chunk.columns = chunk.columns.str.replace('"', '').str.strip()

            rows_to_write = []
            for _, row in chunk.iterrows():
                wikipedia_url = row['subject']  # old subject (Wikipedia URL)
                dbpedia_url = row['object']     # old object (DBpedia URL)
                rows_to_write.append({
                    'subject': dbpedia_url,
                    'text': get_clean_text(wikipedia_url),
                })
                time.sleep(0.1)  # polite delay between API calls

            writer.writerows(rows_to_write)
            f_out.flush()  # keep partial results on disk if interrupted
            print(f"Processed {len(chunk)} rows, appended to {output_csv}")


if __name__ == '__main__':
    main()