Added file to gather Wikipedia abstracts from URLs

GassiGiuseppe 2025-09-18 20:26:11 +02:00
parent 4bb03f86b3
commit ec81ea7930


@@ -0,0 +1,60 @@
import pandas as pd
import wikipediaapi
import csv
import time
import re
from urllib.parse import unquote

# Initialize Wikipedia API with proper user agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
)

input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'
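
# The input CSV is expected to provide at least a 'subject' column (Wikipedia URL)
# and an 'object' column (DBpedia URL), as read in the loop below.
# Illustrative row only (not real data):
#   subject,object
#   https://en.wikipedia.org/wiki/Inception,http://dbpedia.org/resource/Inception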

# Function to get clean full text from a Wikipedia URL
def get_clean_text(wiki_url):
    try:
        # Extract the page title from the URL and decode %-escapes (e.g. non-ASCII titles)
        page_title = unquote(wiki_url.rsplit('/', 1)[-1])
        page = wiki_wiki.page(page_title)
        if page.exists():
            text = page.text
            # Remove section headers like == History ==
            text = re.sub(r'==.*?==', '', text)
            # Collapse multiple spaces and newlines into single space
            text = re.sub(r'\s+', ' ', text).strip()
            return text
    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
    # Missing page or fetch error: return an empty string
    return ''
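
# Illustration only (hypothetical title, not drawn from the dataset): a quick manual
# check of the helper might be
#     get_clean_text('https://en.wikipedia.org/wiki/The_Matrix')
# which should return the article text with section headers stripped, or '' if the
# page cannot be fetched.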

# Initialize output CSV with the header row
with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
    writer.writeheader()

# Process input CSV in chunks
chunksize = 1
for chunk in pd.read_csv(input_csv, chunksize=chunksize):
    # Clean column names in case of quotes
    chunk.columns = chunk.columns.str.replace('"', '').str.strip()

    rows_to_write = []
    for _, row in chunk.iterrows():
        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
        dbpedia_url = row['object']     # old object (DBpedia URL)
        clean_text = get_clean_text(wikipedia_url)
        rows_to_write.append({
            'subject': dbpedia_url,
            'text': clean_text
        })
        time.sleep(0.1)  # polite delay

    # Append to output CSV
    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
        writer.writerows(rows_to_write)

    print(f"Processed {len(chunk)} rows, appended to {output_csv}")