Added file to gather Wikipedia abstracts from URLs
This commit is contained in:
parent 4bb03f86b3
commit ec81ea7930
60
Script/DataGathering/wikipedia_summary_gatherer.py
Normal file
@@ -0,0 +1,60 @@
import pandas as pd
import wikipediaapi
import csv
import time
import re
from urllib.parse import unquote

# Initialize Wikipedia API with a proper user agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
)

input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'

# Function to get clean full text from a Wikipedia URL
def get_clean_text(wiki_url):
    try:
        # Extract the page title from the URL and decode percent-escapes
        page_title = unquote(wiki_url.rsplit('/', 1)[-1])
        page = wiki_wiki.page(page_title)
        if page.exists():
            text = page.text
            # Remove section headers like == History == or === Cast ===
            text = re.sub(r'==+[^=]*==+', '', text)
            # Collapse runs of spaces and newlines into single spaces
            text = re.sub(r'\s+', ' ', text).strip()
            return text
        return ''  # page does not exist
    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
        return ''

# Initialize the output CSV with a header row
with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
    writer.writeheader()

# Process the input CSV in chunks (chunksize=1 reads one row at a time)
chunksize = 1
for chunk in pd.read_csv(input_csv, chunksize=chunksize):
    # Clean column names in case they carry stray quotes
    chunk.columns = chunk.columns.str.replace('"', '').str.strip()

    rows_to_write = []
    for _, row in chunk.iterrows():
        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
        dbpedia_url = row['object']     # old object (DBpedia URL)

        clean_text = get_clean_text(wikipedia_url)
        rows_to_write.append({
            'subject': dbpedia_url,
            'text': clean_text
        })
        time.sleep(0.1)  # polite delay between requests

    # Append this chunk's rows to the output CSV
    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
        writer.writerows(rows_to_write)

    print(f"Processed {len(chunk)} rows, appended to {output_csv}")
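
The header-stripping regex can be sanity-checked on a toy string (the sample text below is made up):

import re

toy = 'Intro. == History == Founded in 1999. === Cast === More text.'
print(re.sub(r'==+[^=]*==+', '', toy))
# -> 'Intro.  Founded in 1999.  More text.'

The leftover double spaces are then collapsed by the \s+ substitution in get_clean_text.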
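
For reference, a minimal sketch of the chunked-read pattern driving the main loop; the in-memory CSV below is hypothetical sample data, not taken from the real dataset:

import io
import pandas as pd

sample = io.StringIO(
    'subject,object\n'
    'https://en.wikipedia.org/wiki/Inception,http://dbpedia.org/resource/Inception\n'
    'https://en.wikipedia.org/wiki/Alien_(film),http://dbpedia.org/resource/Alien_(film)\n'
)

# chunksize=1 makes read_csv return an iterator of one-row DataFrames,
# so memory use stays flat regardless of input size.
for chunk in pd.read_csv(sample, chunksize=1):
    row = chunk.iloc[0]
    print(row['subject'], '->', row['object'])

Appending to the output file after each chunk keeps the same streaming behaviour on the write side, at the cost of reopening the file once per chunk.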