Updated file to gather data from Wikipedia
This commit is contained in:
parent de8c2afceb
commit 854e5f1d98
@@ -21,15 +21,22 @@ def get_clean_text(pageIDS: list[str]):
     start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
-        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
+        "User-Agent": "CoolBot/0.0"
+                      ""
+                      " (https://example.org/coolbot/; coolbot@example.org)"
     }
 
     ids = "|".join(pageIDS)
 
+    start_fetch = time.time()
     res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
+    end_fetch = time.time()
+    fetch_time = end_fetch - start_fetch
+    print(f"Time elapsed FETCH: {fetch_time} seconds")
+
     data = res.json()
 
 
     abstracts = {}
     # Make sure 'query' and the page exist
     SKIPPED = 0
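The hunk ends just before the response is unpacked. As orientation only, the lines below are a minimal sketch of how the continuation typically looks, assuming data, abstracts and SKIPPED are used as shown above; it is not part of this commit. In the JSON response, data["query"]["pages"] is a dict keyed by page-ID string, and entries without an "extract" field correspond to missing or empty pages.

# Sketch only, not code from this commit. Assumes the hunk above sits inside
# get_clean_text(pageIDS) and that data, abstracts and SKIPPED are as shown.
pages = data.get("query", {}).get("pages", {})  # dict keyed by page-ID string
for page_id, page in pages.items():
    extract = page.get("extract")
    if extract:                      # page exists and returned an intro extract
        abstracts[page_id] = extract
    else:                            # missing page or empty extract
        SKIPPED += 1
print(f"Skipped {SKIPPED} of {len(pageIDS)} page IDs")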
@@ -1,60 +0,0 @@
-import pandas as pd
-import wikipediaapi
-import csv
-import time
-import re
-
-# Initialize Wikipedia API with proper user agent
-wiki_wiki = wikipediaapi.Wikipedia(
-    language='en',
-    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
-)
-
-input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
-output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'
-
-# Function to get clean full text from Wikipedia URL
-def get_clean_text(wiki_url):
-    try:
-        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
-        page = wiki_wiki.page(page_title)
-        if page.exists():
-            text = page.text
-            # Remove section headers like == History ==
-            text = re.sub(r'==.*?==', '', text)
-            # Collapse multiple spaces and newlines into single space
-            text = re.sub(r'\s+', ' ', text).strip()
-            return text
-    except Exception as e:
-        print(f"Error fetching {wiki_url}: {e}")
-    return ''
-
-# Initialize output CSV
-with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-    writer.writeheader()
-
-# Process input CSV in chunks
-chunksize = 1
-for chunk in pd.read_csv(input_csv, chunksize=chunksize):
-    # Clean column names in case of quotes
-    chunk.columns = chunk.columns.str.replace('"', '').str.strip()
-
-    rows_to_write = []
-    for _, row in chunk.iterrows():
-        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
-        dbpedia_url = row['object']     # old object (DBpedia URL)
-
-        clean_text = get_clean_text(wikipedia_url)
-        rows_to_write.append({
-            'subject': dbpedia_url,
-            'text': clean_text
-        })
-        time.sleep(0.1)  # polite delay
-
-    # Append to output CSV
-    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
-        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
-        writer.writerows(rows_to_write)
-
-    print(f"Processed {len(chunk)} rows, appended to {output_csv}")
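The deleted script above fetched one Wikipedia URL at a time through wikipediaapi with a 0.1 s delay per page; the updated code replaces it with batched pageids queries against the MediaWiki API. As a hedged illustration of that direction, and not code from this commit, the sketch below builds the same extracts query with requests' params argument instead of f-string concatenation and batches page IDs in groups of up to 50, the usual per-request cap for multi-value API parameters. The name fetch_intro_extracts and its parameters are hypothetical.

# Sketch only, not code from this commit. fetch_intro_extracts is a
# hypothetical helper illustrating batched extracts queries.
import requests

API_URL = "https://en.wikipedia.org/w/api.php"
HEADERS = {"User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"}

def fetch_intro_extracts(page_ids, session, batch_size=50):
    abstracts = {}
    for i in range(0, len(page_ids), batch_size):
        params = {
            "action": "query",
            "pageids": "|".join(page_ids[i:i + batch_size]),  # at most 50 IDs per call
            "prop": "extracts",
            "exintro": 1,
            "explaintext": 1,
            "format": "json",
        }
        res = session.get(API_URL, params=params, headers=HEADERS)
        res.raise_for_status()
        for page in res.json().get("query", {}).get("pages", {}).values():
            if "extract" in page:
                abstracts[str(page["pageid"])] = page["extract"]
    return abstracts

# Example: fetch_intro_extracts(list_of_page_id_strings, requests.Session())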