diff --git a/Scripts/DataGathering/wikipedia_gathering.py b/Scripts/DataGathering/wikipedia_gathering.py
new file mode 100644
index 0000000..8e4ef55
--- /dev/null
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@@ -0,0 +1,89 @@
+import csv
+import requests
+
+input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
+output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"
+
+API_URL = "https://en.wikipedia.org/w/api.php"
+HEADERS = {
+    "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
+}
+
+# Number of page IDs fetched per API request
+CHUNK = 20
+
+sess = requests.Session()
+
+
+def get_clean_text(page_ids: list[str]) -> dict[str, str]:
+    """Fetch the plain-text intro extract for each Wikipedia page ID."""
+    params = {
+        "action": "query",
+        "pageids": "|".join(page_ids),
+        "prop": "extracts",
+        "exintro": 1,
+        "explaintext": 1,
+        "exlimit": "max",  # return an extract for every page in the batch
+        "format": "json",
+    }
+
+    res = sess.get(API_URL, headers=HEADERS, params=params)
+    res.raise_for_status()
+    data = res.json()
+
+    abstracts = {}
+    skipped = 0
+
+    # Make sure 'query' and the pages exist before indexing into the response
+    if "query" in data and "pages" in data["query"]:
+        pages = data["query"]["pages"]
+        for page_id in page_ids:
+            if page_id in pages:
+                extract = pages[page_id].get("extract")
+                if extract:
+                    print(f"Entry FOUND for pageID {page_id}")
+                    # Collapse newlines so each abstract stays on one CSV row
+                    abstracts[page_id] = extract.strip().replace("\n", " ")
+                else:
+                    skipped += 1
+                    print(f"Entry MISSING for pageID {page_id}")
+            else:
+                skipped += 1
+                print(f"Page MISSING for pageID {page_id}")
+
+    print(f"Chunk done - Skipped {skipped}")
+    return abstracts
+
+
+def flush(movie_ids: list[str]) -> None:
+    """Fetch abstracts for a batch of page IDs and append them to the output CSV."""
+    abstracts = get_clean_text(movie_ids)
+
+    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
+        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
+        for page_id, text in abstracts.items():
+            writer.writerow({"subject": page_id, "text": text})
+
+
+# Initialize the output CSV with its header row
+with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
+    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
+    writer.writeheader()
+
+# Stream the input CSV and flush page IDs in chunks of CHUNK
+with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
+    reader = csv.reader(f_in)
+    next(reader)  # skip the header row
+
+    movie_ids = []
+    for index, line in enumerate(reader, start=1):
+        # The page ID is in the second column
+        movie_ids.append(line[1])
+
+        if index % CHUNK == 0:
+            flush(movie_ids)
+            movie_ids = []
+
+    # Flush any remaining IDs from the final partial chunk
+    if movie_ids:
+        flush(movie_ids)
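
For context (not part of the diff): a minimal sanity-check sketch of the output this script produces, assuming the input/output paths and the "subject"/"text" column layout above. The pandas dependency and the check itself are illustrative additions, not something the PR introduces.

import pandas as pd

# Compare how many page IDs from the input ended up with a summary in the output.
pages = pd.read_csv("./Assets/Dataset/1-hop/movie-pageid.csv")
summaries = pd.read_csv("./Assets/Dataset/Tmp/wikipedia-summary.csv")

print(f"{len(summaries)} of {len(pages)} page IDs returned an intro extract")
print(summaries.head())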