diff --git a/Scripts/DataGathering/wikipedia_gathering.py b/Scripts/DataGathering/wikipedia_gathering.py
index 8e4ef55..d666885 100644
--- a/Scripts/DataGathering/wikipedia_gathering.py
+++ b/Scripts/DataGathering/wikipedia_gathering.py
@@ -1,3 +1,4 @@
+from pathlib import Path
 import pandas as pd
 import csv
 
@@ -16,6 +17,8 @@ CHUNK = 20
 
 # Function to get clean full text from Wikipedia PageID
 def get_clean_text(pageIDS: list[str]):
+    parsing_time = 0
+    start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
         "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
@@ -38,8 +41,12 @@ def get_clean_text(pageIDS: list[str]):
 
         if extract:
            print(f"Entry FOUND for pageID {pageID}")
+            start_parse = time.time()
            extract = extract.strip()
            extract = extract.replace("\n", "")
+            end_parse = time.time()
+            parsing_time += end_parse - start_parse  # accumulate across pages, not just the last one
+            print(f"Time elapsed PARSE (cumulative): {parsing_time} seconds")
            abstracts[pageID] = extract
         else:
            SKIPPED += 1
@@ -49,6 +56,9 @@ def get_clean_text(pageIDS: list[str]):
            print(f"Page MISSING for pageID {pageID}")
 
    print(f"Chunk done - Skipped {SKIPPED}")
 
+    end_full = time.time()
+
+    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts
 
@@ -57,22 +67,75 @@ def flush(movie_ids):
 
    abstracts = get_clean_text(movie_ids)
 
+    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
+    end = time.time()
+
+    print(f"Time elapsed WRITE: {end - start} seconds")
 
 
-# Initialize output CSV
-with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
-    writer.writeheader()
+def reconcile() -> int:
+    # Work out how many input lines were already processed in a previous run
+    # by locating the last page ID written to the output.
+    start = time.time()
+
+    input_file = open(input_csv, "r", newline="", encoding="utf-8")
+    output_file = open(output_csv, "r", newline="", encoding="utf-8")
+
+    output_lines = output_file.readlines()
+    output_file.close()
+
+    # Only the header present: fresh output file, nothing to skip.
+    if len(output_lines) < 2:
+        input_file.close()
+        return 0
+
+    LAST_CHECKED = output_lines[-1].split(",")[0]
+
+    next(input_file)  # skip header
+    current_check = input_file.readline().split(",")[1].strip()
+
+    index = 1
+
+    while current_check != LAST_CHECKED:
+        line = input_file.readline()
+        if not line:
+            # Last written ID never found: start from the beginning.
+            index = 0
+            break
+        current_check = line.split(",")[1].strip()
+        index += 1
+
+    input_file.close()
+    end = time.time()
+
+    print(f"Time elapsed RECONCILE: {end - start} seconds")
+    print(f"FOUND, we need to skip {index} lines")
+
+    return index
+
+
+if not Path(output_csv).is_file():
+    # Initialize output CSV
+    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
+        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
+        writer.writeheader()
+
+
+SKIP = reconcile()
 
 
 # Read CSV in RAM
 with open(input_csv, "r", newline="", encoding="utf-8") as input:
+    # Skip lines already processed in a previous run
+    for i in range(0, SKIP):
+        next(input)
+
    reader = csv.reader(input)
 
    index = -1
 
 
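Note on the resume logic: reconcile() works by matching the last "subject" written to output_csv against the page-ID column of input_csv. Below is a minimal standalone sketch of the same idea, assuming (as in the patch) that the page ID is the second column of input_csv and the first column of output_csv; count_rows_to_skip is a hypothetical helper for illustration, not part of the patch.

    import csv

    def count_rows_to_skip(input_csv: str, output_csv: str) -> int:
        # Read the last page ID that was flushed to the output.
        with open(output_csv, newline="", encoding="utf-8") as f:
            rows = list(csv.reader(f))
        if len(rows) < 2:           # header only: fresh output, skip nothing
            return 0
        last_written = rows[-1][0]

        # Walk the input until that ID reappears; everything up to and
        # including that row has already been processed.
        with open(input_csv, newline="", encoding="utf-8") as f:
            reader = csv.reader(f)
            next(reader)            # skip header
            for count, row in enumerate(reader, start=1):
                if row[1] == last_written:
                    return count
        return 0                    # ID not found: safest to start over

Using csv.reader here instead of raw readline()/split(",") keeps the lookup correct even when a field contains an embedded comma.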