Added reconciliation
parent f89dffff75
commit de8c2afceb
@@ -1,3 +1,4 @@
+from pathlib import Path
 import pandas as pd
 
 import csv
@@ -16,6 +17,8 @@ CHUNK = 20
 # Function to get clean full text from Wikipedia PageID
 def get_clean_text(pageIDS: list[str]):
 
+    parsing_time = 0
+    start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
         "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
     }
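
For context on the unchanged body of get_clean_text: the API_URL and User-Agent above point at the MediaWiki Action API, whose TextExtracts module caps plain-text extracts at 20 pages per request, which is presumably why CHUNK = 20. A minimal sketch of that kind of batched request (hypothetical helper, assuming the requests library; the actual fetch code is not part of this diff):

    import requests

    API_URL = "https://en.wikipedia.org/w/api.php"
    HEADERS = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
    }

    # Hypothetical: fetch plain-text extracts for up to 20 pageIDs in one call.
    def fetch_extracts(page_ids: list[str]) -> dict:
        params = {
            "action": "query",
            "prop": "extracts",
            "explaintext": 1,        # plain text instead of HTML
            "exlimit": "max",        # TextExtracts allows at most 20 extracts per request
            "format": "json",
            "pageids": "|".join(page_ids),
        }
        resp = requests.get(API_URL, params=params, headers=HEADERS)
        resp.raise_for_status()
        return resp.json()["query"]["pages"]
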
@@ -38,8 +41,12 @@ def get_clean_text(pageIDS: list[str]):
 
             if extract:
                 print(f"Entry FOUND for pageID {pageID}")
+                start_parse = time.time()
                 extract = extract.strip()
                 extract = extract.replace("\n", "")
+                end_parse = time.time()
+                parsing_time = end_parse - start_parse
+                print(f"Time elapsed PARSE: {parsing_time} seconds")
                 abstracts[pageID] = extract
             else:
                 SKIPPED += 1
@@ -49,6 +56,9 @@ def get_clean_text(pageIDS: list[str]):
                 print(f"Page MISSING for pageID {pageID}")
 
         print(f"Chunk done - Skipped {SKIPPED}")
+    end_full = time.time()
+
+    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
     return abstracts
 
 
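A note on the timing instrumentation added above: parsing_time is reassigned on every page, so the final "NO PARSE" figure subtracts only the last page's parse cost from the full elapsed time. If the intent is to exclude all parsing work, the timer needs to accumulate; a minimal illustrative sketch (stand-in data, not the committed code):

    import time

    extracts = ["  first\nextract ", " second\nextract "]  # stand-ins for API results

    parsing_time = 0.0
    for extract in extracts:  # stand-in for the per-page loop
        start_parse = time.perf_counter()
        extract = extract.strip().replace("\n", "")
        parsing_time += time.perf_counter() - start_parse  # accumulate instead of overwriting
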
@@ -57,22 +67,64 @@ def flush(movie_ids):
 
     abstracts = get_clean_text(movie_ids)
 
+    start = time.time()
     with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
         writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
 
         for id, text in abstracts.items():
             writer.writerow({"subject": id, "text": text})
+    end = time.time()
+
+    print(f"Time elapsed WRITE: {end - start} seconds")
 
 
-# Initialize output CSV
-with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
-    writer.writeheader()
+def reconcile() -> int:
+
+    start = time.time()
+    input_file = open(input_csv, "r", newline="", encoding="utf-8")
+    output_file = open(output_csv, "r", newline="", encoding="utf-8")
+
+    next(input_file)
+    LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
+    current_check = input_file.readline().split(",")[1]
+
+    index = 1
+
+    while current_check != LAST_CHECKED:
+        current_check = input_file.readline().split(",")[1].replace("\n", "")
+        index += 1
+
+    input_file.close()
+    output_file.close()
+    end = time.time()
+
+
+    print(f"Time elapsed RECONCILE: {end - start} seconds")
+
+    print(f"FOUND, we need to skip {index} lines")
+
+    return index
+
+
+if not Path(output_csv).is_file():
+    # Initialize output CSV
+    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
+        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
+        writer.writeheader()
+
+
+SKIP = reconcile()
+
+
 # Read CSV in RAM
 with open(input_csv, "r", newline="", encoding="utf-8") as input:
 
+    # Skip already done
+    for i in range(0, SKIP):
+        next(input)
+
     reader = csv.reader(input)
 
     index = -1
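
The new reconcile() recovers resume state after an interrupted run: it reads the last subject written to output_csv, then walks input_csv until that pageID appears in the second column, counting rows as it goes; the count becomes SKIP, the number of already-processed input lines. A rough standalone equivalent using the csv module and context managers (a sketch under the same column assumptions, not the committed code):

    import csv

    def lines_to_skip(input_csv: str, output_csv: str) -> int:
        # Last pageID that actually made it into the output (first column).
        with open(output_csv, newline="", encoding="utf-8") as f_out:
            last_checked = f_out.readlines()[-1].split(",")[0]
        # Count input rows until that pageID reappears in the second column.
        with open(input_csv, newline="", encoding="utf-8") as f_in:
            reader = csv.reader(f_in)
            next(reader)  # skip the header row
            for count, row in enumerate(reader, start=1):
                if row[1] == last_checked:
                    return count
        return 0  # no match (e.g. output holds only its header): skip nothing

Two differences from the committed version are deliberate: csv.reader copes with quoted fields that a plain split(",") would mishandle, and the function returns 0 on a freshly initialized output file, where the committed while loop would never find a match and would eventually raise an IndexError on the empty string readline() returns at end of file.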