Added reconciliation

parent f89dffff75
commit de8c2afceb
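The output CSV is now only initialized when it does not already exist; otherwise a new reconcile() pass compares the existing output against the input CSV, works out how many input rows have already been flushed, and skips that many rows before processing resumes.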
@@ -1,3 +1,4 @@
from pathlib import Path
import pandas as pd

import csv
@@ -16,6 +17,8 @@ CHUNK = 20
# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):

    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
@@ -38,8 +41,12 @@ def get_clean_text(pageIDS: list[str]):

        if extract:
            print(f"Entry FOUND for pageID {pageID}")
            start_parse = time.time()
            extract = extract.strip()
            extract = extract.replace("\n", "")
            end_parse = time.time()
            # Accumulate parse time across pages so the final NO PARSE figure covers the whole chunk
            parsing_time += end_parse - start_parse
            print(f"Time elapsed PARSE: {parsing_time} seconds")
            abstracts[pageID] = extract
        else:
            SKIPPED += 1
@@ -49,6 +56,9 @@ def get_clean_text(pageIDS: list[str]):
            print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts

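The API request itself sits in the elided middle of get_clean_text(). For orientation, a minimal sketch of what a plain-text extract query against this endpoint can look like; the parameters (action=query, prop=extracts, explaintext) are standard MediaWiki/TextExtracts parameters, but the real request, batching, and error handling in this script are not shown in the diff and may differ:

import requests

API_URL = "https://en.wikipedia.org/w/api.php"

def fetch_extract(pageID: str, headers: dict):
    # Hypothetical helper, not part of this commit: one plain-text extract per request.
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": 1,
        "format": "json",
        "pageids": pageID,
    }
    response = requests.get(API_URL, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    page = response.json()["query"]["pages"][str(pageID)]
    # A missing page carries a "missing" flag and no "extract" key.
    return page.get("extract")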
@@ -57,22 +67,64 @@ def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")

# Initialize output CSV
with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
    writer.writeheader()

def reconcile() -> int:

    start = time.time()
    input_file = open(input_csv, "r", newline="", encoding="utf-8")
    output_file = open(output_csv, "r", newline="", encoding="utf-8")

    next(input_file)  # skip the input header
    output_rows = output_file.readlines()
    if len(output_rows) < 2:
        # Only the header has been written so far (e.g. a freshly created file): nothing to skip
        input_file.close()
        output_file.close()
        return 0

    # Last pageID already flushed to the output ("subject" is its first column);
    # the pageID is assumed to sit in the second column of the input CSV
    LAST_CHECKED = output_rows[-1].split(",")[0]
    current_check = input_file.readline().split(",")[1].replace("\n", "")

    index = 1

    while current_check != LAST_CHECKED:
        line = input_file.readline()
        if not line:
            # Reached EOF without finding LAST_CHECKED: fall back to skipping nothing
            index = 0
            break
        current_check = line.split(",")[1].replace("\n", "")
        index += 1

    input_file.close()
    output_file.close()
    end = time.time()

    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index

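reconcile() splits rows on raw commas, which holds only as long as the output's subject column and the input's first column never contain a comma. As a hedge, a sketch of the same scan done with csv.reader, which honors quoting; it assumes, as the code above does, that the output's first column is the subject/pageID and the input's second column is the pageID:

def reconcile_csv() -> int:
    # Hypothetical alternative to reconcile(), using csv.reader so quoted
    # fields containing commas cannot derail the scan.
    with open(output_csv, "r", newline="", encoding="utf-8") as f_out:
        rows = list(csv.reader(f_out))
    if len(rows) < 2:
        return 0  # only the header (or nothing) has been written: nothing to skip
    last_checked = rows[-1][0]  # "subject" column of the last flushed row

    with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
        reader = csv.reader(f_in)
        next(reader)  # skip the input header
        for index, row in enumerate(reader, start=1):
            if row[1] == last_checked:  # pageID assumed to sit in the second input column
                return index
    return 0  # last_checked not found: safest to start from the top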
if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


SKIP = reconcile()

# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
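A small aside on the skip loop above: itertools.islice can consume the already-done lines in one pass instead of calling next() in a Python-level loop; a sketch assuming the same SKIP count and file layout:

from itertools import islice

with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
    # Consume and discard the first SKIP lines, then hand the rest to csv.reader
    for _ in islice(f_in, SKIP):
        pass
    reader = csv.reader(f_in)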