Added reconciliation

parent f89dffff75
commit de8c2afceb
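The output CSV is now only initialized when it does not already exist; otherwise a new reconcile() pass compares the existing output against the input CSV, works out how many input rows have already been flushed, and skips that many rows before processing resumes.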
@@ -1,3 +1,4 @@
from pathlib import Path
import pandas as pd

import csv
@@ -16,6 +17,8 @@ CHUNK = 20
# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):

    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
@@ -38,8 +41,12 @@ def get_clean_text(pageIDS: list[str]):

        if extract:
            print(f"Entry FOUND for pageID {pageID}")
            start_parse = time.time()
            extract = extract.strip()
            extract = extract.replace("\n", "")
            end_parse = time.time()
            # Accumulate parse time across pages so the final NO PARSE figure covers the whole chunk
            parsing_time += end_parse - start_parse
            print(f"Time elapsed PARSE: {parsing_time} seconds")
            abstracts[pageID] = extract
        else:
            SKIPPED += 1
@@ -49,6 +56,9 @@ def get_clean_text(pageIDS: list[str]):
            print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts

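The API request itself sits in the elided middle of get_clean_text(). For orientation, a minimal sketch of what a plain-text extract query against this endpoint can look like; the parameters (action=query, prop=extracts, explaintext) are standard MediaWiki/TextExtracts parameters, but the real request, batching, and error handling in this script are not shown in the diff and may differ:

import requests

API_URL = "https://en.wikipedia.org/w/api.php"

def fetch_extract(pageID: str, headers: dict):
    # Hypothetical helper, not part of this commit: one plain-text extract per request.
    params = {
        "action": "query",
        "prop": "extracts",
        "explaintext": 1,
        "format": "json",
        "pageids": pageID,
    }
    response = requests.get(API_URL, params=params, headers=headers, timeout=30)
    response.raise_for_status()
    page = response.json()["query"]["pages"][str(pageID)]
    # A missing page carries a "missing" flag and no "extract" key.
    return page.get("extract")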
@@ -57,22 +67,64 @@ def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")

# Initialize output CSV
with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
    writer.writeheader()

def reconcile() -> int:

    start = time.time()
    input_file = open(input_csv, "r", newline="", encoding="utf-8")
    output_file = open(output_csv, "r", newline="", encoding="utf-8")

    next(input_file)  # skip the input header
    output_rows = output_file.readlines()
    if len(output_rows) < 2:
        # Only the header has been written so far (e.g. a freshly created file): nothing to skip
        input_file.close()
        output_file.close()
        return 0

    # Last pageID already flushed to the output ("subject" is its first column);
    # the pageID is assumed to sit in the second column of the input CSV
    LAST_CHECKED = output_rows[-1].split(",")[0]
    current_check = input_file.readline().split(",")[1].replace("\n", "")

    index = 1

    while current_check != LAST_CHECKED:
        line = input_file.readline()
        if not line:
            # Reached EOF without finding LAST_CHECKED: fall back to skipping nothing
            index = 0
            break
        current_check = line.split(",")[1].replace("\n", "")
        index += 1

    input_file.close()
    output_file.close()
    end = time.time()

    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index

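reconcile() splits rows on raw commas, which holds only as long as the output's subject column and the input's first column never contain a comma. As a hedge, a sketch of the same scan done with csv.reader, which honors quoting; it assumes, as the code above does, that the output's first column is the subject/pageID and the input's second column is the pageID:

def reconcile_csv() -> int:
    # Hypothetical alternative to reconcile(), using csv.reader so quoted
    # fields containing commas cannot derail the scan.
    with open(output_csv, "r", newline="", encoding="utf-8") as f_out:
        rows = list(csv.reader(f_out))
    if len(rows) < 2:
        return 0  # only the header (or nothing) has been written: nothing to skip
    last_checked = rows[-1][0]  # "subject" column of the last flushed row

    with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
        reader = csv.reader(f_in)
        next(reader)  # skip the input header
        for index, row in enumerate(reader, start=1):
            if row[1] == last_checked:  # pageID assumed to sit in the second input column
                return index
    return 0  # last_checked not found: safest to start from the top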
if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


SKIP = reconcile()

# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
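A small aside on the skip loop above: itertools.islice can consume the already-done lines in one pass instead of calling next() in a Python-level loop; a sketch assuming the same SKIP count and file layout:

from itertools import islice

with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
    # Consume and discard the first SKIP lines, then hand the rest to csv.reader
    for _ in islice(f_in, SKIP):
        pass
    reader = csv.reader(f_in)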