Added reconciliation
parent f89dffff75
commit de8c2afceb
@@ -1,3 +1,4 @@
+from pathlib import Path
 import pandas as pd
 
 import csv
@@ -16,6 +17,8 @@ CHUNK = 20
 # Function to get clean full text from Wikipedia PageID
 def get_clean_text(pageIDS: list[str]):
 
+    parsing_time = 0
+    start_full = time.time()
     API_URL = "https://en.wikipedia.org/w/api.php"
     headers = {
         "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
     }
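
For context on the unchanged body of get_clean_text: the API_URL and User-Agent above point at the MediaWiki Action API, whose TextExtracts module caps plain-text extracts at 20 pages per request, which is presumably why CHUNK = 20. A minimal sketch of that kind of batched request (hypothetical helper, assuming the requests library; the actual fetch code is not part of this diff):

    import requests

    API_URL = "https://en.wikipedia.org/w/api.php"
    HEADERS = {
        "User-Agent": "CoolBot/0.0 (https://example.org/coolbot/; coolbot@example.org)"
    }

    # Hypothetical: fetch plain-text extracts for up to 20 pageIDs in one call.
    def fetch_extracts(page_ids: list[str]) -> dict:
        params = {
            "action": "query",
            "prop": "extracts",
            "explaintext": 1,        # plain text instead of HTML
            "exlimit": "max",        # TextExtracts allows at most 20 extracts per request
            "format": "json",
            "pageids": "|".join(page_ids),
        }
        resp = requests.get(API_URL, params=params, headers=HEADERS)
        resp.raise_for_status()
        return resp.json()["query"]["pages"]
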
@@ -38,8 +41,12 @@ def get_clean_text(pageIDS: list[str]):
 
             if extract:
                 print(f"Entry FOUND for pageID {pageID}")
+                start_parse = time.time()
                 extract = extract.strip()
                 extract = extract.replace("\n", "")
+                end_parse = time.time()
+                parsing_time = end_parse - start_parse
+                print(f"Time elapsed PARSE: {parsing_time} seconds")
                 abstracts[pageID] = extract
             else:
                 SKIPPED += 1
@@ -49,6 +56,9 @@ def get_clean_text(pageIDS: list[str]):
                 print(f"Page MISSING for pageID {pageID}")
 
         print(f"Chunk done - Skipped {SKIPPED}")
+    end_full = time.time()
+
+    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
     return abstracts
 
 
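A note on the timing instrumentation added above: parsing_time is reassigned on every page, so the final "NO PARSE" figure subtracts only the last page's parse cost from the full elapsed time. If the intent is to exclude all parsing work, the timer needs to accumulate; a minimal illustrative sketch (stand-in data, not the committed code):

    import time

    extracts = ["  first\nextract ", " second\nextract "]  # stand-ins for API results

    parsing_time = 0.0
    for extract in extracts:  # stand-in for the per-page loop
        start_parse = time.perf_counter()
        extract = extract.strip().replace("\n", "")
        parsing_time += time.perf_counter() - start_parse  # accumulate instead of overwriting
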
@@ -57,22 +67,64 @@ def flush(movie_ids):
 
     abstracts = get_clean_text(movie_ids)
 
+    start = time.time()
     with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
         writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
 
         for id, text in abstracts.items():
             writer.writerow({"subject": id, "text": text})
+    end = time.time()
+
+    print(f"Time elapsed WRITE: {end - start} seconds")
 
 
-# Initialize output CSV
-with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
-    writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
-    writer.writeheader()
+def reconcile() -> int:
+
+    start = time.time()
+    input_file = open(input_csv, "r", newline="", encoding="utf-8")
+    output_file = open(output_csv, "r", newline="", encoding="utf-8")
+
+    next(input_file)
+    LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
+    current_check = input_file.readline().split(",")[1]
+
+    index = 1
+
+    while current_check != LAST_CHECKED:
+        current_check = input_file.readline().split(",")[1].replace("\n", "")
+        index += 1
+
+    input_file.close()
+    output_file.close()
+    end = time.time()
+
+
+    print(f"Time elapsed RECONCILE: {end - start} seconds")
+
+    print(f"FOUND, we need to skip {index} lines")
+
+    return index
+
+
+if not Path(output_csv).is_file():
+    # Initialize output CSV
+    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
+        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
+        writer.writeheader()
+
+
+SKIP = reconcile()
+
+
 # Read CSV in RAM
 with open(input_csv, "r", newline="", encoding="utf-8") as input:
 
+    # Skip already done
+    for i in range(0, SKIP):
+        next(input)
+
     reader = csv.reader(input)
 
     index = -1
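
The new reconcile() recovers resume state after an interrupted run: it reads the last subject written to output_csv, then walks input_csv until that pageID appears in the second column, counting rows as it goes; the count becomes SKIP, the number of already-processed input lines. A rough standalone equivalent using the csv module and context managers (a sketch under the same column assumptions, not the committed code):

    import csv

    def lines_to_skip(input_csv: str, output_csv: str) -> int:
        # Last pageID that actually made it into the output (first column).
        with open(output_csv, newline="", encoding="utf-8") as f_out:
            last_checked = f_out.readlines()[-1].split(",")[0]
        # Count input rows until that pageID reappears in the second column.
        with open(input_csv, newline="", encoding="utf-8") as f_in:
            reader = csv.reader(f_in)
            next(reader)  # skip the header row
            for count, row in enumerate(reader, start=1):
                if row[1] == last_checked:
                    return count
        return 0  # no match (e.g. output holds only its header): skip nothing

Two differences from the committed version are deliberate: csv.reader copes with quoted fields that a plain split(",") would mishandle, and the function returns 0 on a freshly initialized output file, where the committed while loop would never find a match and would eventually raise an IndexError on the empty string readline() returns at end of file.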