Added script to extract DBpedia URIs
This commit is contained in:
parent
f696f5950b
commit
59796c37cb
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
77
Scripts/DataCleaning/dbpedia-uri.py
Normal file
@ -0,0 +1,77 @@
|
||||
import argparse
|
||||
import csv
|
||||
import sys
|
||||
from typing import Self
|
||||
|
||||
|
||||
class ProgramArgs:
    """Plain holder for the command-line options of this script.

    Attributes:
        file: Value given for the input file option.
        output: Value given for the output file option.
        treshold: Integer threshold option (name spelled as in the CLI flag).
    """

    def __init__(self, file: str, output: str, treshold: int):
        # Store the three options unchanged; no validation is performed here.
        self.file, self.output, self.treshold = file, output, treshold
|
||||
|
||||
def get_args(args: list[str]) -> ProgramArgs:
    """Parse command-line tokens into a ProgramArgs instance.

    Recognised options:
        --input-file / -i    required, str
        --output-file / -o   required, str
        --treshold / -t      int, defaults to 1

    Tokens that match none of these options are discarded, because the
    parser is run through ``parse_known_args``.

    Args:
        args: Command-line tokens to parse.

    Returns:
        ProgramArgs built from the input file, output file and threshold.
    """
    option_table = (
        ("--input-file", "-i", {"required": True, "type": str}),
        ("--output-file", "-o", {"required": True, "type": str}),
        ("--treshold", "-t", {"type": int, "default": 1}),
    )

    cli = argparse.ArgumentParser()
    for long_flag, short_flag, settings in option_table:
        cli.add_argument(long_flag, short_flag, **settings)

    # Index 0 is the populated namespace; index 1 (unrecognised tokens) is
    # intentionally dropped.
    namespace = cli.parse_known_args(args)[0]

    return ProgramArgs(
        namespace.input_file,
        namespace.output_file,
        namespace.treshold,
    )
|
||||
|
||||
|
||||
def print_dbpedia(file: str, out: str):
    """Write one abbreviation line per distinct DBpedia URI prefix in *file*.

    Each input line is split on "/" and empty pieces are discarded, so a line
    starting with ``http://it.dbpedia.org/resource/Rome`` yields the sections
    ``["http:", "it.dbpedia.org", "resource", "Rome"]``. The first three
    sections, re-joined with "/", form the prefix. A prefix is kept only when
    one of the dot-separated labels of the second section is exactly
    ``dbpedia``, and each prefix is written at most once.

    Output line format::

        "<prefix>", "<sub>-db<t>"

    where ``<sub>`` is the first label of the second section cut to at most
    three characters and ``<t>`` is the first character of the third section.

    Args:
        file: Path of the UTF-8 text file to read, one candidate per line.
        out: Path of the UTF-8 text file to create or overwrite.

    Raises:
        OSError: If either file cannot be opened.
    """
    domain_part = "dbpedia"
    already_parsed: set[str] = set()

    # Context managers guarantee both handles are closed even when a read,
    # decode or write error interrupts the loop.
    with open(file, "r", encoding="utf-8") as source, \
            open(out, mode="w", encoding="utf-8") as sink:
        for row in source:
            # Drop the line terminator before splitting; otherwise a line with
            # exactly three sections would carry "\n" into the prefix and
            # into the de-duplication key.
            sections = [part for part in row.rstrip("\r\n").split("/") if part]

            if len(sections) < 3:
                continue

            # NOTE(review): because empty pieces are filtered out, the "//"
            # after the scheme collapses to a single "/" in the emitted
            # prefix. Kept as-is for output compatibility; confirm intended.
            uri = "/".join(sections[:3])

            if uri in already_parsed:
                continue

            subdomains = sections[1].split(".")
            if domain_part not in subdomains:
                continue

            already_parsed.add(uri)

            sub_id = subdomains[0][:3]
            # sections[2] is non-empty thanks to the filter above.
            kind = sections[2][0]

            sink.write(f"\"{uri}\", \"{sub_id}-db{kind}\"\n")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: parse the process arguments, then run the
    # extraction using the parsed input and output paths.
    ARGS = get_args(sys.argv)
    print_dbpedia(file=ARGS.file, out=ARGS.output)
|
||||
Loading…
x
Reference in New Issue
Block a user