diff --git a/Scripts/DataCleaning/dbpedia-uri.py b/Scripts/DataCleaning/dbpedia-uri.py new file mode 100644 index 0000000..ceafe87 --- /dev/null +++ b/Scripts/DataCleaning/dbpedia-uri.py @@ -0,0 +1,77 @@ +import argparse +import csv +import sys +from typing import Self + + +class ProgramArgs: + + def __init__(self, file: str, output: str, treshold: int): + self.file = file + self.output = output + self.treshold = treshold + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "-i", required=True, type=str) + PARSER.add_argument("--output-file", "-o", required=True, type=str) + PARSER.add_argument("--treshold", "-t", type=int, default=1) + parsed_args, _ = PARSER.parse_known_args(args) + + # print(parsed_args.input_file) + + return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore + + +def print_dbpedia(file: str, out: str): + + + FILE = open(file, "r", encoding="utf-8") + OUT = open(out, mode="w", encoding="utf-8") + + DOMAIN_PART = "dbpedia" + + already_parsed : set[str] = set() + + + for row in FILE: + + sections = row.split("/") + sections = list(filter(lambda item: item != "", sections)) + + # print(sections) + + if len(sections) < 3: + continue + + URI = "/".join(sections[:3]) + + if URI in already_parsed: + continue + + DOMAIN = sections[1] + SUBDOMAINS = DOMAIN.split(".") + TYPE = sections[2][0] + + if DOMAIN_PART not in SUBDOMAINS: + continue + + already_parsed.add(URI) + + SUB_ID = SUBDOMAINS[0] + + if len(SUB_ID) > 3: + SUB_ID = SUB_ID[:3] + + OUT.write(f"\"{URI}\", \"{SUB_ID}-db{TYPE}\"\n") + + + FILE.close() + OUT.close() + + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + # ARGS = get_debug_args() + print_dbpedia(ARGS.file, ARGS.output)