"""Extract distinct DBpedia URI prefixes from a text file of URIs.

Each input row is split on "/"; the scheme, host and first path segment form
a prefix. Every distinct prefix whose host contains a "dbpedia" label is
written to the output file together with a short generated label.
"""

import argparse
import sys

# Host label that must be present (between dots) for a row to be kept.
_DOMAIN_PART = "dbpedia"


class ProgramArgs:
    """Plain container for the parsed command-line options.

    Attributes are assigned verbatim from the constructor arguments:
    ``file`` (input path), ``output`` (output path) and ``treshold``
    (integer option; the spelling is kept because it is part of the
    existing interface).
    """

    def __init__(self, file: str, output: str, treshold: int):
        self.file = file
        self.output = output
        self.treshold = treshold


def get_args(args: list[str]) -> ProgramArgs:
    """Parse command-line style arguments into a ``ProgramArgs``.

    Args:
        args: Argument strings. Unrecognised entries are ignored because
            ``parse_known_args`` is used, so passing the full ``sys.argv``
            (program name included) also works.

    Returns:
        A ``ProgramArgs`` built from ``--input-file``/``-i``,
        ``--output-file``/``-o`` (both required) and ``--treshold``/``-t``
        (int, default 1).

    Raises:
        SystemExit: Raised by argparse when a required option is missing or
            a value cannot be converted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    parsed_args, _ = parser.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.treshold,
    )


def print_dbpedia(file: str, out: str):
    """Write one line per distinct DBpedia URI prefix found in ``file``.

    A row such as ``http://fr.dbpedia.org/resource/Paris`` yields the prefix
    ``http://fr.dbpedia.org/resource`` and the output line
    ``"http://fr.dbpedia.org/resource/", "fr-dbr"``: the label is the first
    host label cut to three characters, ``-db``, and the first character of
    the first path segment.

    Rows with fewer than three non-empty "/"-separated segments, rows whose
    host has no ``dbpedia`` label, and prefixes already written are skipped.

    Args:
        file: Path of the UTF-8 text file to read, one URI per row.
        out: Path of the UTF-8 text file to create or overwrite.
    """
    already_parsed: set[str] = set()

    # Context managers close both handles even if an error interrupts the loop.
    with open(file, "r", encoding="utf-8") as source, \
            open(out, mode="w", encoding="utf-8") as sink:
        for row in source:
            # Strip first: otherwise the trailing newline becomes part of the
            # last segment and leaks into the prefix or the type character.
            sections = [part for part in row.strip().split("/") if part]
            if len(sections) < 3:
                continue

            scheme, domain, first_segment = sections[:3]
            uri = f"{scheme}//{domain}/{first_segment}"
            if uri in already_parsed:
                continue

            subdomains = domain.split(".")
            if _DOMAIN_PART not in subdomains:
                continue

            already_parsed.add(uri)
            sub_id = subdomains[0][:3]
            type_char = first_segment[0]
            sink.write(f'"{uri}/", "{sub_id}-db{type_char}"\n')


if __name__ == "__main__":
    # Skip the program name; only real options are handed to the parser.
    ARGS = get_args(sys.argv[1:])
    print_dbpedia(ARGS.file, ARGS.output)