78 lines
1.7 KiB
Python
Raw Normal View History

2025-09-24 13:49:29 +02:00
import argparse
import csv
import sys
from typing import Self
class ProgramArgs:
    """Value holder for the options accepted on the command line.

    Attributes:
        file: Value of ``--input-file``; handed to ``open`` for reading.
        output: Value of ``--output-file``; handed to ``open`` for writing.
        treshold: Integer value of ``--treshold``. The spelling is part of
            the public interface and is kept as is. Nothing in the visible
            code reads this attribute.
    """

    def __init__(self, file: str, output: str, treshold: int) -> None:
        self.file: str = file
        self.output: str = output
        self.treshold: int = treshold
def get_args(args: list[str]) -> ProgramArgs:
    """Turn a raw argument vector into a ``ProgramArgs`` instance.

    Recognised flags:
        ``--input-file`` / ``-i``   (required, str)
        ``--output-file`` / ``-o``  (required, str)
        ``--treshold`` / ``-t``     (int, defaults to 1)

    Args:
        args: Tokens to parse. ``parse_known_args`` is used, so any token
            argparse does not recognise is collected separately and
            discarded instead of causing an error.

    Returns:
        A ``ProgramArgs`` populated from the recognised flags.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    # The second element holds the unrecognised tokens; it is ignored.
    namespace, _unrecognised = parser.parse_known_args(args)

    return ProgramArgs(
        file=namespace.input_file,
        output=namespace.output_file,
        treshold=namespace.treshold,
    )
def print_dbpedia(file: str, out: str):
    """Write one line per distinct DBpedia URI prefix found in ``file``.

    Each input line is split on ``/`` and empty pieces are dropped. The
    first three remaining pieces are re-joined with ``/`` to form the URI
    prefix. A prefix is emitted only the first time it is seen, and only
    when one of the dot-separated labels of its second piece (the host) is
    exactly ``dbpedia``.

    Output line format (UTF-8)::

        "<prefix>", "<sub>-db<t>"

    where ``<sub>`` is the first host label cut to at most three characters
    and ``<t>`` is the first character of the third piece.

    NOTE: because empty pieces are dropped before re-joining, a ``//`` in
    the input (e.g. after the scheme) appears as a single ``/`` in the
    emitted prefix. This is kept unchanged for output compatibility.

    Args:
        file: Path of the UTF-8 text file to read, one entry per line.
        out: Path of the file to create or overwrite with the result.

    Raises:
        OSError: If either file cannot be opened.
    """
    domain_part = "dbpedia"
    already_parsed: set[str] = set()

    # Context managers guarantee both handles are closed even when an
    # exception interrupts the loop.
    with open(file, "r", encoding="utf-8") as source, \
            open(out, mode="w", encoding="utf-8") as sink:
        for row in source:
            # Drop the line terminator so it cannot leak into the last
            # piece (and from there into the emitted prefix or type char).
            pieces = [part for part in row.rstrip("\n").split("/") if part]
            if len(pieces) < 3:
                continue

            uri = "/".join(pieces[:3])
            if uri in already_parsed:
                continue

            host_labels = pieces[1].split(".")
            if domain_part not in host_labels:
                continue

            already_parsed.add(uri)
            sub_id = host_labels[0][:3]
            kind = pieces[2][0]
            sink.write(f'"{uri}", "{sub_id}-db{kind}"\n')
if __name__ == "__main__":
    # Script entry point: parse the process arguments, then run the
    # extraction from the input path to the output path.
    ARGS = get_args(sys.argv)
    print_dbpedia(file=ARGS.file, out=ARGS.output)