78 lines
1.7 KiB
Python
78 lines
1.7 KiB
Python
|
|
import argparse
|
||
|
|
import csv
|
||
|
|
import sys
|
||
|
|
from typing import Self
|
||
|
|
|
||
|
|
|
||
|
|
class ProgramArgs:
    """Plain holder for the option values this script works with.

    Attributes:
        file: String stored from the ``file`` constructor argument.
        output: String stored from the ``output`` constructor argument.
        treshold: Integer stored from the ``treshold`` constructor argument
            (the attribute keeps the spelling used throughout the file).
    """

    def __init__(self, file: str, output: str, treshold: int) -> None:
        """Copy the three values onto the instance without validation."""
        self.file, self.output, self.treshold = file, output, treshold
|
||
|
|
|
||
|
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse command-line tokens into a ``ProgramArgs`` instance.

    Recognised options:
        --input-file / -i    required, string
        --output-file / -o   required, string
        --treshold / -t      optional, integer, defaults to 1

    Parsing goes through ``parse_known_args``, so tokens that match none of
    the options above are ignored instead of being reported as errors.

    Args:
        args: The command-line tokens to parse.

    Returns:
        A ``ProgramArgs`` built from the input file, output file and
        threshold values, in that order.
    """
    cli = argparse.ArgumentParser()
    option_specs = (
        (("--input-file", "-i"), {"required": True, "type": str}),
        (("--output-file", "-o"), {"required": True, "type": str}),
        (("--treshold", "-t"), {"type": int, "default": 1}),
    )
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)

    # Only the recognised namespace is used; leftover tokens are discarded.
    namespace, _unrecognised = cli.parse_known_args(args)
    return ProgramArgs(
        namespace.input_file,
        namespace.output_file,
        namespace.treshold,
    )
|
||
|
|
|
||
|
|
|
||
|
|
def print_dbpedia(file: str, out: str) -> None:
    """Write one line per distinct DBpedia URI prefix found in *file*.

    Each input line is split on ``/`` and empty segments are discarded. The
    first three remaining segments (scheme, host, first path segment) are
    re-joined with ``/`` to form the prefix. A prefix is emitted only once,
    and only when one of the dot-separated host labels is exactly
    ``dbpedia``.

    Every emitted line has the form ``"<prefix>", "<sub>-db<t>"`` where
    ``<sub>`` is the first host label cut to at most three characters and
    ``<t>`` is the first character of the first path segment.

    Args:
        file: Path of the UTF-8 text file to read, one URI per line.
        out: Path of the UTF-8 file to write; it is overwritten.

    Raises:
        OSError: If either file cannot be opened.
    """
    domain_part = "dbpedia"
    already_parsed: set[str] = set()

    # Context managers guarantee both handles are closed even if a read or
    # write fails part-way through the loop.
    with (
        open(file, "r", encoding="utf-8") as source,
        open(out, mode="w", encoding="utf-8") as sink,
    ):
        for row in source:
            # Strip the line terminator first: otherwise a row that ends
            # right after its third segment would carry the newline into the
            # quoted prefix, and a bare "\n" would count as a segment.
            sections = [part for part in row.rstrip("\n").split("/") if part]

            if len(sections) < 3:
                continue

            # NOTE(review): because empty segments were dropped, the "//"
            # after the scheme collapses to "/" here (e.g.
            # "http:/dbpedia.org/resource"). Kept as-is so the output format
            # does not change -- confirm whether consumers rely on it.
            uri = "/".join(sections[:3])

            if uri in already_parsed:
                continue

            subdomains = sections[1].split(".")
            if domain_part not in subdomains:
                continue

            already_parsed.add(uri)

            # Slicing already handles labels shorter than three characters.
            sub_id = subdomains[0][:3]
            # sections[2] is non-empty (empty segments were filtered out).
            type_initial = sections[2][0]

            sink.write(f"\"{uri}\", \"{sub_id}-db{type_initial}\"\n")
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Hand the parser only the real arguments: sys.argv[0] is the script
    # path, not an option, and should not depend on being silently ignored.
    ARGS = get_args(sys.argv[1:])
    # NOTE(review): ARGS.treshold is parsed but not passed on here.
    print_dbpedia(ARGS.file, ARGS.output)
|