NanoSocrates/Scripts/DataCleaning/path_splitter_tree.py

141 lines
3.0 KiB
Python
Raw Normal View History

2025-09-23 15:34:53 +02:00
import argparse
import csv
import sys
from typing import Self
2025-09-23 15:34:53 +02:00
class ProgramArgs:
    """Plain container for the options this script runs with."""

    def __init__(self, file: str, output: str, treshold: int):
        # Path of the input text file that is read row by row.
        self.file = file
        # Path of the file the rendered tree is written to.
        self.output = output
        # Minimum node quantity required for a node to be written out.
        # NOTE: the "treshold" spelling is part of the attribute name and of
        # the command-line flag, so it is kept as-is.
        self.treshold = treshold
class Node:
    """One segment of a path, with its child segments keyed by name.

    ``quantity`` is incremented on a node every time ``append_child`` walks
    through it to one of its children. A node that never gets children of
    its own therefore keeps the quantity it was created with.
    """

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        # Child nodes indexed by their segment name.
        self.children: dict[str, "Node"] = {}

    @property
    def is_leaf(self):
        """Return True when this node has no children."""
        return not self.children

    def append_child(self, child: list[str]):
        """Insert a path, given as an ordered list of segments, below this node.

        Missing nodes are created with quantity 0. For every segment, the
        node that owns it (its parent) has its quantity increased by one.
        An empty list is a no-op.
        """
        # Iterative walk: avoids one recursive call per path segment and
        # makes the empty-path case fall out naturally.
        current = self
        for key in child:
            nxt = current.children.get(key)
            if nxt is None:
                nxt = Node(key, 0)
                current.children[key] = nxt
            current.quantity += 1
            current = nxt

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"
2025-09-23 15:34:53 +02:00
def get_args(args: list[str]) -> ProgramArgs:
    """Parse command-line arguments into a ``ProgramArgs``.

    Args:
        args: Argument strings to parse. Unrecognised entries are ignored
            because ``parse_known_args`` is used, so a leading program name
            (as in ``sys.argv``) is tolerated.

    Returns:
        A ``ProgramArgs`` built from ``--input-file``, ``--output-file`` and
        ``--treshold`` (default 1).

    Raises:
        SystemExit: raised by argparse when a required option is missing or
            a value cannot be converted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    # Second element holds the unrecognised arguments; deliberately dropped.
    parsed_args, _ = parser.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.output_file,
        parsed_args.treshold,
    )
def get_debug_args() -> ProgramArgs:
    """Return a fixed ``ProgramArgs`` for running the script without a CLI."""
    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    # ProgramArgs needs an output path as its second argument; without it the
    # threshold would be passed as ``output`` and the call would fail for a
    # missing ``treshold``.
    # NOTE(review): this output location is a placeholder next to the input
    # file - adjust if a different debug destination is expected.
    OUTPUT = "./Assets/Dataset/Tmp/reverse-rel-tree.txt"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
        OUTPUT,
        TRESHOLD,
    )
2025-09-23 15:34:53 +02:00
def tree_like(file: str, out: str, treshold: int | None = None):
    """Build a prefix tree from '/'-separated rows and write it as a list.

    Each row of ``file`` is split on "/" and empty segments are dropped.
    Rows whose first segment is "http:" or "https:" go under the "URI" root,
    every other row under the "pure" root. The tree is then written to
    ``out``, one "- <node>" line per node, indented by depth.

    Args:
        file: Path of the UTF-8 text file to read.
        out: Path of the UTF-8 text file to write.
        treshold: Nodes whose quantity is below this value are skipped
            together with their whole subtree. When omitted, the value is
            taken from the module-level ``ARGS.treshold`` to stay compatible
            with callers that only pass the two paths.

    Raises:
        NameError: if ``treshold`` is omitted and no module-level ``ARGS``
            exists.
        OSError: if either file cannot be opened.
    """
    INDENTATION = " "

    if treshold is None:
        treshold = ARGS.treshold

    properties: dict[str, Node] = {
        "pure": Node("pure", 0),
        "URI": Node("uri", 0),
    }

    with open(file, "r", encoding="utf-8") as source:
        # TODO: Change here so it takes single URI from a CSV file
        for row in source:
            # Strip the line terminator first, otherwise the last segment of
            # every row carries a "\n" into the node name.
            sections = [part for part in row.strip().split("/") if part]

            # Blank rows or rows made only of slashes have nothing to insert.
            if not sections:
                continue

            root_key = "URI" if sections[0] in ("http:", "https:") else "pure"
            properties[root_key].append_child(sections)

    # Depth-first traversal with an explicit stack of (node, depth) pairs.
    stack: list[tuple[Node, int]] = [(node, 0) for node in properties.values()]

    with open(out, mode="w", encoding="utf-8") as sink:
        while stack:
            node, depth = stack.pop()

            # Skipping here also prunes the subtree: children are never pushed.
            if node.quantity < treshold:
                continue

            sink.write(f"{INDENTATION * depth}- {node}\n")

            stack.extend((child, depth + 1) for child in node.children.values())
2025-09-23 15:34:53 +02:00
if __name__ == "__main__":
    # ARGS must stay a module-level name: tree_like reads ARGS.treshold.
    # The program name is sliced off; get_args accepts either form.
    ARGS = get_args(sys.argv[1:])
    # For a local run without CLI arguments, use instead:
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.output)