From 25f401b57733c6053e6fa1160d7df3080d76dbbc Mon Sep 17 00:00:00 2001 From: Christian Risi <75698846+CnF-Gris@users.noreply.github.com> Date: Tue, 23 Sep 2025 17:58:08 +0200 Subject: [PATCH] Fixed bug for parsing and added CLI functionalities --- Scripts/DataCleaning/path_splitter_tree.py | 86 +++++++++++----------- 1 file changed, 42 insertions(+), 44 deletions(-) diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py index 5d1e0b6..51df619 100644 --- a/Scripts/DataCleaning/path_splitter_tree.py +++ b/Scripts/DataCleaning/path_splitter_tree.py @@ -3,14 +3,12 @@ import csv import sys from typing import Self + class ProgramArgs: - def __init__( - self, - file: str, - treshold: int - ): + def __init__(self, file: str, output: str, treshold: int): self.file = file + self.output = output self.treshold = treshold @@ -23,61 +21,63 @@ class Node: ): self.name = name self.quantity = quantity - self.children : dict[str, Node] = {} - + self.children: dict[str, Node] = {} @property def is_leaf(self): return len(self.children) == 0 + def append_child(self, child: list[str]): - def append_child(self, child : list[str]): - - print(child) + # print(child) KEY = child[0] if not self.children.get(KEY): self.children[KEY] = Node(KEY, 0) CHILD = self.children[KEY] - CHILD.quantity += 1 + self.quantity += 1 if len(child) == 1: return new_children = child[1:] - CHILD.append_child(new_children[1:]) - + CHILD.append_child(new_children) def __str__(self): - return f"{self.name}: {self.quantity}" - - - - + return f"{self.name}/ - {self.quantity}" def get_args(args: list[str]) -> ProgramArgs: PARSER = argparse.ArgumentParser() PARSER.add_argument("--input-file", "-i", required=True, type=str) + PARSER.add_argument("--output-file", "-o", required=True, type=str) PARSER.add_argument("--treshold", "-t", type=int, default=1) parsed_args, _ = PARSER.parse_known_args(args) - print(parsed_args.input_file) + # print(parsed_args.input_file) + + return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore + + +def get_debug_args() -> ProgramArgs: + + FILE = "./Assets/Dataset/Tmp/reverse-rel.txt" + TRESHOLD = 1 return ProgramArgs( - parsed_args.input_file, - parsed_args.treshold - ) # type ignore + FILE, + TRESHOLD + ) -def tree_like(file: str): +def tree_like(file: str, out: str): - INDENTATION = "\t" + INDENTATION = " " - properties : dict[str, Node] = {} + properties: dict[str, Node] = {} properties["pure"] = Node("pure", 0) properties["URI"] = Node("uri", 0) @@ -87,39 +87,38 @@ def tree_like(file: str): for row in FILE: sections = row.split("/") + sections = list(filter(lambda item: item != "", sections)) - print(sections) + # print(sections) - if len(sections) < 2: + if sections[0] != "http:" and sections[0] != "https:": properties["pure"].append_child(sections) continue - properties["URI"].append_child( - sections - ) - - + properties["URI"].append_child(sections) FILE.close() - stack : list[(Node, int)] = [] + stack: list[tuple[Node, int]] = [] for _, item in properties.items(): stack.append((item, 0)) + OUT = open(out, mode="w", encoding="utf-8") + while len(stack) > 0: LAST_ITEM = stack.pop() - NODE : Node = LAST_ITEM[0] - DEPTH : int = LAST_ITEM[1] + NODE: Node = LAST_ITEM[0] + DEPTH: int = LAST_ITEM[1] - INDENT : str = INDENTATION * DEPTH + INDENT: str = INDENTATION * DEPTH if NODE.quantity < ARGS.treshold: continue - print(f"{INDENT}{NODE}") + OUT.write(f"{INDENT}- {NODE}\n") if NODE.is_leaf: continue @@ -127,15 +126,14 @@ def tree_like(file: str): CHILDREN = [] for _, child in NODE.children.items(): - CHILDREN.append( - (child, DEPTH + 1) - ) + CHILDREN.append((child, DEPTH + 1)) - stack.extend( - CHILDREN - ) + stack.extend(CHILDREN) + + OUT.close() if __name__ == "__main__": ARGS = get_args(sys.argv) - tree_like(ARGS.file) \ No newline at end of file + # ARGS = get_debug_args() + tree_like(ARGS.file, ARGS.output)