From 4c9c51f9026aa18ad9855b2a03101caf10ae2a96 Mon Sep 17 00:00:00 2001 From: chris-admin Date: Tue, 23 Sep 2025 15:34:53 +0200 Subject: [PATCH] Added barebone to have a splitter --- Scripts/DataCleaning/path_splitter_tree.py | 141 +++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 Scripts/DataCleaning/path_splitter_tree.py diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py new file mode 100644 index 0000000..5d1e0b6 --- /dev/null +++ b/Scripts/DataCleaning/path_splitter_tree.py @@ -0,0 +1,141 @@ +import argparse +import csv +import sys +from typing import Self + +class ProgramArgs: + + def __init__( + self, + file: str, + treshold: int + ): + self.file = file + self.treshold = treshold + + +class Node: + + def __init__( + self, + name: str, + quantity: int = 0, + ): + self.name = name + self.quantity = quantity + self.children : dict[str, Node] = {} + + + @property + def is_leaf(self): + return len(self.children) == 0 + + + def append_child(self, child : list[str]): + + print(child) + KEY = child[0] + + if not self.children.get(KEY): + self.children[KEY] = Node(KEY, 0) + + CHILD = self.children[KEY] + CHILD.quantity += 1 + + if len(child) == 1: + return + + new_children = child[1:] + + CHILD.append_child(new_children[1:]) + + + def __str__(self): + return f"{self.name}: {self.quantity}" + + + + + + +def get_args(args: list[str]) -> ProgramArgs: + + PARSER = argparse.ArgumentParser() + PARSER.add_argument("--input-file", "-i", required=True, type=str) + PARSER.add_argument("--treshold", "-t", type=int, default=1) + parsed_args, _ = PARSER.parse_known_args(args) + + print(parsed_args.input_file) + + return ProgramArgs( + parsed_args.input_file, + parsed_args.treshold + ) # type ignore + + +def tree_like(file: str): + + INDENTATION = "\t" + + properties : dict[str, Node] = {} + + properties["pure"] = Node("pure", 0) + properties["URI"] = Node("uri", 0) + + FILE = open(file, "r", encoding="utf-8") + + for row in FILE: + + sections = row.split("/") + + print(sections) + + if len(sections) < 2: + properties["pure"].append_child(sections) + continue + + properties["URI"].append_child( + sections + ) + + + + FILE.close() + + stack : list[(Node, int)] = [] + + for _, item in properties.items(): + stack.append((item, 0)) + + while len(stack) > 0: + + LAST_ITEM = stack.pop() + + NODE : Node = LAST_ITEM[0] + DEPTH : int = LAST_ITEM[1] + + INDENT : str = INDENTATION * DEPTH + + if NODE.quantity < ARGS.treshold: + continue + + print(f"{INDENT}{NODE}") + + if NODE.is_leaf: + continue + + CHILDREN = [] + + for _, child in NODE.children.items(): + CHILDREN.append( + (child, DEPTH + 1) + ) + + stack.extend( + CHILDREN + ) + + +if __name__ == "__main__": + ARGS = get_args(sys.argv) + tree_like(ARGS.file) \ No newline at end of file