NanoSocrates/Scripts/DataCleaning/path_splitter_tree.py

import argparse
import csv
import sys
from typing import Self


class ProgramArgs:
    """Plain container for the settings the script runs with."""

    def __init__(self, file: str, output: str, treshold: int):
        # Path of the input file to read.
        self.file: str = file
        # Path of the file the tree is written to.
        self.output: str = output
        # Minimum quantity used when filtering nodes
        # (the "treshold" spelling matches the CLI flag).
        self.treshold: int = treshold


class Node:
    """One path segment stored as a node of a prefix tree.

    A node knows its own segment text, a counter, and its direct children
    keyed by segment text.
    """

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        # Segment text this node represents.
        self.name = name
        # Incremented once for every non-empty path appended below this node
        # (see append_child); a node that only ever terminates paths stays
        # at its initial value.
        self.quantity = quantity
        # Direct children, keyed by their segment text.
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self) -> bool:
        """Return True when the node has no children."""
        return not self.children

    def append_child(self, child: list[str]) -> None:
        """Insert a path of segments below this node, creating nodes as needed.

        Every node that receives a further segment (this node and each
        intermediate node along the path) has its ``quantity`` increased
        by one; the node for the last segment is created if missing but
        not incremented.

        Args:
            child: Ordered path segments to insert. An empty list is a
                no-op (previously it raised ``IndexError``).
        """
        # Walk down iteratively: no per-level list slicing and no dependence
        # on the interpreter recursion limit for very deep paths.
        current = self
        for key in child:
            next_node = current.children.get(key)
            if next_node is None:
                next_node = Node(key, 0)
                current.children[key] = next_node

            current.quantity += 1
            current = next_node

    def __str__(self) -> str:
        """Return ``"<name>/ - <quantity>"``."""
        return f"{self.name}/ - {self.quantity}"


def get_args(args: list[str]) -> ProgramArgs:
    """Parse a command-line argument list into a ProgramArgs.

    Recognized options are ``--input-file``/``-i`` (required),
    ``--output-file``/``-o`` (required) and ``--treshold``/``-t``
    (int, default 1). Anything the parser does not recognize is
    discarded rather than reported.

    Args:
        args: Raw argument strings to parse.

    Returns:
        ProgramArgs filled with the input file, output file and treshold.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--input-file", "-i", required=True, type=str)
    cli.add_argument("--output-file", "-o", required=True, type=str)
    cli.add_argument("--treshold", "-t", type=int, default=1)

    # parse_known_args yields (namespace, leftovers); leftovers are dropped.
    namespace = cli.parse_known_args(args)[0]

    return ProgramArgs(
        namespace.input_file,
        namespace.output_file,
        namespace.treshold,
    )


def get_debug_args() -> ProgramArgs:
    """Return hard-coded arguments for running the script without a CLI.

    Returns:
        ProgramArgs pointing at the debug input file, a debug output file
        and a treshold of 1.
    """
    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    # ProgramArgs requires an output path; the previous two-argument call
    # raised TypeError. NOTE(review): placeholder location next to the
    # input file - adjust if a different debug output is preferred.
    OUTPUT = "./Assets/Dataset/Tmp/reverse-rel-tree.txt"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
        OUTPUT,
        TRESHOLD,
    )


def tree_like(file: str, out: str, treshold: int | None = None):
    """Write an indented prefix tree of the "/"-separated rows of ``file``.

    Each row is split on "/", empty segments are dropped, and the remaining
    segments are inserted under one of two roots: "uri" when the first
    segment is "http:" or "https:", "pure" otherwise. The trees are then
    written depth-first, one ``"- <node>"`` line per node, indented four
    spaces per level. A node whose quantity is below the treshold is
    omitted together with its whole subtree.

    Args:
        file: Path of the UTF-8 text file to read, one path per row.
        out: Path of the UTF-8 text file to write (overwritten).
        treshold: Minimum quantity a node needs in order to be written.
            When omitted, the module-level ``ARGS.treshold`` is used if
            the script defined ``ARGS``; otherwise 1.
    """
    INDENTATION = "    "

    if treshold is None:
        # Keep the script's behavior (treshold taken from the global ARGS)
        # while letting the function be imported and called without it.
        cli_args = globals().get("ARGS")
        treshold = cli_args.treshold if cli_args is not None else 1

    properties: dict[str, Node] = {
        "pure": Node("pure", 0),
        "URI": Node("uri", 0),
    }

    # TODO: Change here so it takes single URI from a CSV file
    with open(file, "r", encoding="utf-8") as source:
        for row in source:
            # NOTE: rows are not stripped, so the last segment of each row
            # keeps its trailing newline.
            sections = [item for item in row.split("/") if item != ""]

            # A row made only of "/" separators has nothing to insert.
            if not sections:
                continue

            root = "URI" if sections[0] in ("http:", "https:") else "pure"
            properties[root].append_child(sections)

    # Explicit stack of (node, depth) pairs for the depth-first walk; being
    # LIFO, the last root / last child pushed is written first.
    stack: list[tuple[Node, int]] = [
        (item, 0) for item in properties.values()
    ]

    with open(out, mode="w", encoding="utf-8") as sink:
        while stack:
            node, depth = stack.pop()

            # Skipping here also prunes every descendant of the node.
            if node.quantity < treshold:
                continue

            sink.write(f"{INDENTATION * depth}- {node}\n")

            stack.extend(
                (child, depth + 1) for child in node.children.values()
            )


if __name__ == "__main__":
    # Script entry point. ARGS is bound at module level on purpose:
    # tree_like looks up ARGS.treshold as a global.
    ARGS = get_args(sys.argv)
    # For a hard-coded debug run, swap in: ARGS = get_debug_args()
    tree_like(file=ARGS.file, out=ARGS.output)
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`import argparse`
			`import csv`
			`import sys`
			`from typing import Self`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`class ProgramArgs:`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`def __init__(self, file: str, output: str, treshold: int):`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`self.file = file`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`self.output = output`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`self.treshold = treshold`


			`class Node:`

			`def __init__(`
			`self,`
			`name: str,`
			`quantity: int = 0,`
			`):`
			`self.name = name`
			`self.quantity = quantity`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`self.children: dict[str, Node] = {}`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`@property`
			`def is_leaf(self):`
			`return len(self.children) == 0`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`def append_child(self, child: list[str]):`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`# print(child)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`KEY = child[0]`

			`if not self.children.get(KEY):`
			`self.children[KEY] = Node(KEY, 0)`

			`CHILD = self.children[KEY]`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`self.quantity += 1`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`if len(child) == 1:`
			`return`

			`new_children = child[1:]`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`CHILD.append_child(new_children)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`def __str__(self):`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`return f"{self.name}/ - {self.quantity}"`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00

			`def get_args(args: list[str]) -> ProgramArgs:`

			`PARSER = argparse.ArgumentParser()`
			`PARSER.add_argument("--input-file", "-i", required=True, type=str)`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`PARSER.add_argument("--output-file", "-o", required=True, type=str)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`PARSER.add_argument("--treshold", "-t", type=int, default=1)`
			`parsed_args, _ = PARSER.parse_known_args(args)`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`# print(parsed_args.input_file)`

			`return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore`


			`def get_debug_args() -> ProgramArgs:`

			`FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"`
			`TRESHOLD = 1`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`return ProgramArgs(`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`FILE,`
			`TRESHOLD`
			`)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`def tree_like(file: str, out: str):`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`INDENTATION = " "`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`properties: dict[str, Node] = {}`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`properties["pure"] = Node("pure", 0)`
			`properties["URI"] = Node("uri", 0)`

			`FILE = open(file, "r", encoding="utf-8")`

Added todo 2025-09-25 12:00:26 +02:00			`# TODO: Change here so it takes single URI from a CSV file`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`for row in FILE:`

			`sections = row.split("/")`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`sections = list(filter(lambda item: item != "", sections))`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`# print(sections)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`if sections[0] != "http:" and sections[0] != "https:":`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`properties["pure"].append_child(sections)`
			`continue`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`properties["URI"].append_child(sections)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`FILE.close()`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`stack: list[tuple[Node, int]] = []`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`for _, item in properties.items():`
			`stack.append((item, 0))`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`OUT = open(out, mode="w", encoding="utf-8")`

Added barebone to have a splitter 2025-09-23 15:34:53 +02:00			`while len(stack) > 0:`

			`LAST_ITEM = stack.pop()`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`NODE: Node = LAST_ITEM[0]`
			`DEPTH: int = LAST_ITEM[1]`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`INDENT: str = INDENTATION * DEPTH`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`if NODE.quantity < ARGS.treshold:`
			`continue`

Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`OUT.write(f"{INDENT}- {NODE}\n")`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
			`if NODE.is_leaf:`
			`continue`

			`CHILDREN = []`

			`for _, child in NODE.children.items():`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`CHILDREN.append((child, DEPTH + 1))`

			`stack.extend(CHILDREN)`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`OUT.close()`
Added barebone to have a splitter 2025-09-23 15:34:53 +02:00

			`if __name__ == "__main__":`
			`ARGS = get_args(sys.argv)`
Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00			`# ARGS = get_debug_args()`
			`tree_like(ARGS.file, ARGS.output)`