# NanoSocrates/Scripts/DataCleaning/path_splitter_tree.py

import argparse
import csv
import sys
from typing import Self


class ProgramArgs:
    """Plain holder for the options this script is run with."""

    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
        """
        Args:
            file (str): Path of the CSV file to read.
            csv_uri_header (str): The name of the column of the csv file from which the program will get the URIs
            output (str): Path of the file the tree listing is written to.
            treshold (int): Minimum quantity a node needs in order to be written to the output.
        """
        # Values are stored untouched; no validation or normalisation happens here.
        self.file: str = file
        self.csv_uri_header: str = csv_uri_header
        self.output: str = output
        self.treshold: int = treshold


class Node:
    """One segment of a split path, together with the segments that follow it.

    ``quantity`` counts how many inserted paths continued *through* this node
    into one of its children. A node that only ever terminated a path keeps
    the quantity it was created with (0 for nodes created by ``append_child``).
    """

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        """
        Args:
            name (str): Text of the path segment this node stands for.
            quantity (int): Initial counter value.
        """
        self.name = name
        self.quantity = quantity
        # Child nodes keyed by their segment text, in first-seen order.
        self.children: dict[str, "Node"] = {}

    @property
    def is_leaf(self) -> bool:
        """True when no child segment has been registered under this node."""
        return not self.children

    def append_child(self, child: list[str]):
        """Insert a path (list of segments) below this node.

        Missing nodes are created along the way. Every node that is walked
        *from* (this node included) gets its ``quantity`` incremented by one;
        the node for the final segment is not incremented.

        An empty ``child`` list is a no-op.

        Args:
            child (list[str]): Path segments, outermost first.
        """
        # Iterative walk: no recursion-depth limit on long paths and no
        # repeated list slicing.
        node = self
        for key in child:
            next_node = node.children.get(key)
            if next_node is None:
                # First time this branch is traversed: create its node.
                next_node = node.children[key] = Node(key, 0)

            node.quantity += 1
            node = next_node

    def __str__(self):
        """Render as ``"<name>/ - <quantity>"``."""
        return f"{self.name}/ - {self.quantity}"


def get_args(args: list[str]) -> ProgramArgs:
    """Parse command-line tokens into a ``ProgramArgs``.

    Unrecognised tokens are ignored rather than rejected, because parsing is
    done with ``parse_known_args``; this is also what lets the caller pass the
    full ``sys.argv`` (program name included).

    Args:
        args (list[str]): Raw command-line tokens.

    Returns:
        ProgramArgs: The parsed input file, header name, output file and treshold.
    """
    parser = argparse.ArgumentParser()

    # The three mandatory string options share the same settings.
    required_text_options = (
        ("--input-file", "-i"),
        ("--header-name", "-c"),  # c stands for column
        ("--output-file", "-o"),
    )
    for long_flag, short_flag in required_text_options:
        parser.add_argument(long_flag, short_flag, required=True, type=str)

    parser.add_argument("--treshold", "-t", type=int, default=1)

    namespace, _unknown = parser.parse_known_args(args)

    return ProgramArgs(
        namespace.input_file,
        namespace.header_name,
        namespace.output_file,
        namespace.treshold,
    )


def get_debug_args() -> ProgramArgs:
    """Return a fixed set of arguments for running the script without a command line.

    Equivalent command line:
    -i ./Assets/Dataset/1-hop/movies.csv  -c subject -o Assets/Dataset/Tmp/prova.csv -t 1
    """
    return ProgramArgs(
        file="./Assets/Dataset/1-hop/movies.csv",
        csv_uri_header="subject",
        output="./Assets/Dataset/Tmp/prova.csv",
        treshold=1,
    )


def _build_path_trees(file: str, csv_uri_header: str) -> "dict[str, Node]":
    """Read one CSV column and fold every value into one of two path trees."""
    properties: "dict[str, Node]" = {
        "pure": Node("pure", 0),
        "URI": Node("uri", 0),
    }

    # newline="" is what the csv module asks for when it is handed a file object.
    with open(file, "r", encoding="utf-8", newline="") as source:
        for row in csv.DictReader(source):
            uri_element = row[csv_uri_header]

            # Short rows yield None and blank cells yield "": nothing to insert.
            if not uri_element:
                continue

            # Drop empty pieces (e.g. the one between the two slashes of "http://").
            sections = [item for item in uri_element.split("/") if item != ""]
            if not sections:
                # The value consisted of slashes only.
                continue

            root_key = "URI" if sections[0] in ("http:", "https:") else "pure"
            properties[root_key].append_child(sections)

    return properties


def _write_path_trees(roots, out: str, treshold: int, indentation: str = "    ") -> None:
    """Write every node whose quantity reaches ``treshold`` as an indented list."""
    stack = [(root, 0) for root in roots]

    with open(out, mode="w", encoding="utf-8") as sink:
        while stack:
            node, depth = stack.pop()

            # A node below the treshold hides its whole subtree as well.
            if node.quantity < treshold:
                continue

            sink.write(f"{indentation * depth}- {node}\n")

            stack.extend((child, depth + 1) for child in node.children.values())


def tree_like(file: str, csv_uri_header: str, out: str, treshold: int = 1):
    """Build a tree of "/"-separated path segments from a CSV column and write it to a file.

    Values whose first segment is ``http:`` or ``https:`` go under the ``uri``
    root; every other value goes under the ``pure`` root. Rows whose value is
    missing, empty or made only of slashes are skipped.

    Each written line has the form ``"<indent>- <name>/ - <quantity>"``, with
    four spaces of indent per depth level. Nodes come off a LIFO stack, so the
    ``uri`` root is written before ``pure`` and siblings appear in reverse
    insertion order.

    Args:
        file (str): Path of the CSV file to read (UTF-8).
        csv_uri_header (str): Name of the column holding the values to split.
        out (str): Path of the file to write (UTF-8, overwritten).
        treshold (int): Minimum quantity a node needs to be written. Leaf nodes
            have quantity 0, so they only appear when the treshold is 0.

    Raises:
        KeyError: If a row has no ``csv_uri_header`` column.
        OSError: If either file cannot be opened.
    """
    # The input is fully read before the output is opened, so a failure while
    # reading never truncates an existing output file.
    properties = _build_path_trees(file, csv_uri_header)
    _write_path_trees(properties.values(), out, treshold)


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.csv_uri_header, ARGS.output, ARGS.treshold)