2025-09-23 15:34:53 +02:00
|
|
|
import argparse
|
|
|
|
|
import csv
|
|
|
|
|
import sys
|
|
|
|
|
from typing import Self
|
|
|
|
|
|
2025-09-23 17:58:08 +02:00
|
|
|
|
2025-09-23 15:34:53 +02:00
|
|
|
class ProgramArgs:
    """Plain holder for the options the script is run with.

    Attributes:
        file: Path of the CSV file to read.
        csv_uri_header: Name of the CSV column from which the URIs are taken.
        output: Path of the text file the tree is written to.
        treshold: Minimum ``quantity`` a node must have to be written out.
    """

    file: str
    csv_uri_header: str
    output: str
    treshold: int

    def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int):
        """Store the four options unchanged on the instance.

        Args:
            file (str): Path of the input CSV file.
            csv_uri_header (str): The name of the column of the csv file from
                which the program will get the URIs.
            output (str): Path of the output file.
            treshold (int): Minimum node quantity for a node to be reported.
        """
        # Input side.
        self.file = file
        self.csv_uri_header = csv_uri_header
        # Output side.
        self.output = output
        self.treshold = treshold
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Node:
    """One segment of a path, with a counter and its named child segments.

    ``quantity`` counts how many inserted paths continued *through* this node
    into one of its children. The node a path ends on is not incremented, so a
    node that was only ever the last segment of a path keeps the quantity it
    was created with (0 for nodes created by ``append_child``).
    """

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        """
        Args:
            name (str): Label of this segment.
            quantity (int): Initial value of the pass-through counter.
        """
        self.name = name
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self) -> bool:
        """True when the node has no children."""
        return not self.children

    def append_child(self, child: list[str]) -> None:
        """Insert the path ``child`` below this node, creating missing nodes.

        Every node the path descends from (this node included) has its
        ``quantity`` increased by one; the node for the last segment does not.

        The walk is iterative, so the path length is not bounded by the
        interpreter's recursion limit, and an empty path is a no-op instead
        of raising ``IndexError``.

        Args:
            child (list[str]): Path segments, outermost first.
        """
        current = self
        for key in child:
            following = current.children.get(key)
            if following is None:
                # First time this branch is traversed: create its node.
                following = Node(key, 0)
                current.children[key] = following
            current.quantity += 1
            current = following

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"
|
2025-09-23 15:34:53 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_args(args: list[str]) -> ProgramArgs:
    """Parse command-line tokens into a ``ProgramArgs``.

    Recognised options:
        --input-file / -i   (required, str)
        --header-name / -c  (required, str; "c" stands for column)
        --output-file / -o  (required, str)
        --treshold / -t     (int, defaults to 1)

    Args:
        args (list[str]): Tokens to parse. Tokens the parser does not
            recognise are tolerated and dropped.

    Returns:
        ProgramArgs: The parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", type=str, required=True)
    parser.add_argument("--header-name", "-c", type=str, required=True)
    parser.add_argument("--output-file", "-o", type=str, required=True)
    parser.add_argument("--treshold", "-t", type=int, default=1)

    # parse_known_args returns (namespace, leftovers); the leftovers are
    # deliberately ignored rather than reported as errors.
    namespace, _unrecognised = parser.parse_known_args(args)

    return ProgramArgs(
        file=namespace.input_file,
        csv_uri_header=namespace.header_name,
        output=namespace.output_file,
        treshold=namespace.treshold,
    )
|
2025-09-23 17:58:08 +02:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_debug_args() -> ProgramArgs:
    """Return a fixed ``ProgramArgs`` for running without a command line.

    The values correspond to the invocation:
    -i ./Assets/Dataset/1-hop/movies.csv -c subject -o ./Assets/Dataset/Tmp/prova.csv -t 1
    """
    return ProgramArgs(
        file="./Assets/Dataset/1-hop/movies.csv",
        csv_uri_header="subject",
        output="./Assets/Dataset/Tmp/prova.csv",
        treshold=1,
    )
|
2025-09-23 15:34:53 +02:00
|
|
|
|
|
|
|
|
|
2025-09-25 17:57:46 +02:00
|
|
|
def tree_like(file: str, csv_uri_header: str, out: str, treshold: int | None = None):
    """Write an indented tree of the "/"-separated values of one CSV column.

    Each value of column ``csv_uri_header`` is split on "/" (empty segments
    are dropped) and inserted as a path under one of two roots: "uri" when the
    first segment is "http:" or "https:", "pure" otherwise. The roots are then
    walked depth-first and every node whose ``quantity`` is at least
    ``treshold`` is written to ``out`` as ``<indent>- <name>/ - <quantity>``.
    A node below the threshold is skipped together with its whole subtree.

    Args:
        file (str): Path of the input CSV file (read as UTF-8).
        csv_uri_header (str): Name of the column holding the values.
        out (str): Path of the output file (overwritten, UTF-8).
        treshold (int | None): Minimum quantity for a node to be written.
            When omitted, the value is taken from the module-level ``ARGS``
            set by the script entry point, or 1 (the command-line default)
            when ``ARGS`` is not defined.

    Raises:
        KeyError: If ``csv_uri_header`` is not a column of the CSV file.
    """
    if treshold is None:
        # Backward compatibility: three-argument callers rely on the global
        # ARGS created under ``if __name__ == "__main__"``.
        cli_args = globals().get("ARGS")
        treshold = cli_args.treshold if cli_args is not None else 1

    INDENTATION = " "
    URI_SCHEMES = ("http:", "https:")

    properties: dict[str, Node] = {
        "pure": Node("pure", 0),
        "URI": Node("uri", 0),
    }

    # newline="" lets the csv module do its own newline handling, so quoted
    # fields containing line breaks are parsed correctly.
    with open(file, "r", encoding="utf-8", newline="") as source:
        for row in csv.DictReader(source):
            # A row shorter than the header yields None for the missing cell.
            uri_element = row[csv_uri_header] or ""
            sections = [part for part in uri_element.split("/") if part != ""]

            if not sections:
                # Empty or slash-only cell: there is no path to record.
                continue

            root_key = "URI" if sections[0] in URI_SCHEMES else "pure"
            properties[root_key].append_child(sections)

    # Explicit LIFO stack of (node, depth); entries pushed last are written first.
    stack: list[tuple[Node, int]] = [(root, 0) for root in properties.values()]

    with open(out, mode="w", encoding="utf-8") as sink:
        while stack:
            node, depth = stack.pop()

            # Leaf nodes have quantity 0, so they only appear when the
            # threshold is 0 (or lower).
            if treshold > node.quantity:
                continue

            sink.write(f"{INDENTATION * depth}- {node}\n")

            stack.extend((child, depth + 1) for child in node.children.values())
|
2025-09-23 15:34:53 +02:00
|
|
|
|
|
|
|
|
|
2025-09-25 17:57:46 +02:00
|
|
|
|
2025-09-23 15:34:53 +02:00
|
|
|
if __name__ == "__main__":
    # ARGS must stay a module-level name: tree_like() reads ARGS.treshold
    # from the global scope. For a run with the hard-coded debug paths,
    # assign get_debug_args() here instead.
    ARGS = get_args(sys.argv)

    tree_like(
        file=ARGS.file,
        csv_uri_header=ARGS.csv_uri_header,
        out=ARGS.output,
    )
|