diff --git a/Scripts/DataCleaning/path_splitter_tree.py b/Scripts/DataCleaning/path_splitter_tree.py index e237308..e7f6f9e 100644 --- a/Scripts/DataCleaning/path_splitter_tree.py +++ b/Scripts/DataCleaning/path_splitter_tree.py @@ -6,8 +6,16 @@ from typing import Self class ProgramArgs: - def __init__(self, file: str, output: str, treshold: int): + def __init__(self, file: str, csv_uri_header: str, output: str, treshold: int): + """ + Args: + file (str): + csv_uri_header (str): The name of the CSV column from which the program reads the URIs + output (str): + treshold (int): + """ self.file = file + self.csv_uri_header = csv_uri_header self.output = output self.treshold = treshold @@ -33,11 +41,15 @@ class Node: KEY = child[0] if not self.children.get(KEY): + # if the key has no value, it means we are traversing this branch for the first time + # create another node for the key self.children[KEY] = Node(KEY, 0) + # take the node for the key CHILD = self.children[KEY] self.quantity += 1 + # if the child list to enter has only one element, which is KEY, no more nodes will be created if len(child) == 1: return @@ -53,27 +65,32 @@ def get_args(args: list[str]) -> ProgramArgs: PARSER = argparse.ArgumentParser() PARSER.add_argument("--input-file", "-i", required=True, type=str) + PARSER.add_argument("--header-name", "-c", required=True, type=str) # c stands for column PARSER.add_argument("--output-file", "-o", required=True, type=str) PARSER.add_argument("--treshold", "-t", type=int, default=1) parsed_args, _ = PARSER.parse_known_args(args) # print(parsed_args.input_file) - return ProgramArgs(parsed_args.input_file,parsed_args.output_file, parsed_args.treshold) # type ignore + return ProgramArgs(parsed_args.input_file, parsed_args.header_name ,parsed_args.output_file, parsed_args.treshold) # type ignore def get_debug_args() -> ProgramArgs: - - FILE = "./Assets/Dataset/Tmp/reverse-rel.txt" + # -i ./Assets/Dataset/1-hop/movies.csv -c subject -o 
Assets/Dataset/Tmp/prova.csv -t 1 + FILE = "./Assets/Dataset/1-hop/movies.csv" + CSV_HEADER = "subject" + OUTPUT = "./Assets/Dataset/Tmp/prova.csv" TRESHOLD = 1 return ProgramArgs( FILE, + CSV_HEADER, + OUTPUT, TRESHOLD ) -def tree_like(file: str, out: str): +def tree_like(file: str, csv_uri_header:str, out: str): INDENTATION = " " @@ -85,9 +102,11 @@ def tree_like(file: str, out: str): FILE = open(file, "r", encoding="utf-8") # TODO: Change here so it takes single URI from a CSV file - for row in FILE: + # The header name is needed to select the URI column of each row + for row in csv.DictReader(FILE): - sections = row.split("/") + uri_element = row[csv_uri_header] + sections = uri_element.split("/") sections = list(filter(lambda item: item != "", sections)) # print(sections) @@ -116,7 +135,9 @@ def tree_like(file: str, out: str): INDENT: str = INDENTATION * DEPTH - if NODE.quantity < ARGS.treshold: + # Leaf nodes have quantity 0, so for them to appear the threshold has to be 0 + # if NODE.quantity < ARGS.treshold: + if ARGS.treshold > NODE.quantity: continue OUT.write(f"{INDENT}- {NODE}\n") @@ -134,7 +155,8 @@ def tree_like(file: str, out: str): OUT.close() + if __name__ == "__main__": ARGS = get_args(sys.argv) # ARGS = get_debug_args() - tree_like(ARGS.file, ARGS.output) + tree_like(ARGS.file,ARGS.csv_uri_header, ARGS.output)