Compare commits
66 Commits
main...dev.splitt
| Author | SHA1 | Date |
|---|---|---|
| | 25f401b577 | |
| | 14c5ade230 | |
| | 4c9c51f902 | |
| | 63c1a4a160 | |
| | 51114af853 | |
| | 3a6dca0681 | |
| | 346098d2b7 | |
| | 64f9b41378 | |
| | ac1ed42c49 | |
| | edd01a2c83 | |
| | 5aa9e3fcf3 | |
| | 0970cabf92 | |
| | a26d92750f | |
| | 34c4782232 | |
| | c5439533e6 | |
| | 8819b8e87f | |
| | 1076dc8aa6 | |
| | 3d15e03b09 | |
| | 0ee2ec6fcd | |
| | 95cfa5486c | |
| | 0d30e90ee0 | |
| | faaba17a98 | |
| | 854e5f1d98 | |
| | 242d7f674f | |
| | de8c2afceb | |
| | f89dffff75 | |
| | e39bad8348 | |
| | 7a1a221017 | |
| | fafe6ae0f9 | |
| | e32444df75 | |
| | b74b7ac4f0 | |
| | 22134391d9 | |
| | 82c9023849 | |
| | 00b87e01ea | |
| | ce3d4bf6c5 | |
| | c415b175a0 | |
| | ec81ea7930 | |
| | 4bb03f86b3 | |
| | e5f201f3db | |
| | 1c715dc569 | |
| | 6686b47328 | |
| | 9a5a7d84fd | |
| | 9678ece9c0 | |
| | 67bcd732b5 | |
| | 1a4f900500 | |
| | ca8729b67c | |
| | 9dbffc52ed | |
| | b7f504942a | |
| | 7f0c5ce8d3 | |
| | 9838e287a4 | |
| | ca6143ea3c | |
| | 16e7ab4d9f | |
| | 28723ab662 | |
| | 3e59efcf33 | |
| | 7c04309cc1 | |
| | db87295890 | |
| | 61568200a8 | |
| | 8df2736b97 | |
| | eb5b7f629a | |
| | 79232b391e | |
| | 72eb937b47 | |
| | cececa14ce | |
| | 2487d44abd | |
| | 553b86cac2 | |
| | 12bd781fd3 | |
| | 463f4907b8 | |
1 .gitattributes (vendored)
@@ -1,2 +1,3 @@
 Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
 Assets/** filter=lfs diff=lfs merge=lfs -text
+Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text

4 .gitignore (vendored)
@@ -191,6 +191,7 @@ ipython_config.py
 # Icon must end with two \r
 Icon
 
+
 # Thumbnails
 ._*
 
@@ -251,3 +252,6 @@ $RECYCLE.BIN/
 # .nfs files are created when an open file is removed but is still being accessed
 .nfs*
 
+# ---> Custom
+**/Tmp/**
+!**/.gitkeep

14 .vscode/extensions.json (vendored, Normal file)
@@ -0,0 +1,14 @@
{
    "recommendations": [
        "bierner.github-markdown-preview",
        "bierner.markdown-checkbox",
        "bierner.markdown-emoji",
        "bierner.markdown-footnotes",
        "bierner.markdown-mermaid",
        "bierner.markdown-preview-github-styles",
        "bierner.markdown-yaml-preamble",
        "davidanson.vscode-markdownlint",
        "kejun.markdown-alert",
        "yzhang.markdown-all-in-one"
    ]
}

BIN Assets/Dataset/1-hop/dataset.csv (Stored with Git LFS, Normal file)
Binary file not shown.

BIN Assets/Dataset/1-hop/movie-pageid.csv (Stored with Git LFS, Normal file)
Binary file not shown.

BIN Assets/Dataset/1-hop/movies.csv (Stored with Git LFS, Normal file)
Binary file not shown.

BIN Assets/Dataset/1-hop/reverse.csv (Stored with Git LFS, Normal file)
Binary file not shown.

BIN Assets/Dataset/1-hop/wikipedia-movie.csv (Stored with Git LFS, Normal file)
Binary file not shown.

BIN Assets/Dataset/1-hop/wikipedia-summary.csv (Stored with Git LFS, Normal file)
Binary file not shown.

BIN Assets/Dataset/DatawareHouse/dataset.db (Stored with Git LFS, Normal file)
Binary file not shown.

0 Assets/Dataset/Tmp/.gitkeep (Normal file)

25 README.md
@@ -1,3 +1,28 @@
 # NanoSocrates
 
 This is the work project for the DeepLearning exam of 16th September 2025
+
+## Index
+
+- [Resources](./docs/RESOURCES.md)
+
+## Setup
+
+Create and activate your Conda environment with:
+
+    conda env create -f environment.yaml
+    conda activate deep_learning
+
+Now install the pip dependencies:
+
+    pip install -r requirements.txt
+
+## Troubleshooting
+
+Sometimes, when uploading a really large batch of data, git can stop the upload because of a timeout.
+The solution is to change its settings locally:
+
+    git config lfs.dialtimeout 3600
+    git config lfs.activitytimeout 3600
+
+For more details, check the link: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory

139 Scripts/DataCleaning/path_splitter_tree.py (Normal file)
@@ -0,0 +1,139 @@
import argparse
import csv
import sys
from typing import Self


class ProgramArgs:

    def __init__(self, file: str, output: str, treshold: int):
        self.file = file
        self.output = output
        self.treshold = treshold


class Node:

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        return len(self.children) == 0

    def append_child(self, child: list[str]):

        # print(child)
        KEY = child[0]

        if not self.children.get(KEY):
            self.children[KEY] = Node(KEY, 0)

        CHILD = self.children[KEY]
        self.quantity += 1

        if len(child) == 1:
            return

        new_children = child[1:]

        CHILD.append_child(new_children)

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--treshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)

    # print(parsed_args.input_file)

    return ProgramArgs(parsed_args.input_file, parsed_args.output_file, parsed_args.treshold)  # type ignore


def get_debug_args() -> ProgramArgs:

    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
        TRESHOLD
    )


def tree_like(file: str, out: str):

    INDENTATION = " "

    properties: dict[str, Node] = {}

    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)

    FILE = open(file, "r", encoding="utf-8")

    for row in FILE:

        sections = row.split("/")
        sections = list(filter(lambda item: item != "", sections))

        # print(sections)

        if sections[0] != "http:" and sections[0] != "https:":
            properties["pure"].append_child(sections)
            continue

        properties["URI"].append_child(sections)

    FILE.close()

    stack: list[tuple[Node, int]] = []

    for _, item in properties.items():
        stack.append((item, 0))

    OUT = open(out, mode="w", encoding="utf-8")

    while len(stack) > 0:

        LAST_ITEM = stack.pop()

        NODE: Node = LAST_ITEM[0]
        DEPTH: int = LAST_ITEM[1]

        INDENT: str = INDENTATION * DEPTH

        if NODE.quantity < ARGS.treshold:
            continue

        OUT.write(f"{INDENT}- {NODE}\n")

        if NODE.is_leaf:
            continue

        CHILDREN = []

        for _, child in NODE.children.items():
            CHILDREN.append((child, DEPTH + 1))

        stack.extend(CHILDREN)

    OUT.close()


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.output)

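To make the counting logic in `path_splitter_tree.py` easier to follow, here is a minimal, self-contained sketch of the same counting-prefix-tree idea using plain dictionaries. The example paths are hypothetical and the helper names (`add_path`, `dump`) are not part of the repository; this is an illustration under those assumptions, not the script itself.

```python
# Sketch: count how many "/"-separated paths pass through each segment,
# then print the tree with one indent level per depth (threshold filters rare nodes).

def add_path(tree: dict, segments: list[str]) -> None:
    node = tree
    for segment in segments:
        child = node.setdefault(segment, {"count": 0, "children": {}})
        child["count"] += 1
        node = child["children"]

def dump(tree: dict, depth: int = 0, threshold: int = 1) -> None:
    for name, child in tree.items():
        if child["count"] < threshold:
            continue
        print(f"{'  ' * depth}- {name}/ - {child['count']}")
        dump(child["children"], depth + 1, threshold)

if __name__ == "__main__":
    paths = [
        "http://dbpedia.org/ontology/starring",   # hypothetical input lines
        "http://dbpedia.org/ontology/director",
        "runtime",
    ]
    tree: dict = {}
    for p in paths:
        add_path(tree, [s for s in p.split("/") if s])
    dump(tree)
```

The repository script keeps the same information on `Node` objects and writes the result to a file instead of printing it.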
53 Scripts/DataGathering/analysis.py (Normal file)
@@ -0,0 +1,53 @@
import argparse
import sys
import pandas as pd


class ProgramArgs:

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        self.input_file = input_file
        self.column = column
        self.output_file = output_file
        self.count = count


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--column", "--col", required=True, type=str)
    PARSER.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.column,
        parsed_args.output_file,
        parsed_args.count,
    )  # type ignore


if __name__ == "__main__":
    ARGS = get_args(sys.argv)

    OUTPUT_FILE = open(ARGS.output_file, "w+", encoding="utf-8")

    # Load the CSV
    df = pd.read_csv(ARGS.input_file)

    # Count occurrences of each unique last part
    item_counts = df[ARGS.column].value_counts()

    # Print the counts
    for item, count in item_counts.items():

        if ARGS.count:
            OUTPUT_FILE.write(f"{item}: {count}\n")
        else:
            OUTPUT_FILE.write(f"{item}\n")

146 Scripts/DataGathering/fetchdata.py (Normal file)
@@ -0,0 +1,146 @@
import argparse
from math import floor
import sys
from time import sleep
import SPARQLWrapper


class ProgramData:

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:

        self.local_url = local_url
        self.query_url = query_url
        self.sparql_url = sparql_url
        self.output_type = output_type
        self.initial_offset = initial_offset
        self.timeout = timeout
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        return self.limit

    @property
    def query(self):

        with open(self.query_url, "r") as file:
            return file.read()


DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)


def gather_cli_args(args: list[str]) -> ProgramData:

    # TODO: Add argument for type
    PARSER = argparse.ArgumentParser("sparql data fetcher")
    PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
    PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
    PARSER.add_argument("--limit", type=int, default=LIMIT)
    PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
    PARSER.add_argument("--verbose", "-v", action="count", default=0)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        SPARQLWrapper.CSV,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose
    )
    # type: ignore


def fetch_data(DATA: ProgramData):

    # Take correction of page into account
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    exit = False

    while not exit:

        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)

        sparql.setReturnFormat(TYPE)

        CURRENT_PAGE_QUERY = "\n".join([
            DATA.query,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

            if type(res) == bytes:

                initial_offset = 0

                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:

                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(
                    text
                )

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")

        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


if __name__ == "__main__":
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)

154 Scripts/DataGathering/wikipedia_gathering.py (Normal file)
@@ -0,0 +1,154 @@
from pathlib import Path
import pandas as pd

import csv
import time
import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"


sess = requests.Session()

CHUNK = 20


# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):

    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0"
        ""
        " (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")

    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts


def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")


def reconcile() -> int:

    start = time.time()
    input_file = open(input_csv, "r", newline="", encoding="utf-8")
    output_file = open(output_csv, "r", newline="", encoding="utf-8")

    next(input_file)
    LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
    current_check = input_file.readline().split(",")[1]

    index = 1

    while current_check != LAST_CHECKED:
        current_check = input_file.readline().split(",")[1].replace("\n", "")
        index += 1

    input_file.close()
    output_file.close()
    end = time.time()

    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index


if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


SKIP = reconcile()


# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        if index == 0:
            continue

        # Save movies in map
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush movies
            flush(movie_ids)
            movie_ids = []

65 Scripts/DatasetMerging/SQL_Queries/db_creation.sql (Normal file)
@@ -0,0 +1,65 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);


CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);


CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);


CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);

35 Scripts/DatasetMerging/SQL_Queries/query.sql (Normal file)
@@ -0,0 +1,35 @@
-- Insert MovieURI into Movies ; MovieID is auto incremental
INSERT INTO Movies (MovieURI) VALUES (?);

-- Get MovieID where MovieURI equal given value
SELECT MovieID FROM Movies WHERE MovieURI = ?;

-- SetPageId
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);

-- Get MovieId by PageID ... ( to create WikipediaAbstract)
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;

-- SetAbstract ...
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);


-- SetOrigin
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");

-- GetOrigin
SELECT OriginID FROM Origins WHERE OriginName = ?;

-- Subject, Relationship, Object, RDF
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
INSERT INTO Relationships (RelationshipURI) VALUES (?);
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);

SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;


INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);

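As a quick illustration of how the two SQL files above fit together, here is a minimal Python sketch that creates the schema from `db_creation.sql` and then runs one parameterized insert/select pair from `query.sql`. The paths and the example URI are assumptions for illustration; the repository's actual loader is `Scripts/DatasetMerging/datawarehouse.py`, shown later in this diff.

```python
import sqlite3

# Assumed paths, matching the repository layout described in this PR.
DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db"
DDL_PATH = "./Scripts/DatasetMerging/SQL_Queries/db_creation.sql"

conn = sqlite3.connect(DB_PATH)
curs = conn.cursor()

# Run all CREATE TABLE / CREATE INDEX statements from db_creation.sql.
with open(DDL_PATH, "r", encoding="utf-8") as ddl:
    curs.executescript(ddl.read())

# One insert/lookup pair from query.sql (the URI below is hypothetical).
movie_uri = "http://dbpedia.org/resource/Example_Film"
curs.execute("INSERT INTO Movies (MovieURI) VALUES (?);", [movie_uri])
curs.execute("SELECT MovieID FROM Movies WHERE MovieURI = ?;", [movie_uri])
print(curs.fetchone()[0])

conn.commit()
conn.close()
```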
26 Scripts/DatasetMerging/datasetInfo.md (Normal file)
@@ -0,0 +1,26 @@
# HOW THE DATASET IS BUILT AND POPULATED

Note: the data are taken from CSV files in 1-hop

## CSV files composition

| CSV files | Original structure | Saved AS |
|--------------------|---------------------------------------|-------------------------------------|
| Wikipedia-summary | PageId / abstract | subject, text |
| Movies | Movie URI | "subject" |
| Dataset | Movie URI / Relationship / Object [RDF] | subject, relationship, object |
| Movies-PageId | Movie URI / PageId (wiki) | "subject", "object" |
| Reverse | Subject / Relationship / Movie URI | "subject", "relationship", "object" |

## Wanted tables schema

| Table | Columns |
|---------------|-------------------------------------------------------------------------|
| Movies | MovieID [PK], Movie URI |
| WikiPageIDs | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)* |
| Abstracts | MovieID [PK, FK], abstract |
| Subjects | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
| Objects | ObjectID [PK], RDF Object, OriginID [FK] |
| Origins | OriginID [PK], Origin Name |
| RDFs | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |

375 Scripts/DatasetMerging/datawarehouse.py (Normal file)
@@ -0,0 +1,375 @@
import sqlite3
import csv

#####################################################################
# This file builds DatawareHouse/dataset.db from 1-hop csv files    #
# Its schema is in ./SQL_Queries/db_creation.sql                    #
# The SQL queries used to populate it are in ./SQL_Queries/query.sql#
#####################################################################

# sometimes you may need to build a new db file, here a little snippet for you
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"

MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline='', encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline='', encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline='', encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline='', encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline='', encoding="utf-8")

CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()

# MARK: SQL Definitions
# Insert MovieURI

def insertOrigin(curs: sqlite3.Cursor) -> bool:

    QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(QUERY)
        return True
    except sqlite3.IntegrityError:
        return False

def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:

    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"

    curs.execute(QUERY, [originName])
    originId = curs.fetchone()
    if not originId:
        return None

    # in this case the real id is the first element of the tuple
    return originId[0]

def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:

    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(QUERY, [movieUri])
        return True
    except sqlite3.IntegrityError:
        return False


def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:

    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"

    curs.execute(QUERY, [movieUri])
    movieId = curs.fetchone()
    if not movieId:
        return None

    # in this case the real id is the first element of the tuple
    return movieId[0]


def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, pageId])
        return True
    except sqlite3.IntegrityError:
        return False

def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:

    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"

    curs.execute(QUERY, [pageId])
    movieId = curs.fetchone()
    if not movieId:
        return None

    # in this case the real id is the first element of the tuple
    return movieId[0]

def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, abstract])
        return True
    except sqlite3.IntegrityError:
        return False

def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [subjectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False

def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(QUERY, [relationshipURI])
        return True
    except sqlite3.IntegrityError:
        return False

def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False

def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:

    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"

    curs.execute(QUERY, [subjectURI])
    subjectId = curs.fetchone()
    if not subjectId:
        return None

    # in this case the real id is the first element of the tuple
    return subjectId[0]

def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:

    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"

    curs.execute(QUERY, [relationshipURI])
    relationshipId = curs.fetchone()
    if not relationshipId:
        return None

    # in this case the real id is the first element of the tuple
    return relationshipId[0]

def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:

    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"

    curs.execute(QUERY, [objectURI])
    objectId = curs.fetchone()
    if not objectId:
        return None

    # in this case the real id is the first element of the tuple
    return objectId[0]

def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int
) -> bool:
    QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
        curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
        return True
    except sqlite3.IntegrityError:
        return False

# MARK: Parsing
def parseMovies():

    CSV_READER = csv.reader(MOVIES_CSV_HANDLER)
    next(CSV_READER)
    for row in CSV_READER:
        MOVIE = row[0]
        insertMovie(CURS, MOVIE)


def parseWikiPageId():
    CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER)
    for row in CSV_READER:
        MOVIE_URI = row["subject"]
        WIKI_PAGE_ID = int(row["object"])
        MOVIE_ID = selectMovieId(CURS, MOVIE_URI)

        if MOVIE_ID is None:
            print(f"The MovieUri: {MOVIE_URI} has not a MovieId ")
            continue

        insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID)


def parseAbstract():
    CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
    for row in CSV_READER:

        WIKI_PAGE_ID = int(row["subject"])
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)

        if MOVIE_ID is None:
            print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
            continue

        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)


def parseRDF_Reverse():

    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
    total = 0

    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, OBJECT)

        skip = False

        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if MOVIE_ID is None:
            print(f"No MovieId for {OBJECT}")
            skip = True

        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1

    print(total)


def parseRDF_Dataset():

    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')

    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:

        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]

        rdf_idx += 1

        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

        insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, SUBJECT)

        skip = False

        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if MOVIE_ID is None:
            print(f"No MovieId for {SUBJECT}")
            skip = True

        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1

    print(total)


# MARK: Actual Code
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseRDF_Reverse()
# parseRDF_Dataset()


CONN.commit()
CONN.close()


MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()


"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""

"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""

0 Scripts/Experiments/.gitkeep (Normal file)

0 Scripts/Experiments/Queries/.gitkeep (Normal file)

0 Scripts/Experiments/Tmp/.gitkeep (Normal file)

215 docs/DBPEDIA.md (Normal file)
@@ -0,0 +1,215 @@
# DBPedia

## GraphIRI

This is the graph identifier (URI):

`http://dbpedia.org`

## History of queries

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    {
        SELECT ?object
        WHERE {
            ?m rdf:type dbo:Film .
            ?object ?r ?m
        }
    }
}
```

### 2 Hops

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
    {
        SELECT ?object
        WHERE {
            ?m rdf:type dbo:Film .
            ?object ?r ?m
            FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
        }
    }
}
LIMIT 1000000
```

### 1 Hop

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
LIMIT 1000000
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject
WHERE {
    ?subject rdf:type dbo:Film .
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject
WHERE {
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

#### Wikipedia-movie

a.k.a. the file with the Wikipedia abstract

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject , ?object
WHERE {
    ?subject foaf:primaryTopic ?object .
    ?object rdf:type dbo:Film
}
```

#### Reverse

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    ?a foaf:primaryTopic ?object
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    ?a foaf:primaryTopic ?object
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

#### Film \ wiki page ID

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?subject ?pageID
WHERE {
    ?subject rdf:type dbo:Film .
    ?subject dbo:wikiPageID ?pageID .
    ?subject rdfs:label ?label .
    FILTER (lang(?label) = "en")
}
```

3 docs/DEVELOPMENT.md (Normal file)
@@ -0,0 +1,3 @@
# Development

## Data Gathering

108 docs/RESOURCES.md (Normal file)
@@ -0,0 +1,108 @@
# Resources

## Byte-Pair Encoding (BPE)

### Overview

Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.

---

### Key Idea

BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.

---

### Algorithm Steps

1. **Initialization**
   - Treat each character of the input text as a token.

2. **Find Frequent Pairs**
   - Count all adjacent token pairs in the sequence.

3. **Merge Most Frequent Pair**
   - Replace the most frequent pair with a new symbol not used in the text.

4. **Repeat**
   - Continue until no frequent pairs remain or a desired vocabulary size is reached.

---

### Example

Suppose the data to be encoded is:

```text
aaabdaaabac
```

#### Step 1: Merge `"aa"`

Most frequent pair: `"aa"` → replace with `"Z"`

```text
ZabdZabac
Z = aa
```

---

#### Step 2: Merge `"ab"`

Most frequent pair: `"ab"` → replace with `"Y"`

```text
ZYdZYac
Y = ab
Z = aa
```

---

#### Step 3: Merge `"ZY"`

Most frequent pair: `"ZY"` → replace with `"X"`

```text
XdXac
X = ZY
Y = ab
Z = aa
```

---

At this point, no pairs occur more than once, so the process stops.

---

### Decompression

To recover the original data, replacements are applied in **reverse order**:

```text
XdXac
→ ZYdZYac
→ ZabdZabac
→ aaabdaaabac
```

---

### Advantages

- **Efficient vocabulary building**: reduces the need for massive word lists.
- **Handles rare words**: breaks them into meaningful subword units.
- **Balances character- and word-level tokenization**.

---

### Limitations

- Does not consider linguistic meaning—merges are frequency-based.
- May create tokens that are not linguistically natural.
- Vocabulary is fixed after training.

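As a companion to the BPE walk-through above, here is a minimal Python sketch of the greedy merge loop described in the Algorithm Steps. It operates on a single string and uses fresh letters as replacement symbols, like the worked example; real BPE tokenizers instead work on word frequencies and byte-level vocabularies, so treat this strictly as an illustration.

```python
from collections import Counter

def bpe_compress(text: str, max_merges: int = 10):
    """Greedy BPE on one string: repeatedly merge the most frequent adjacent pair."""
    tokens = list(text)
    merges = []                        # (new_symbol, left, right) in merge order
    new_symbols = iter("ZYXWVUTSRQ")   # replacement symbols, as in the example above

    for _ in range(max_merges):
        pairs = Counter(zip(tokens, tokens[1:]))
        if not pairs:
            break
        (left, right), count = pairs.most_common(1)[0]
        if count < 2:                  # stop once no pair occurs more than once
            break
        symbol = next(new_symbols)
        merges.append((symbol, left, right))

        # Rewrite the token sequence, replacing every (left, right) occurrence.
        merged, i = [], 0
        while i < len(tokens):
            if i + 1 < len(tokens) and tokens[i] == left and tokens[i + 1] == right:
                merged.append(symbol)
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged

    return tokens, merges

tokens, merges = bpe_compress("aaabdaaabac")
print("".join(tokens))   # -> XdXac, as in the worked example
print(merges)            # -> the merge table; the first merge is ('Z', 'a', 'a')
```

Decompression is the reverse of the merge table: walking `merges` backwards and expanding each symbol back into its pair reproduces the original string.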
67 docs/SPARQL.md (Normal file)
@@ -0,0 +1,67 @@
# SparQL

> [!NOTE]
> Resources taken from [this website](https://sparql.dev/)

## SQL Queries

### SELECT

```SQL
SELECT ?var1, ?var2, ...
```

### WHERE

```SQL
WHERE {
    pattern1 .
    pattern2 .
    ...
}
```

### FILTER

It's used to restrict [`WHERE`](#where) clauses

```SQL
WHERE {
    ?person <http://example.com/hasCar> ?car .
    FILTER (?car = <http://example.com/Car1>)
}
```

### OPTIONAL

It's used to fetch additional content if it exists

```SQL
SELECT ?person ?car
WHERE {
    ?person <http://example.com/hasCar> ?car .
    OPTIONAL {
        ?car <http://example.com/hasColor> ?color .
    }
}
```

### LIMIT

Limits results

```SQL
LIMIT 10 -- Take only 10 results
```

## SparQL functions

### COUNT

```SQL
SELECT (COUNT(?person) AS ?count)
WHERE {
    ?person <http://example.com/hasCar> ?car .
}
```

BIN environment.yaml (Normal file)
Binary file not shown.

17 requirements.txt (Normal file)
@@ -0,0 +1,17 @@
certifi==2025.8.3
charset-normalizer==3.4.3
idna==3.10
numpy==2.3.3
pandas==2.3.2
pyparsing==3.2.4
python-dateutil==2.9.0.post0
pytz==2025.2
rdflib==7.1.4
requests==2.32.5
setuptools==78.1.1
six==1.17.0
SPARQLWrapper==2.0.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
Wikipedia-API==0.8.1