Compare commits

...

66 Commits

Author SHA1 Message Date
Christian Risi
25f401b577 Fixed bug for parsing and added CLI functionalities 2025-09-23 17:58:08 +02:00
Christian Risi
14c5ade230 Added CLI functionalities 2025-09-23 17:57:38 +02:00
4c9c51f902 Added barebones splitter 2025-09-23 15:34:53 +02:00
GassiGiuseppe
63c1a4a160 added little snippet to rebuild db from db_creation.sql 2025-09-22 17:52:23 +02:00
GassiGiuseppe
51114af853 DataRetrivial deleted since it does the same thing as datawarehouse.py 2025-09-22 17:51:35 +02:00
GassiGiuseppe
3a6dca0681 Info about dataset construction from CSV moved
from Python file to Markdown
2025-09-22 17:39:44 +02:00
GassiGiuseppe
346098d2b7 Added query.sql, the file with the queries used to populate the dataset 2025-09-22 17:21:32 +02:00
GassiGiuseppe
64f9b41378 Built datawarehouse.py, which populates the dataset 2025-09-22 17:17:22 +02:00
GassiGiuseppe
ac1ed42c49 Folder DataCleaning renamed to DatasetMerging since it doesn't clean anything
and instead builds the dataset
2025-09-22 17:11:49 +02:00
GassiGiuseppe
edd01a2c83 Dataset updated, the new one is built with the new method
(50 new rows found ... out of 13 million)
2025-09-22 16:57:06 +02:00
GassiGiuseppe
5aa9e3fcf3 Added to DBPEDIA the query to get Film \ wiki page ID,
plus some editing
2025-09-22 15:42:57 +02:00
GassiGiuseppe
0970cabf92 reverse.csv: grammar correction of the header;
it seemed to have misplaced the header also in the middle of the CSV
2025-09-22 13:47:20 +02:00
GassiGiuseppe
a26d92750f Update movie-pageid.csv: grammar correction of the header 2025-09-22 12:59:35 +02:00
GassiGiuseppe
34c4782232 Dataset.db update; it seems to be correct 2025-09-20 23:33:56 +02:00
GassiGiuseppe
c5439533e6 DataRetrivial update, without df 2025-09-20 23:32:08 +02:00
GassiGiuseppe
8819b8e87f DataRetrivial populates the db from CSV 2025-09-20 19:56:24 +02:00
Christian Risi
1076dc8aa6 Run /Scripts/DataCleaning/SQL_Queries/db_creation.sql 2025-09-20 16:39:16 +02:00
Christian Risi
3d15e03b09 Renamed file to fix spelling 2025-09-20 16:38:38 +02:00
Christian Risi
0ee2ec6fcd Spelling corrections 2025-09-20 16:37:57 +02:00
Christian Risi
95cfa5486c Added instructions to create database schema 2025-09-20 16:30:08 +02:00
GassiGiuseppe
0d30e90ee0 Created file for the db DatawareHouse;
also decided on first schema models in DBMerger
2025-09-20 15:53:32 +02:00
GassiGiuseppe
faaba17a98 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-20 14:34:25 +02:00
Christian Risi
854e5f1d98 Updated file to gather data from wikipedia 2025-09-20 14:32:30 +02:00
GassiGiuseppe
242d7f674f Wikipedia summary file uploaded:
dataset composed of PageId and Wikipedia summary
2025-09-20 14:32:25 +02:00
Christian Risi
de8c2afceb Added reconciliation 2025-09-19 22:22:09 +02:00
Christian Risi
f89dffff75 Created script to gather wikipedia abstracts 2025-09-19 19:01:38 +02:00
GassiGiuseppe
e39bad8348 Added Troubleshooting section to README,
which addresses some potential issues with git and big files
2025-09-19 13:39:56 +02:00
GassiGiuseppe
7a1a221017 Update of the movie-pageid database,
which has the film URI as subject and the wiki page ID as object
2025-09-19 13:37:56 +02:00
Christian Risi
fafe6ae0f9 Modified tree structure with more TMP directories 2025-09-19 12:46:31 +02:00
Christian Risi
e32444df75 Updated fetchdata to be used in terminal
Changes:
  - now you can use it as if it were a CLI command

Missing:
  - documentation
2025-09-19 12:35:15 +02:00
Christian Risi
b74b7ac4f0 Added new directories to make experiments and updated .gitignore
Changes:
  - Added /Scripts/Experiments/Queries to keep track
      of important queries, once set
  - Added /Scripts/Experiments/Tmp to run quick experiments
      when still unsure while exploring datasets
2025-09-19 08:43:54 +02:00
Christian Risi
22134391d9 Added Scripts/Experiment directory
This directory is for files used to run experiments
2025-09-19 08:41:46 +02:00
Christian Risi
82c9023849 Ignoring Scripts/Experiments files and always tracking .gitkeep files 2025-09-19 08:39:47 +02:00
Christian Risi
00b87e01ea Moved fetchdata.py to reflect working tree
old - ${Proj}/Scripts/fetchdata.py
new - ${Proj}/Scripts/DataGathering/fetchdata.py
2025-09-19 08:37:04 +02:00
Christian Risi
ce3d4bf6c5 Renamed dir from Script to Scripts 2025-09-19 08:31:00 +02:00
GassiGiuseppe
c415b175a0 added reverse.csv with the relations incoming to films 2025-09-18 20:26:51 +02:00
GassiGiuseppe
ec81ea7930 Added file to gather Wikipedia abstracts from URL 2025-09-18 20:26:11 +02:00
GassiGiuseppe
4bb03f86b3 Added file to study the most frequent relationships in a CSV of triplets 2025-09-18 20:25:25 +02:00
GassiGiuseppe
e5f201f3db DEVELOPMENT markdown file created 2025-09-18 20:24:54 +02:00
GassiGiuseppe
1c715dc569 Typo correction in the markdown 2025-09-18 20:24:11 +02:00
GassiGiuseppe
6686b47328 Added SQL to obtain wikipedia url with movies 2025-09-18 20:23:10 +02:00
GassiGiuseppe
9a5a7d84fd Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-18 19:20:26 +02:00
GassiGiuseppe
9678ece9c0 Requirements changed:
added Pandas and some others
2025-09-18 19:07:38 +02:00
Christian Risi
67bcd732b5 Updated movies 2025-09-18 18:36:52 +02:00
Christian Risi
1a4f900500 Updated git attributes 2025-09-18 18:36:42 +02:00
Christian Risi
ca8729b67c Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-18 18:36:30 +02:00
GassiGiuseppe
9dbffc52ed Added dataset of movies and their wikipedia's page link 2025-09-18 18:16:51 +02:00
Christian Risi
b7f504942a Created Dataset 2025-09-18 17:24:08 +02:00
Christian Risi
7f0c5ce8d3 Updated File for fetching 2025-09-18 17:23:56 +02:00
Christian Risi
9838e287a4 Updated file 2025-09-18 12:03:09 +02:00
Christian Risi
ca6143ea3c Updated Query histories 2025-09-18 11:46:32 +02:00
Christian Risi
16e7ab4d9f Modified Datasets 2025-09-17 17:30:51 +02:00
Christian Risi
28723ab662 Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-17 17:06:16 +02:00
Christian Risi
3e59efcf33 Generated datasets 2025-09-17 17:06:14 +02:00
Christian Risi
7c04309cc1 Added script to fetch data from DBPedia 2025-09-17 17:05:27 +02:00
Christian Risi
db87295890 Added history of queries 2025-09-17 17:04:58 +02:00
GassiGiuseppe
61568200a8 README update with setup chapter,
which scripts the commands to manage conda and pip
2025-09-17 16:50:50 +02:00
Christian Risi
8df2736b97 Added environments 2025-09-17 16:16:58 +02:00
Christian Risi
eb5b7f629a Conda env 2025-09-17 15:53:17 +02:00
Christian Risi
79232b391e First SparQL query 2025-09-17 14:26:37 +02:00
Christian Risi
72eb937b47 Fixed Markdown violations 2025-09-17 12:51:14 +02:00
Christian Risi
cececa14ce Merge branch 'dev' of https://repositories.communitynotfound.work/PoliBa-DeepLearning/NanoSocrates into dev 2025-09-17 12:48:34 +02:00
Christian Risi
2487d44abd Added SparQL 2025-09-17 12:48:33 +02:00
GassiGiuseppe
553b86cac2 Resources file updated with Byte-Pair Encoding,
a technique we will use to tokenize the input words
2025-09-17 12:06:01 +02:00
Christian Risi
12bd781fd3 Added workspace recommendations 2025-09-17 11:38:23 +02:00
Christian Risi
463f4907b8 Added Resources documentation 2025-09-17 11:36:02 +02:00
29 changed files with 1470 additions and 2 deletions

1
.gitattributes vendored

@ -1,2 +1,3 @@
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
Assets/** filter=lfs diff=lfs merge=lfs -text
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text

6
.gitignore vendored

@ -189,7 +189,8 @@ ipython_config.py
.LSOverride
# Icon must end with two \r
Icon
Icon
# Thumbnails
._*
@ -251,3 +252,6 @@ $RECYCLE.BIN/
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
# ---> Custom
**/Tmp/**
!**/.gitkeep

14
.vscode/extensions.json vendored Normal file

@ -0,0 +1,14 @@
{
"recommendations": [
"bierner.github-markdown-preview",
"bierner.markdown-checkbox",
"bierner.markdown-emoji",
"bierner.markdown-footnotes",
"bierner.markdown-mermaid",
"bierner.markdown-preview-github-styles",
"bierner.markdown-yaml-preamble",
"davidanson.vscode-markdownlint",
"kejun.markdown-alert",
"yzhang.markdown-all-in-one"
]
}

BIN
Assets/Dataset/1-hop/dataset.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:331d8ef4e99c5200f1323e7149bd8aade39dc17ee5778b553bb32c593ff601cf
size 2443211793

BIN
Assets/Dataset/1-hop/movie-pageid.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:88e387ed1338bdfd34ded22f3f8bebb2be5127857bf36fcffc266b35c534587c
size 10148507

BIN
Assets/Dataset/1-hop/movies.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:8d81c8801ea79bd46747769a288cd0c507b3b94b2fb4bbb9605e282776ca5efb
size 8808636

BIN
Assets/Dataset/1-hop/reverse.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:b4878aed66c382e73982b19fa02129d5b3c3e3e8690c28e4dd662257e1d9b119
size 32343972

BIN
Assets/Dataset/1-hop/wikipedia-movie.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:1730dc111c0290b16d094a4b6a6577d966978d97ee9ef4202e86148cc9d8e8e8
size 17445736

BIN
Assets/Dataset/1-hop/wikipedia-summary.csv (Stored with Git LFS) Normal file

Binary file not shown.
version https://git-lfs.github.com/spec/v1
oid sha256:ef7b680257f16b193a9b4ea2914564b58c676955809e6b9d58058adaab7855c1
size 73089553

BIN
Assets/Dataset/DatawareHouse/dataset.db (Stored with Git LFS) Normal file

Binary file not shown.

README.md

@ -1,3 +1,28 @@
# NanoSocrates
This is the work project for the DeepLearning exam of 16th September 2025
## Index
- [Resources](./docs/RESOURCES.md)
## Setup
Create and activate your Conda environment with:

    conda env create -f environment.yaml
    conda activate deep_learning

Now install the dependencies with pip:

    pip install -r requirements.txt

## Troubleshooting
Sometimes, when uploading a really large batch of data, git can abort the upload because of a timeout.
The solution is to locally change its settings:

    git config lfs.dialtimeout 3600
    git config lfs.activitytimeout 3600

For more details, check this link: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory


@ -0,0 +1,139 @@
import argparse
import sys


class ProgramArgs:
    def __init__(self, file: str, output: str, threshold: int):
        self.file = file
        self.output = output
        self.threshold = threshold


class Node:
    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        return len(self.children) == 0

    def append_child(self, child: list[str]):
        KEY = child[0]
        if not self.children.get(KEY):
            self.children[KEY] = Node(KEY, 0)
        CHILD = self.children[KEY]
        self.quantity += 1
        if len(child) == 1:
            return
        new_children = child[1:]
        CHILD.append_child(new_children)

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"


def get_args(args: list[str]) -> ProgramArgs:
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--threshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)
    return ProgramArgs(
        parsed_args.input_file, parsed_args.output_file, parsed_args.threshold
    )


def get_debug_args() -> ProgramArgs:
    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    # assumed debug output path; the original snippet omitted this argument
    OUTPUT = "./Assets/Dataset/Tmp/reverse-rel-tree.txt"
    THRESHOLD = 1
    return ProgramArgs(FILE, OUTPUT, THRESHOLD)


def tree_like(file: str, out: str, threshold: int):
    INDENTATION = " "
    properties: dict[str, Node] = {}
    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)

    # Split each row on "/" and grow the matching tree
    FILE = open(file, "r", encoding="utf-8")
    for row in FILE:
        sections = row.split("/")
        sections = list(filter(lambda item: item != "", sections))
        if sections[0] != "http:" and sections[0] != "https:":
            properties["pure"].append_child(sections)
            continue
        properties["URI"].append_child(sections)
    FILE.close()

    # Depth-first write of every node above the threshold
    stack: list[tuple[Node, int]] = []
    for _, item in properties.items():
        stack.append((item, 0))

    OUT = open(out, mode="w", encoding="utf-8")
    while len(stack) > 0:
        LAST_ITEM = stack.pop()
        NODE: Node = LAST_ITEM[0]
        DEPTH: int = LAST_ITEM[1]
        INDENT: str = INDENTATION * DEPTH
        if NODE.quantity < threshold:
            continue
        OUT.write(f"{INDENT}- {NODE}\n")
        if NODE.is_leaf:
            continue
        CHILDREN = []
        for _, child in NODE.children.items():
            CHILDREN.append((child, DEPTH + 1))
        stack.extend(CHILDREN)
    OUT.close()


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.output, ARGS.threshold)


@ -0,0 +1,53 @@
import argparse
import sys

import pandas as pd


class ProgramArgs:
    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        self.input_file = input_file
        self.column = column
        self.output_file = output_file
        self.count = count


def get_args(args: list[str]) -> ProgramArgs:
    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--column", "--col", required=True, type=str)
    PARSER.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    parsed_args, _ = PARSER.parse_known_args(args)
    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.column,
        parsed_args.output_file,
        parsed_args.count,
    )


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    OUTPUT_FILE = open(ARGS.output_file, "w+", encoding="utf-8")

    # Load the CSV
    df = pd.read_csv(ARGS.input_file)

    # Count occurrences of each unique value in the column
    item_counts = df[ARGS.column].value_counts()

    # Write the values (and, with --count, their counts)
    for item, count in item_counts.items():
        if ARGS.count:
            OUTPUT_FILE.write(f"{item}: {count}\n")
        else:
            OUTPUT_FILE.write(f"{item}\n")

    OUTPUT_FILE.close()

Scripts/DataGathering/fetchdata.py

@ -0,0 +1,146 @@
import argparse
from math import floor
import sys
from time import sleep

import SPARQLWrapper


class ProgramData:
    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        self.local_url = local_url
        self.query_url = query_url
        self.sparql_url = sparql_url
        self.output_type = output_type
        self.initial_offset = initial_offset
        self.timeout = timeout
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        # base offset; the paging math in fetch_data compensates for this
        return self.limit

    @property
    def query(self):
        with open(self.query_url, "r") as file:
            return file.read()


DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)


def gather_cli_args(args: list[str]) -> ProgramData:
    # TODO: Add argument for type
    PARSER = argparse.ArgumentParser("sparql data fetcher")
    PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
    PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
    PARSER.add_argument("--limit", type=int, default=LIMIT)
    PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
    PARSER.add_argument("--verbose", "-v", action="count", default=0)
    parsed_args, _ = PARSER.parse_known_args(args)
    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        SPARQLWrapper.CSV,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose,
    )


def fetch_data(DATA: ProgramData):
    # Take correction of page into account
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    done = False
    while not done:
        print(f"Starting to get page {page}")
        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
        sparql.setReturnFormat(DATA.output_type)
        # Page through the query with LIMIT/OFFSET
        CURRENT_PAGE_QUERY = "\n".join([
            DATA.query,
            f"LIMIT {DATA.limit}",
            f"OFFSET {CURRENT_OFFSET}",
        ])
        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")
        sparql.setQuery(CURRENT_PAGE_QUERY)
        try:
            res = sparql.queryAndConvert()
            text = ""
            if isinstance(res, bytes):
                # Keep the CSV header only for page 0
                first_line = 0
                if page != 0:
                    first_line = 1
                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[first_line:])
            if text == "":
                done = True
                continue
            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(text)
        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")
        page += 1
        if page == DATA.max_pages - 1:
            done = True
        print(f"Sleeping for {DATA.timeout}")
        sleep(DATA.timeout)


if __name__ == "__main__":
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)


@ -0,0 +1,154 @@
from pathlib import Path
import csv
import time

import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"

sess = requests.Session()
CHUNK = 20


# Get clean intro text from Wikipedia for a chunk of page IDs
def get_clean_text(pageIDS: list[str]):
    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0"
        " (https://example.org/coolbot/; coolbot@example.org)"
    }
    ids = "|".join(pageIDS)

    start_fetch = time.time()
    res = sess.get(
        headers=headers,
        url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json",
    )
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")

    data = res.json()
    abstracts = {}

    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")
                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")
    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()
    print(
        f"Time elapsed FULL: {end_full - start_full} seconds"
        f"\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds"
    )
    return abstracts


def flush(movie_ids):
    abstracts = get_clean_text(movie_ids)
    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()
    print(f"Time elapsed WRITE: {end - start} seconds")


# Find how many input rows were already processed in a previous run
def reconcile() -> int:
    start = time.time()
    input_file = open(input_csv, "r", newline="", encoding="utf-8")
    output_file = open(output_csv, "r", newline="", encoding="utf-8")
    next(input_file)  # skip the input header
    LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
    current_check = input_file.readline().split(",")[1].replace("\n", "")
    index = 1
    while current_check != LAST_CHECKED:
        current_check = input_file.readline().split(",")[1].replace("\n", "")
        index += 1
    input_file.close()
    output_file.close()
    end = time.time()
    print(f"Time elapsed RECONCILE: {end - start} seconds")
    print(f"FOUND, we need to skip {index} lines")
    return index


SKIP = 0
if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()
else:
    # Only reconcile against an output file from a previous run
    SKIP = reconcile()

# Read input CSV, skipping rows already processed
with open(input_csv, "r", newline="", encoding="utf-8") as f_in:
    for i in range(0, SKIP):
        next(f_in)
    reader = csv.reader(f_in)
    index = -1
    movie_ids = []
    for line in reader:
        index += 1
        if index == 0:
            continue  # skip header
        # Collect page IDs and flush them in chunks
        movie_ids.append(line[1])
        if index % CHUNK == 0:
            flush(movie_ids)
            movie_ids = []
    # Flush any leftover IDs from the last partial chunk
    if movie_ids:
        flush(movie_ids)

db_creation.sql

@ -0,0 +1,65 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);

query.sql

@ -0,0 +1,35 @@
-- Insert MovieURI into Movies; MovieID is auto-incremented
INSERT INTO Movies (MovieURI) VALUES (?);

-- Get MovieID where MovieURI equals the given value
SELECT MovieID FROM Movies WHERE MovieURI = ?;

-- SetPageId
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);

-- Get MovieId by PageID ... (to create WikipediaAbstract)
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;

-- SetAbstract ...
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);

-- SetOrigin
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");

-- GetOrigin
SELECT OriginID FROM Origins WHERE OriginName = ?;

-- Subject, Relationship, Object, RDF
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
INSERT INTO Relationships (RelationshipURI) VALUES (?);
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);

SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;

INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);


@ -0,0 +1,26 @@
# HOW THE DATASET IS BUILT AND POPULATED
Note: the data are taken from CSV files in 1-hop
## CSV files composition
| CSV files | Original structure | Saved AS |
|--------------------|---------------------------------------|-------------------------------------|
| Wikipedia-summary  | PageId / abstract                     | subject, text                       |
| Movies | Movie URI | "subject" |
| Dataset | Movie URI / Relationship / Object [RDF] | subject, relationship, object |
| Movies-PageId | Movie URI / PageId (wiki) | "subject", "object" |
| Reverse | Subject / Relationship / Movie URI | "subject", "relationship", "object" |
## Wanted tables schema
| Table | Columns |
|---------------|-------------------------------------------------------------------------|
| Movies | MovieID [PK], Movie URI |
| WikiPageIDs | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)* |
| Abstracts | MovieID [PK, FK], abstract |
| Subjects | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
| Objects | ObjectID [PK], RDF Object, OriginID [FK] |
| Origins | OriginID [PK], Origin Name |
| RDFs | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |
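
To make the mapping concrete: a movie's triples can be reassembled by joining the normalized tables back together. Below is a minimal sketch in Python, assuming the table and column names from db_creation.sql; the database path and the helper's name are illustrative, not part of the repository.

```python
import sqlite3

# Assumed location of the generated database (see DatawareHouse above)
DB_PATH = "./Assets/Dataset/DatawareHouse/dataset.db"


def movie_triples(movie_uri: str) -> list[tuple[str, str, str]]:
    """Return all (subject, relationship, object) triples stored for one movie."""
    conn = sqlite3.connect(DB_PATH)
    curs = conn.cursor()
    curs.execute(
        """
        SELECT s.SubjectURI, r.RelationshipURI, o.ObjectURI
        FROM RDFs AS rdf
        JOIN Movies        AS m ON m.MovieID        = rdf.MovieID
        JOIN Subjects      AS s ON s.SubjectID      = rdf.SubjectID
        JOIN Relationships AS r ON r.RelationshipID = rdf.RelationshipID
        JOIN Objects       AS o ON o.ObjectID       = rdf.ObjectID
        WHERE m.MovieURI = ?;
        """,
        [movie_uri],
    )
    rows = curs.fetchall()
    conn.close()
    return rows
```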

datawarehouse.py

@ -0,0 +1,375 @@
import sqlite3
import csv
#####################################################################
# This file builds DatawareHouse/dataset.db from the 1-hop csv files
# Its schema is in ./SQL_Queries/db_creation.sql
# The SQL queries used to populate it are in ./SQL_Queries/query.sql
#####################################################################

# Sometimes you may need to build a new db file; here is a little snippet for you:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"

MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline="", encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline="", encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline="", encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline="", encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline="", encoding="utf-8")

CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
# MARK: SQL Definitions

def insertOrigin(curs: sqlite3.Cursor) -> bool:
    QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(QUERY)
        return True
    except sqlite3.IntegrityError:
        return False


def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
    curs.execute(QUERY, [originName])
    originId = curs.fetchone()
    if not originId:
        return None
    # the real id is the first element of the tuple
    return originId[0]


def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(QUERY, [movieUri])
        return True
    except sqlite3.IntegrityError:
        return False


def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
    curs.execute(QUERY, [movieUri])
    movieId = curs.fetchone()
    if not movieId:
        return None
    return movieId[0]


def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, pageId])
        return True
    except sqlite3.IntegrityError:
        return False


def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
    curs.execute(QUERY, [pageId])
    movieId = curs.fetchone()
    if not movieId:
        return None
    return movieId[0]


def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, abstract])
        return True
    except sqlite3.IntegrityError:
        return False


def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [subjectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False


def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(QUERY, [relationshipURI])
        return True
    except sqlite3.IntegrityError:
        return False


def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False


def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
    curs.execute(QUERY, [subjectURI])
    subjectId = curs.fetchone()
    if not subjectId:
        return None
    return subjectId[0]


def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
    curs.execute(QUERY, [relationshipURI])
    relationshipId = curs.fetchone()
    if not relationshipId:
        return None
    return relationshipId[0]


def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
    curs.execute(QUERY, [objectURI])
    objectId = curs.fetchone()
    if not objectId:
        return None
    return objectId[0]


def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int,
) -> bool:
    QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
        curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
        return True
    except sqlite3.IntegrityError:
        return False
# MARK: Parsing

def parseMovies():
    CSV_READER = csv.reader(MOVIES_CSV_HANDLER)
    next(CSV_READER)  # skip header
    for row in CSV_READER:
        MOVIE = row[0]
        insertMovie(CURS, MOVIE)


def parseWikiPageId():
    CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER)
    for row in CSV_READER:
        MOVIE_URI = row["subject"]
        WIKI_PAGE_ID = int(row["object"])
        MOVIE_ID = selectMovieId(CURS, MOVIE_URI)
        if MOVIE_ID is None:
            print(f"The MovieUri: {MOVIE_URI} has no MovieId")
            continue
        insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID)


def parseAbstract():
    CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
    for row in CSV_READER:
        WIKI_PAGE_ID = int(row["subject"])
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
        if MOVIE_ID is None:
            print(f"The WikiPageId: {WIKI_PAGE_ID} has no MovieId")
            continue
        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)


def parseRDF_Reverse():
    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
    total = 0
    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

        insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        # in reverse.csv the movie is the object of the triple
        MOVIE_ID = selectMovieId(CURS, OBJECT)

        skip = False
        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True
        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True
        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True
        if MOVIE_ID is None:
            print(f"No MovieId for {OBJECT}")
            skip = True
        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1
    print(total)


def parseRDF_Dataset():
    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]

        rdf_idx += 1
        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

        insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        # in dataset.csv the movie is the subject of the triple
        MOVIE_ID = selectMovieId(CURS, SUBJECT)

        skip = False
        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True
        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True
        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True
        if MOVIE_ID is None:
            print(f"No MovieId for {SUBJECT}")
            skip = True
        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1
    print(total)
# MARK: Actual Code
# Uncomment the steps to run (presumably executed one at a time)
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseRDF_Reverse()
# parseRDF_Dataset()

CONN.commit()
CONN.close()

MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()
"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""
"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""


215
docs/DBPEDIA.md Normal file

@ -0,0 +1,215 @@
# DBPedia
## GraphIRI
This is the graph identifier (URI):
`http://dbpedia.org`
## History of queries
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
{
SELECT ?object
WHERE {
?m rdf:type dbo:Film .
?object ?r ?m
}
}
}
```
### 2 Hops
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
{
SELECT ?object
WHERE {
?m rdf:type dbo:Film .
?object ?r ?m
FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
}
}
LIMIT 1000000
```
### 1 Hop
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
LIMIT 1000000
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject
WHERE {
?subject rdf:type dbo:Film .
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject
WHERE {
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
#### Wikipedia-movie
a.k.a. the file with the Wikipedia abstracts
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject , ?object
WHERE {
?subject foaf:primaryTopic ?object .
?object rdf:type dbo:Film
}
```
#### Reverse
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
?a foaf:primaryTopic ?object
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
#### Film \ wiki page ID
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?subject ?pageID
WHERE {
?subject rdf:type dbo:Film .
?subject dbo:wikiPageID ?pageID .
?subject rdfs:label ?label .
FILTER (lang(?label) = "en")
}
```

3
docs/DEVELOPMENT.md Normal file

@ -0,0 +1,3 @@
# Development
## Data Gathering

108
docs/RESOURCES.md Normal file

@ -0,0 +1,108 @@
# Resources
## Byte-Pair Encoding (BPE)
### Overview
Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.
---
### Key Idea
BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.
---
### Algorithm Steps
1. **Initialization**
- Treat each character of the input text as a token.
2. **Find Frequent Pairs**
- Count all adjacent token pairs in the sequence.
3. **Merge Most Frequent Pair**
- Replace the most frequent pair with a new symbol not used in the text.
4. **Repeat**
- Continue until no frequent pairs remain or a desired vocabulary size is reached.
---
### Example
Suppose the data to be encoded is:
```text
aaabdaaabac
```
#### Step 1: Merge `"aa"`
Most frequent pair: `"aa"` → replace with `"Z"`
```text
ZabdZabac
Z = aa
```
---
#### Step 2: Merge `"ab"`
Most frequent pair: `"ab"` → replace with `"Y"`
```text
ZYdZYac
Y = ab
Z = aa
```
---
#### Step 3: Merge `"ZY"`
Most frequent pair: `"ZY"` → replace with `"X"`
```text
XdXac
X = ZY
Y = ab
Z = aa
```
---
At this point, no pairs occur more than once, so the process stops.
---
### Decompression
To recover the original data, replacements are applied in **reverse order**:
```text
XdXac
→ ZYdZYac
→ ZabdZabac
→ aaabdaaabac
```
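
In code, the whole scheme is compact. The following is a minimal illustrative sketch (not the repository's tokenizer): `bpe_compress` greedily merges the most frequent adjacent pair and records each merge, and `bpe_decompress` undoes the merges in reverse order; the generated symbols `<0>`, `<1>`, ... stand in for `Z`, `Y`, `X` from the example above.

```python
from collections import Counter


def bpe_compress(text: str, max_merges: int = 100):
    """Greedily merge the most frequent adjacent pair until none repeats."""
    tokens = list(text)
    merges = []  # (new_symbol, pair), in the order they were applied
    for i in range(max_merges):
        pairs = Counter(zip(tokens, tokens[1:]))
        if not pairs:
            break
        pair, freq = pairs.most_common(1)[0]
        if freq < 2:  # no pair occurs more than once: stop
            break
        new_symbol = f"<{i}>"
        merges.append((new_symbol, pair))
        merged, j = [], 0
        while j < len(tokens):  # replace non-overlapping occurrences, left to right
            if j + 1 < len(tokens) and (tokens[j], tokens[j + 1]) == pair:
                merged.append(new_symbol)
                j += 2
            else:
                merged.append(tokens[j])
                j += 1
        tokens = merged
    return tokens, merges


def bpe_decompress(tokens, merges):
    """Apply the replacements in reverse order to recover the original text."""
    for new_symbol, (a, b) in reversed(merges):
        expanded = []
        for t in tokens:
            expanded.extend([a, b] if t == new_symbol else [t])
        tokens = expanded
    return "".join(tokens)


tokens, merges = bpe_compress("aaabdaaabac")
assert bpe_decompress(tokens, merges) == "aaabdaaabac"
```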
---
### Advantages
- **Efficient vocabulary building**: reduces the need for massive word lists.
- **Handles rare words**: breaks them into meaningful subword units.
- **Balances character- and word-level tokenization**.
---
### Limitations
- Does not consider linguistic meaning—merges are frequency-based.
- May create tokens that are not linguistically natural.
- Vocabulary is fixed after training.

67
docs/SPARQL.md Normal file

@ -0,0 +1,67 @@
# SparQL
> [!NOTE]
> Resources taken from [this website](https://sparql.dev/)
## SQL Queries
### SELECT
```SQL
SELECT ?var1, ?var2, ...
```
### WHERE
```SQL
WHERE {
pattern1 .
pattern2 .
...
}
```
### FILTER
It's used to restrict [`WHERE`](#where) clauses
```SQL
WHERE {
?person <http://example.com/hasCar> ?car .
FILTER (?car = <http://example.com/Car1>)
}
```
### OPTIONAL
It's used to fetch available content if it exists
```SQL
SELECT ?person ?car
WHERE {
?person <http://example.com/hasCar> ?car .
OPTIONAL {
?car <http://example.com/hasColor> ?color .
}
}
```
### LIMIT
Limits results
```SQL
LIMIT 10 -- Take only 10 results
```
## SparQL functions
### COUNT
```SQL
SELECT (COUNT(?person) AS ?count)
WHERE {
?person <http://example.com/hasCar> ?car .
}
```

BIN
environment.yaml Normal file

Binary file not shown.

17
requirements.txt Normal file

@ -0,0 +1,17 @@
certifi==2025.8.3
charset-normalizer==3.4.3
idna==3.10
numpy==2.3.3
pandas==2.3.2
pyparsing==3.2.4
python-dateutil==2.9.0.post0
pytz==2025.2
rdflib==7.1.4
requests==2.32.5
setuptools==78.1.1
six==1.17.0
SPARQLWrapper==2.0.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
Wikipedia-API==0.8.1