Compare commits

..

No commits in common. "dev.splitter" and "main" have entirely different histories.

29 changed files with 2 additions and 1470 deletions

1
.gitattributes vendored
View File

@ -1,3 +1,2 @@
Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
Assets/** filter=lfs diff=lfs merge=lfs -text Assets/** filter=lfs diff=lfs merge=lfs -text
Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text

6
.gitignore vendored
View File

@ -189,8 +189,7 @@ ipython_config.py
.LSOverride .LSOverride
# Icon must end with two \r # Icon must end with two \r
Icon Icon
# Thumbnails # Thumbnails
._* ._*
@ -252,6 +251,3 @@ $RECYCLE.BIN/
# .nfs files are created when an open file is removed but is still being accessed # .nfs files are created when an open file is removed but is still being accessed
.nfs* .nfs*
# ---> Custom
**/Tmp/**
!**/.gitkeep

View File

@ -1,14 +0,0 @@
{
"recommendations": [
"bierner.github-markdown-preview",
"bierner.markdown-checkbox",
"bierner.markdown-emoji",
"bierner.markdown-footnotes",
"bierner.markdown-mermaid",
"bierner.markdown-preview-github-styles",
"bierner.markdown-yaml-preamble",
"davidanson.vscode-markdownlint",
"kejun.markdown-alert",
"yzhang.markdown-all-in-one"
]
}

BIN
Assets/Dataset/1-hop/dataset.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:331d8ef4e99c5200f1323e7149bd8aade39dc17ee5778b553bb32c593ff601cf
3 size 2443211793

BIN
Assets/Dataset/1-hop/movie-pageid.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:88e387ed1338bdfd34ded22f3f8bebb2be5127857bf36fcffc266b35c534587c
3 size 10148507

BIN
Assets/Dataset/1-hop/movies.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:8d81c8801ea79bd46747769a288cd0c507b3b94b2fb4bbb9605e282776ca5efb
3 size 8808636

BIN
Assets/Dataset/1-hop/reverse.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:b4878aed66c382e73982b19fa02129d5b3c3e3e8690c28e4dd662257e1d9b119
3 size 32343972

BIN
Assets/Dataset/1-hop/wikipedia-movie.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:1730dc111c0290b16d094a4b6a6577d966978d97ee9ef4202e86148cc9d8e8e8
3 size 17445736

BIN
Assets/Dataset/1-hop/wikipedia-summary.csv (Stored with Git LFS)

Binary file not shown.
1 version https://git-lfs.github.com/spec/v1
2 oid sha256:ef7b680257f16b193a9b4ea2914564b58c676955809e6b9d58058adaab7855c1
3 size 73089553

BIN
Assets/Dataset/DatawareHouse/dataset.db (Stored with Git LFS)

Binary file not shown.

View File

@ -1,28 +1,3 @@
# NanoSocrates # NanoSocrates
This is the work project for the DeepLearning exam of 16th September 2025 This is the work project for the DeepLearning exam of 16th September 2025
## Index
- [Resources](./docs/RESOURCES.md)
## Setup
Create and activate your Conda environment with:
conda env create -f environment.yaml
conda activate deep_learning
Now install dependencies on pip:
pip install -r requirements.txt
## TroubleShooting
Sometimes, when uploading really large batches of data, git can abort the upload because of its timeout.
The solution is to locally change its settings:
git config lfs.dialtimeout 3600
git config lfs.activitytimeout 3600
For clarification, check this link: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory

View File

@ -1,139 +0,0 @@
import argparse
import csv
import sys
from typing import Self
class ProgramArgs:
    """Bag of CLI options for the tree reporter: input, output, threshold."""

    def __init__(self, file: str, output: str, treshold: int):
        # NOTE: the 'treshold' spelling is kept — callers read ARGS.treshold.
        self.file, self.output, self.treshold = file, output, treshold
class Node:
    """Trie-like counter node: each appended path bumps this node's quantity."""

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        """True when no child nodes have been attached yet."""
        return not self.children

    def append_child(self, child: list[str]):
        """Insert the path *child* below this node, counting traversals.

        The first segment becomes (or reuses) a direct child; the rest of
        the path recurses into that child. The terminal node created for
        the last segment keeps quantity 0 (only traversed nodes count).
        """
        key = child[0]
        if key not in self.children:
            self.children[key] = Node(key, 0)
        self.quantity += 1
        rest = child[1:]
        if rest:
            self.children[key].append_child(rest)

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"
def get_args(args: list[str]) -> ProgramArgs:
    """Parse -i/-o/-t flags into a ProgramArgs; unknown flags are ignored."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "-i", required=True, type=str)
    parser.add_argument("--output-file", "-o", required=True, type=str)
    parser.add_argument("--treshold", "-t", type=int, default=1)
    known, _ = parser.parse_known_args(args)
    return ProgramArgs(known.input_file, known.output_file, known.treshold)
def get_debug_args() -> ProgramArgs:
    """Return hard-coded arguments for local debugging runs (no CLI).

    Bug fix: ProgramArgs requires (file, output, treshold); the output
    path was previously missing, so calling this always raised TypeError.
    """
    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    OUTPUT = "./Assets/Dataset/Tmp/reverse-rel-tree.txt"
    TRESHOLD = 1
    return ProgramArgs(FILE, OUTPUT, TRESHOLD)
def tree_like(file: str, out: str, treshold: int | None = None):
    """Render a frequency tree of URI path segments from *file* into *out*.

    Each input row is split on "/" and grouped under a "pure" root (rows
    without an http/https scheme) or a "URI" root. Nodes whose traversal
    count is below *treshold* are pruned from the report.

    treshold: pruning cut-off; when None, falls back to the module-level
    ARGS.treshold (the previous behavior, which read the global directly).
    """
    if treshold is None:
        # Backward compatible: the old code depended on the global ARGS.
        treshold = ARGS.treshold
    INDENTATION = " "
    properties: dict[str, Node] = {}
    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)
    # Context manager: the input handle was previously closed manually.
    with open(file, "r", encoding="utf-8") as handle:
        for row in handle:
            sections = [part for part in row.split("/") if part != ""]
            if sections[0] != "http:" and sections[0] != "https:":
                properties["pure"].append_child(sections)
                continue
            properties["URI"].append_child(sections)
    # Iterative DFS so deep trees never hit Python's recursion limit.
    stack: list[tuple[Node, int]] = [(item, 0) for item in properties.values()]
    with open(out, mode="w", encoding="utf-8") as sink:
        while stack:
            node, depth = stack.pop()
            if node.quantity < treshold:
                continue
            sink.write(f"{INDENTATION * depth}- {node}\n")
            if node.is_leaf:
                continue
            stack.extend((child, depth + 1) for child in node.children.values())
if __name__ == "__main__":
    # Parse real CLI arguments; swap in get_debug_args() when debugging.
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.output)

View File

@ -1,53 +0,0 @@
import argparse
import sys
import pandas as pd
class ProgramArgs:
    """Parsed CLI options for the column value-count exporter."""

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        # Stored verbatim; no validation happens here.
        self.input_file, self.column = input_file, column
        self.output_file, self.count = output_file, count
def get_args(args: list[str]) -> ProgramArgs:
    """Parse CLI flags into a ProgramArgs; unknown flags are ignored."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-file", "--input", "-i", required=True, type=str)
    parser.add_argument("--output-file", "--output", "-o", required=True, type=str)
    parser.add_argument("--column", "--col", required=True, type=str)
    # store_const(True) with default False behaves as an on/off switch.
    parser.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    known, _ = parser.parse_known_args(args)
    return ProgramArgs(
        known.input_file,
        known.column,
        known.output_file,
        known.count,
    )
if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # Load the CSV; value_counts() yields unique values sorted by frequency.
    df = pd.read_csv(ARGS.input_file)
    item_counts = df[ARGS.column].value_counts()
    # Fix: the output handle was opened but never closed; "with" guarantees
    # it is flushed and released even if a write fails.
    with open(ARGS.output_file, "w+", encoding="utf-8") as OUTPUT_FILE:
        for item, count in item_counts.items():
            if ARGS.count:
                OUTPUT_FILE.write(f"{item}: {count}\n")
            else:
                OUTPUT_FILE.write(f"{item}\n")

View File

@ -1,146 +0,0 @@
import argparse
from math import floor
import sys
from time import sleep
import SPARQLWrapper
class ProgramData:
    """Runtime configuration for the paged SPARQL download loop."""

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        self.local_url = local_url          # file the fetched pages are appended to
        self.query_url = query_url          # path of the query text file
        self.sparql_url = sparql_url        # SPARQL endpoint URL
        self.output_type = output_type      # SPARQLWrapper return format
        self.initial_offset = initial_offset
        self.timeout = timeout              # seconds slept between pages
        self.limit = limit                  # rows requested per page
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        # One page advances the window by exactly one limit's worth of rows.
        return self.limit

    @property
    def query(self):
        # Re-read the query file on every access so edits are picked up.
        with open(self.query_url, "r") as handle:
            return handle.read()
# Default endpoint and paging parameters for the fetch loop.
DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV  # results are requested as CSV
TIMEOUT_SECONDS = 1.5  # pause between page requests (be polite to the endpoint)
LIMIT = int(1E4)  # rows per page
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)  # effectively "no page cap"
def gather_cli_args(args: list[str]) -> ProgramData:
    """Build a ProgramData from CLI flags, falling back to module defaults."""
    # TODO: Add argument for type
    parser = argparse.ArgumentParser("sparql data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str)
    parser.add_argument("--url", type=str, default=DBPEDIA_URL)
    parser.add_argument("--limit", type=int, default=LIMIT)
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES)
    parser.add_argument("--verbose", "-v", action="count", default=0)
    known, _ = parser.parse_known_args(args)
    return ProgramData(
        known.file_path,
        known.query_file,
        known.url,
        SPARQLWrapper.CSV,
        known.offset,
        known.timeout,
        known.limit,
        known.max_pages,
        known.verbose,
    )
def fetch_data(DATA: ProgramData):
    """Download query results page by page, appending them to DATA.local_url.

    Stops when a page comes back empty or DATA.max_pages is reached.

    Bug fix: the loop previously read the module constants LIMIT,
    MAX_PAGES and TIMEOUT_SECONDS, so the --limit/--max-pages/--timeout
    CLI options were parsed but silently ignored; it now honors DATA.
    """
    # Take correction of page into account
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    done = False  # renamed from 'exit', which shadowed the builtin
    while not done:
        print(f"Starting to get page {page}")
        current_offset = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
        sparql.setReturnFormat(DATA.output_type)
        current_page_query = "\n".join([
            DATA.query,
            f"LIMIT {DATA.limit}",
            f"OFFSET {current_offset}"
        ])
        print(f"\nCurrent Query:\n{current_page_query}\n")
        sparql.setQuery(current_page_query)
        try:
            res = sparql.queryAndConvert()
            text = ""
            if type(res) == bytes:
                # Every page after the first repeats the CSV header row:
                # drop it so the output file has a single header.
                first_line = 0
                if page != 0:
                    first_line = 1
                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[first_line:])
            if text == "":
                # An empty page means the result set is exhausted.
                done = True
                continue
            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(
                    text
                )
        except Exception as ex:
            # Best-effort: log and move on to the next page.
            print(f"Something went wrong during page {page}:\n\t{ex}")
        print(f"Sleeping for {DATA.timeout}")
        page += 1
        if page == DATA.max_pages - 1:
            done = True
        sleep(DATA.timeout)
if __name__ == "__main__":
    # Parse CLI options, then stream result pages from the endpoint to disk.
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)

View File

@ -1,154 +0,0 @@
from pathlib import Path
import pandas as pd
import csv
import time
import requests
# Source of (movie URI, Wikipedia page id) pairs.
input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
# Destination for (page id, abstract) rows; appended to across runs.
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"
# One HTTP session reused for every API call (connection keep-alive).
sess = requests.Session()
CHUNK = 20  # page ids fetched per API request
# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):
    """Fetch intro extracts for a batch of Wikipedia page ids.

    Returns a dict mapping page id (str) -> single-line abstract text.
    Pages with no extract are skipped and counted in the log output.
    """
    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        # MediaWiki etiquette: identify the client via User-Agent.
        "User-Agent": "CoolBot/0.0"
        ""
        " (https://example.org/coolbot/; coolbot@example.org)"
    }
    # Batch all page ids into one request, joined by "|".
    ids = "|".join(pageIDS)
    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")
    data = res.json()
    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")
                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    # Flatten the abstract to one line for CSV storage.
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    # NOTE(review): parsing_time keeps only the LAST page's
                    # parse duration, not a running sum — confirm intended.
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")
    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()
    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts
def flush(movie_ids):
    """Fetch abstracts for *movie_ids* and append them to the output CSV."""
    abstracts = get_clean_text(movie_ids)
    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writerows(
            {"subject": page_id, "text": text}
            for page_id, text in abstracts.items()
        )
    end = time.time()
    print(f"Time elapsed WRITE: {end - start} seconds")
def reconcile() -> int:
    """Compute how many input rows were already processed in a prior run.

    Reads the last page id written to output_csv, then scans input_csv
    for the matching row and returns how many data lines to skip on
    resume.

    Fixes: file handles are now context-managed (they leaked on any
    exception), the first comparison no longer keeps the trailing
    newline (a match on the very first row was previously missed), and
    reaching EOF without a match raises ValueError instead of crashing
    with IndexError on an empty readline().
    """
    start = time.time()
    with open(input_csv, "r", newline="", encoding="utf-8") as input_file, \
            open(output_csv, "r", newline="", encoding="utf-8") as output_file:
        next(input_file)  # skip the input header row
        LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
        index = 0
        for line in input_file:
            index += 1
            parts = line.split(",")
            if len(parts) < 2:
                continue
            if parts[1].replace("\n", "") == LAST_CHECKED:
                break
        else:
            raise ValueError(
                f"page id {LAST_CHECKED!r} not found in {input_csv}"
            )
    end = time.time()
    print(f"Time elapsed RECONCILE: {end - start} seconds")
    print(f"FOUND, we need to skip {index} lines")
    return index
if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()
# NOTE(review): on a freshly created output file the last line is the
# header, so reconcile() searches the input for "subject" — confirm this
# script is only meant to be resumed, not started cold.
SKIP = reconcile()
# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:
    # Skip already done
    for i in range(0, SKIP):
        next(input)
    reader = csv.reader(input)
    index = -1
    movie_ids = []
    for line in reader:
        index += 1
        if index == 0:
            continue
        # Save movies in map
        movie_ids.append(line[1])
        if index % CHUNK == 0:
            # Flush movies
            flush(movie_ids)
            movie_ids = []
    # NOTE(review): a trailing partial chunk (len(movie_ids) < CHUNK) is
    # never flushed after the loop — verify whether that is intentional.

View File

@ -1,65 +0,0 @@
-- Core entity: one row per DBpedia movie URI.
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);

-- 1:1 mapping between a movie and its Wikipedia page id.
CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

-- 1:1 mapping between a movie and its Wikipedia abstract text.
CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

-- Which source CSV a subject/object row came from (dataset.csv / reverse.csv).
CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);

-- RDF subjects, deduplicated by URI.
CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

-- RDF predicates (relationship URIs), deduplicated.
CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);

-- RDF objects, deduplicated by URI.
CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

-- Fact table: one (movie, subject, predicate, object) triple per row;
-- the UNIQUE constraint deduplicates triples across source files.
CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

-- Lookup indexes for the join columns used by the loader's SELECTs.
CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);

View File

@ -1,35 +0,0 @@
-- Scratchpad of the parameterized statements used by the DB loader script.

-- Insert MovieURI into Movies ; MovieID is auto incremental
INSERT INTO Movies (MovieURI) VALUES (?);
-- Get MovieID where MovieURI equals the given value
SELECT MovieID FROM Movies WHERE MovieURI = ?;
-- Attach a Wikipedia PageID to a movie
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);
-- Get MovieID by PageID (used when creating WikipediaAbstracts rows)
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;
-- Store the abstract text of a movie
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);
-- Seed the two known origins (run once)
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");
-- Resolve an origin name to its id
SELECT OriginID FROM Origins WHERE OriginName = ?;
-- Subject, Relationship, Object, RDF
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
INSERT INTO Relationships (RelationshipURI) VALUES (?);
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);
SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;
INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);

View File

@ -1,26 +0,0 @@
# HOW THE DATASET IS BUILT AND POPULATED
Note: the data are taken from CSV files in 1-hop
## CSV files composition
| CSV files | Original structure | Saved AS |
|--------------------|---------------------------------------|-------------------------------------|
| Wikipedia-summary | PageId / abstract | subject, text |
| Movies | Movie URI | "subject" |
| Dataset | Movie URI / Relationship / Object [RDF] | subject, relationship, object |
| Movies-PageId | Movie URI / PageId (wiki) | "subject", "object" |
| Reverse | Subject / Relationship / Movie URI | "subject", "relationship", "object" |
## Wanted tables schema
| Table | Columns |
|---------------|-------------------------------------------------------------------------|
| Movies | MovieID [PK], Movie URI |
| WikiPageIDs | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)* |
| Abstracts | MovieID [PK, FK], abstract |
| Subjects | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK] |
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation) |
| Objects | ObjectID [PK], RDF Object, OriginID [FK] |
| Origins | OriginID [PK], Origin Name |
| RDFs | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK] |

View File

@ -1,375 +0,0 @@
import sqlite3
import csv

#####################################################################
# This file builds DatawareHouse/dataset.db from 1-hop csv files    #
# Its Schema is in ./SQL_Queries/db_creation.sql                    #
# The sql queries used to populate it: ./SQL_Queries/query.sql      #
#####################################################################
# sometimes you may need to build a new db file, here a little snippet for you:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
# Target database plus the five source CSVs extracted from DBpedia/Wikipedia.
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"

# Module-level handles shared by every parse* function below; closed at
# the bottom of the script after all stages have run.
MOVIES_CSV_HANDLER = open(MOVIES_CSV,"r",newline='', encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV,"r",newline='', encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV,"r",newline='', encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV,"r",newline='', encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV,"r",newline='', encoding="utf-8")

# Single shared connection/cursor; committed once at the end of the run.
CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()
# MARK: SQL Definitions
# Insert MovieURI
def insertOrigin(curs : sqlite3.Cursor ) -> bool:
QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
try:
curs.execute(QUERY)
return True
except sqlite3.IntegrityError:
return False
def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
curs.execute(QUERY, [originName])
originId = curs.fetchone()
if not originId:
return None
# in this case the real id is the first element of the tuple
return originId[0]
def insertMovie(curs : sqlite3.Cursor , movieUri: str) -> bool:
QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
try:
curs.execute(QUERY,[movieUri])
return True
except sqlite3.IntegrityError:
return False
def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
curs.execute(QUERY, [movieUri])
movieId = curs.fetchone()
if not movieId:
return None
# in this case the real id is the first element of the tuple
return movieId[0]
def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
try:
curs.execute(QUERY,[movieId, pageId])
return True
except sqlite3.IntegrityError:
return False
def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor,pageId: int) -> int | None:
QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
curs.execute(QUERY, [pageId])
movieId = curs.fetchone()
if not movieId:
return None
# in this case the real id is the first element of the tuple
return movieId[0]
def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    """Store the Wikipedia abstract of a movie; False when one exists."""
    try:
        curs.execute("INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);", [movieId, abstract])
    except sqlite3.IntegrityError:
        return False
    return True


def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    """Register an RDF subject URI with its source file; False on duplicate."""
    try:
        curs.execute("INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);", [subjectURI, originID])
    except sqlite3.IntegrityError:
        return False
    return True


def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    """Register an RDF predicate URI; False on duplicate."""
    try:
        curs.execute("INSERT INTO Relationships (RelationshipURI) VALUES (?);", [relationshipURI])
    except sqlite3.IntegrityError:
        return False
    return True


def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    """Register an RDF object URI with its source file; False on duplicate."""
    # Table name is lowercase in this statement; SQLite matches table
    # names case-insensitively, so it still targets Objects.
    try:
        curs.execute("INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);", [objectURI, originID])
    except sqlite3.IntegrityError:
        return False
    return True
def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
curs.execute(QUERY, [subjectURI])
subjectId = curs.fetchone()
if not subjectId:
return None
# in this case the real id is the first element of the tuple
return subjectId[0]
def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
curs.execute(QUERY, [relationshipURI])
relationshipId = curs.fetchone()
if not relationshipId:
return None
# in this case the real id is the first element of the tuple
return relationshipId[0]
def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
curs.execute(QUERY, [objectURI])
objectId = curs.fetchone()
if not objectId:
return None
# in this case the real id is the first element of the tuple
return objectId[0]
def insertRDF(
curs: sqlite3.Cursor,
movieId: int,
subjectId: int,
relationshipId: int,
objectId: int
) -> bool:
QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
try:
curs.execute(QUERY,[movieId,subjectId,relationshipId,objectId])
return True
except sqlite3.IntegrityError:
return False
# MARK: Parsing
def parseMovies():
    """Load every movie URI from movies.csv into the Movies table."""
    CSV_READER = csv.reader(MOVIES_CSV_HANDLER)
    next(CSV_READER)  # skip the header row
    for row in CSV_READER:
        MOVIE = row[0]
        insertMovie(CURS, MOVIE)


def parseWikiPageId():
    """Link each movie URI in movie-pageid.csv to its Wikipedia page id."""
    CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER)
    for row in CSV_READER:
        MOVIE_URI = row["subject"]
        WIKI_PAGE_ID = int(row["object"])
        MOVIE_ID = selectMovieId(CURS, MOVIE_URI)
        if MOVIE_ID is None:
            # URI absent from Movies (e.g. percent-encoded variants): log and skip.
            print(f"The MovieUri: {MOVIE_URI} has not a MovieId ")
            continue
        insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID)


def parseAbstract():
    """Attach Wikipedia abstracts to movies via their page ids."""
    CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
    for row in CSV_READER:
        WIKI_PAGE_ID = int(row["subject"])
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)
        if MOVIE_ID is None:
            # Page id never registered by parseWikiPageId: log and skip.
            print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
            continue
        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)
def parseRDF_Reverse():
    """Load reverse.csv triples (subject -> predicate -> movie) into RDFs.

    In reverse.csv the movie sits on the *object* side of the triple, so
    the owning MovieID is resolved from the object URI.
    """
    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
    total = 0
    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        # Inserts are idempotent: duplicates return False and are ignored.
        insertSubject(CURS,SUBJECT,REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)
        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        # The movie is the RDF object in this file.
        MOVIE_ID = selectMovieId(CURS, OBJECT)
        skip = False
        # guard: every id must resolve before the triple can be linked
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True
        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True
        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True
        if MOVIE_ID is None:
            print(f"No MovieId for {OBJECT}")
            skip = True
        if skip:
            continue
        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1
    # Number of *new* triples actually inserted.
    print(total)
def parseRDF_Dataset():
    """Load dataset.csv triples (movie -> predicate -> object) into RDFs.

    In dataset.csv the movie sits on the *subject* side, so the owning
    MovieID is resolved from the subject URI. Progress is logged every
    100k rows since this file is large.
    """
    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')
    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        rdf_idx += 1
        if rdf_idx % 100000 == 0:
            # Periodic progress log (printing every row would be too noisy).
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        # Inserts are idempotent: duplicates return False and are ignored.
        insertSubject(CURS,SUBJECT,DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)
        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        # The movie is the RDF subject in this file.
        MOVIE_ID = selectMovieId(CURS, SUBJECT)
        skip = False
        # guard: every id must resolve before the triple can be linked
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True
        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True
        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True
        if MOVIE_ID is None:
            print(f"No MovieId for {SUBJECT}")
            skip = True
        if skip:
            continue
        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1
    # Number of *new* triples actually inserted.
    print(total)
# MARK: Actual Code
# Pipeline stages — uncomment the stage(s) you want to (re)run.
# Each stage is idempotent thanks to the UNIQUE constraints, but they
# must run in this order on a fresh database.
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseRDF_Reverse()
# parseRDF_Dataset()

# Single commit at the end; nothing is persisted if the run is aborted.
CONN.commit()
CONN.close()
MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()

# The two string literals below are kept run logs (rows that failed the
# MovieID lookup in past runs); they are inert no-op expressions.
"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""
"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""

View File

@ -1,215 +0,0 @@
# DBPedia
## GraphIRI
This is the graph identifier (URI):
`http://dbpedia.org`
## History of queries
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
{
SELECT ?object
WHERE {
?m rdf:type dbo:Film .
?object ?r ?m
}
}
}
```
### 2 Hops
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
{
SELECT ?object
WHERE {
?m rdf:type dbo:Film .
?object ?r ?m
FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
}
}
LIMIT 1000000
```
### 1 Hop
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
LIMIT 1000000
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
SELECT ?subject
WHERE {
?subject rdf:type dbo:Film .
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject
WHERE {
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?subject rdf:type dbo:Film .
?a foaf:primaryTopic ?subject
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
#### Wikipedia-movie
a.k.a the file with the wikipedia abstract
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject , ?object
WHERE {
?subject foaf:primaryTopic ?object .
?object rdf:type dbo:Film
}
```
#### Reverse
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
?a foaf:primaryTopic ?object
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?subject, ?relationship, ?object
WHERE {
?subject ?relationship ?object .
?object rdf:type dbo:Film .
?a foaf:primaryTopic ?object
FILTER (?relationship NOT IN (
dbo:wikiPageRedirects,
dbo:wikiPageExternalLink,
dbo:wikiPageWikiLink,
foaf:primaryTopic
))
}
```
#### Film \ wiki page ID
```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
SELECT ?subject ?pageID
WHERE {
?subject rdf:type dbo:Film .
?subject dbo:wikiPageID ?pageID .
?subject rdfs:label ?label .
FILTER (lang(?label) = "en")
}
```

View File

@ -1,3 +0,0 @@
# Development
## Data Gathering

View File

@ -1,108 +0,0 @@
# Resources
## Byte-Pair Encoding (BPE)
### Overview
Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.
---
### Key Idea
BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.
---
### Algorithm Steps
1. **Initialization**
- Treat each character of the input text as a token.
2. **Find Frequent Pairs**
- Count all adjacent token pairs in the sequence.
3. **Merge Most Frequent Pair**
- Replace the most frequent pair with a new symbol not used in the text.
4. **Repeat**
- Continue until no frequent pairs remain or a desired vocabulary size is reached.
---
### Example
Suppose the data to be encoded is:
```text
aaabdaaabac
```
#### Step 1: Merge `"aa"`
Most frequent pair: `"aa"` → replace with `"Z"`
```text
ZabdZabac
Z = aa
```
---
#### Step 2: Merge `"ab"`
Most frequent pair: `"ab"` → replace with `"Y"`
```text
ZYdZYac
Y = ab
Z = aa
```
---
#### Step 3: Merge `"ZY"`
Most frequent pair: `"ZY"` → replace with `"X"`
```text
XdXac
X = ZY
Y = ab
Z = aa
```
---
At this point, no pairs occur more than once, so the process stops.
---
### Decompression
To recover the original data, replacements are applied in **reverse order**:
```text
XdXac
→ ZYdZYac
→ ZabdZabac
→ aaabdaaabac
```
---
### Advantages
- **Efficient vocabulary building**: reduces the need for massive word lists.
- **Handles rare words**: breaks them into meaningful subword units.
- **Balances character- and word-level tokenization**.
---
### Limitations
- Does not consider linguistic meaning—merges are frequency-based.
- May create tokens that are not linguistically natural.
- Vocabulary is fixed after training.

View File

@ -1,67 +0,0 @@
# SparQL
> [!NOTE]
> Resources taken from [this website](https://sparql.dev/)
## SQL Queries
### SELECT
```SQL
SELECT ?var1, ?var2, ...
```
### WHERE
```SQL
WHERE {
pattern1 .
pattern2 .
...
}
```
### FILTER
It's used to restrict [`WHERE`](#where) clauses
```SQL
WHERE {
?person <http://example.com/hasCar> ?car .
FILTER (?car = <http://example.com/Car1>)
}
```
### OPTIONAL
It's used to fetch available content if exists
```SQL
SELECT ?person ?car
WHERE {
?person <http://example.com/hasCar> ?car .
OPTIONAL {
?car <http://example.com/hasColor> ?color .
}
}
```
### LIMIT
Limits results
```SQL
LIMIT 10 -- Take only 10 results
```
## SparQL functions
### COUNT
```SQL
SELECT (COUNT(?person) AS ?count)
WHERE {
?person <http://example.com/hasCar> ?car .
}
```

Binary file not shown.

View File

@ -1,17 +0,0 @@
certifi==2025.8.3
charset-normalizer==3.4.3
idna==3.10
numpy==2.3.3
pandas==2.3.2
pyparsing==3.2.4
python-dateutil==2.9.0.post0
pytz==2025.2
rdflib==7.1.4
requests==2.32.5
setuptools==78.1.1
six==1.17.0
SPARQLWrapper==2.0.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
Wikipedia-API==0.8.1