Compare commits

No commits in common. "dev.splitter" and "main" have entirely different histories.

dev.splitter ... main
.gitattributes (vendored): 1 changed line

@@ -1,3 +1,2 @@
 Exam/Deep_Learning_2025_VIII.pdf filter=lfs diff=lfs merge=lfs -text
 Assets/** filter=lfs diff=lfs merge=lfs -text
-Assets/Dataset/1-hop/dataset.csv filter=lfs diff=lfs merge=lfs -text
.gitignore (vendored): 6 changed lines

@@ -189,8 +189,7 @@ ipython_config.py
 .LSOverride

 # Icon must end with two \r
 Icon


 # Thumbnails
 ._*

@@ -252,6 +251,3 @@ $RECYCLE.BIN/
 # .nfs files are created when an open file is removed but is still being accessed
 .nfs*

-# ---> Custom
-**/Tmp/**
-!**/.gitkeep
.vscode/extensions.json (vendored): 14 lines removed

@@ -1,14 +0,0 @@
{
    "recommendations": [
        "bierner.github-markdown-preview",
        "bierner.markdown-checkbox",
        "bierner.markdown-emoji",
        "bierner.markdown-footnotes",
        "bierner.markdown-mermaid",
        "bierner.markdown-preview-github-styles",
        "bierner.markdown-yaml-preamble",
        "davidanson.vscode-markdownlint",
        "kejun.markdown-alert",
        "yzhang.markdown-all-in-one"
    ]
}
BIN  Assets/Dataset/1-hop/dataset.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/movie-pageid.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/movies.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/reverse.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/wikipedia-movie.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/1-hop/wikipedia-summary.csv (stored with Git LFS): binary file not shown
BIN  Assets/Dataset/DatawareHouse/dataset.db (stored with Git LFS): binary file not shown
README.md: 27 changed lines

@@ -1,28 +1,3 @@
 # NanoSocrates

 This is the work project for the Deep Learning exam of 16th September 2025
-
-## Index
-
-- [Resources](./docs/RESOURCES.md)
-
-## Setup
-
-Create and activate your Conda environment with:
-
-    conda env create -f environment.yaml
-    conda activate deep_learning
-
-Now install the dependencies with pip:
-
-    pip install -r requirements.txt
-
-## Troubleshooting
-
-Sometimes, when uploading a really large batch of data, Git LFS can abort the upload because of its timeout.
-The solution is to raise the timeouts in the local Git configuration:
-
-    git config lfs.dialtimeout 3600
-    git config lfs.activitytimeout 3600
-
-For more details see: https://stackoverflow.com/questions/58961697/i-o-timeout-when-pushing-to-a-git-reporsitory
@@ -1,139 +0,0 @@
import argparse
import csv
import sys
from typing import Self


class ProgramArgs:

    def __init__(self, file: str, output: str, treshold: int):
        self.file = file
        self.output = output
        self.treshold = treshold


class Node:

    def __init__(
        self,
        name: str,
        quantity: int = 0,
    ):
        self.name = name
        self.quantity = quantity
        self.children: dict[str, Node] = {}

    @property
    def is_leaf(self):
        return len(self.children) == 0

    def append_child(self, child: list[str]):

        # print(child)
        KEY = child[0]

        if not self.children.get(KEY):
            self.children[KEY] = Node(KEY, 0)

        CHILD = self.children[KEY]
        self.quantity += 1

        if len(child) == 1:
            return

        new_children = child[1:]

        CHILD.append_child(new_children)

    def __str__(self):
        return f"{self.name}/ - {self.quantity}"


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "-o", required=True, type=str)
    PARSER.add_argument("--treshold", "-t", type=int, default=1)
    parsed_args, _ = PARSER.parse_known_args(args)

    # print(parsed_args.input_file)

    return ProgramArgs(parsed_args.input_file, parsed_args.output_file, parsed_args.treshold)  # type ignore


def get_debug_args() -> ProgramArgs:

    FILE = "./Assets/Dataset/Tmp/reverse-rel.txt"
    TRESHOLD = 1

    return ProgramArgs(
        FILE,
        TRESHOLD
    )


def tree_like(file: str, out: str):

    INDENTATION = " "

    properties: dict[str, Node] = {}

    properties["pure"] = Node("pure", 0)
    properties["URI"] = Node("uri", 0)

    FILE = open(file, "r", encoding="utf-8")

    for row in FILE:

        sections = row.split("/")
        sections = list(filter(lambda item: item != "", sections))

        # print(sections)

        if sections[0] != "http:" and sections[0] != "https:":
            properties["pure"].append_child(sections)
            continue

        properties["URI"].append_child(sections)

    FILE.close()

    stack: list[tuple[Node, int]] = []

    for _, item in properties.items():
        stack.append((item, 0))

    OUT = open(out, mode="w", encoding="utf-8")

    while len(stack) > 0:

        LAST_ITEM = stack.pop()

        NODE: Node = LAST_ITEM[0]
        DEPTH: int = LAST_ITEM[1]

        INDENT: str = INDENTATION * DEPTH

        if NODE.quantity < ARGS.treshold:
            continue

        OUT.write(f"{INDENT}- {NODE}\n")

        if NODE.is_leaf:
            continue

        CHILDREN = []

        for _, child in NODE.children.items():
            CHILDREN.append((child, DEPTH + 1))

        stack.extend(CHILDREN)

    OUT.close()


if __name__ == "__main__":
    ARGS = get_args(sys.argv)
    # ARGS = get_debug_args()
    tree_like(ARGS.file, ARGS.output)
@@ -1,53 +0,0 @@
import argparse
import sys
import pandas as pd


class ProgramArgs:

    def __init__(
        self, input_file: str, column: str, output_file: str, count: bool
    ) -> None:
        self.input_file = input_file
        self.column = column
        self.output_file = output_file
        self.count = count


def get_args(args: list[str]) -> ProgramArgs:

    PARSER = argparse.ArgumentParser()
    PARSER.add_argument("--input-file", "--input", "-i", required=True, type=str)
    PARSER.add_argument("--output-file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--column", "--col", required=True, type=str)
    PARSER.add_argument(
        "--count", "-c", action="store_const", const=True, default=False
    )
    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramArgs(
        parsed_args.input_file,
        parsed_args.column,
        parsed_args.output_file,
        parsed_args.count,
    )  # type ignore


if __name__ == "__main__":
    ARGS = get_args(sys.argv)

    OUTPUT_FILE = open(ARGS.output_file, "w+", encoding="utf-8")

    # Load the CSV
    df = pd.read_csv(ARGS.input_file)

    # Count occurrences of each unique last part
    item_counts = df[ARGS.column].value_counts()

    # Print the counts
    for item, count in item_counts.items():

        if ARGS.count:
            OUTPUT_FILE.write(f"{item}: {count}\n")
        else:
            OUTPUT_FILE.write(f"{item}\n")
@@ -1,146 +0,0 @@
import argparse
from math import floor
import sys
from time import sleep
import SPARQLWrapper


class ProgramData:

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:

        self.local_url = local_url
        self.query_url = query_url
        self.sparql_url = sparql_url
        self.output_type = output_type
        self.initial_offset = initial_offset
        self.timeout = timeout
        self.limit = limit
        self.max_pages = max_pages
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        return self.limit

    @property
    def query(self):

        with open(self.query_url, "r") as file:
            return file.read()


DBPEDIA_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
INITIAL_OFFSET = 0
MAX_PAGES = int(1E9)


def gather_cli_args(args: list[str]) -> ProgramData:

    # TODO: Add argument for type
    PARSER = argparse.ArgumentParser("sparql data fetcher")
    PARSER.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    PARSER.add_argument("--query-file", "--query", "-q", required=True, type=str)
    PARSER.add_argument("--url", type=str, default=DBPEDIA_URL)
    PARSER.add_argument("--limit", type=int, default=LIMIT)
    PARSER.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    PARSER.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    PARSER.add_argument("--max-pages", type=int, default=MAX_PAGES)
    PARSER.add_argument("--verbose", "-v", action="count", default=0)

    parsed_args, _ = PARSER.parse_known_args(args)

    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        SPARQLWrapper.CSV,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose
    )
    # type: ignore


def fetch_data(DATA: ProgramData):

    # Take correction of page into account
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    exit = False

    while not exit:

        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(DATA.offset + (page * DATA.limit))
        sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)

        sparql.setReturnFormat(TYPE)

        CURRENT_PAGE_QUERY = "\n".join([
            DATA.query,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()
            text = ""

            if type(res) == bytes:

                initial_offset = 0

                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:

                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(
                    text
                )

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")

        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


if __name__ == "__main__":
    DATA = gather_cli_args(sys.argv)
    fetch_data(DATA)
@@ -1,154 +0,0 @@
from pathlib import Path
import pandas as pd

import csv
import time
import requests

input_csv = "./Assets/Dataset/1-hop/movie-pageid.csv"
output_csv = "./Assets/Dataset/Tmp/wikipedia-summary.csv"


sess = requests.Session()

CHUNK = 20


# Function to get clean full text from Wikipedia PageID
def get_clean_text(pageIDS: list[str]):

    parsing_time = 0
    start_full = time.time()
    API_URL = "https://en.wikipedia.org/w/api.php"
    headers = {
        "User-Agent": "CoolBot/0.0"
        ""
        " (https://example.org/coolbot/; coolbot@example.org)"
    }

    ids = "|".join(pageIDS)

    start_fetch = time.time()
    res = sess.get(headers=headers, url=f"{API_URL}?action=query&pageids={ids}&prop=extracts&exintro=1&explaintext=1&format=json")
    end_fetch = time.time()
    fetch_time = end_fetch - start_fetch
    print(f"Time elapsed FETCH: {fetch_time} seconds")

    data = res.json()

    abstracts = {}
    # Make sure 'query' and the page exist
    SKIPPED = 0
    if "query" in data and "pages" in data["query"]:
        for pageID in pageIDS:
            if pageID in data["query"]["pages"]:
                page = data["query"]["pages"][pageID]
                extract: str = page.get("extract")

                if extract:
                    print(f"Entry FOUND for pageID {pageID}")
                    start_parse = time.time()
                    extract = extract.strip()
                    extract = extract.replace("\n", "")
                    end_parse = time.time()
                    parsing_time = end_parse - start_parse
                    print(f"Time elapsed PARSE: {parsing_time} seconds")
                    abstracts[pageID] = extract
                else:
                    SKIPPED += 1
                    print(f"Entry MISSING for pageID {pageID}")
            else:
                SKIPPED += 1
                print(f"Page MISSING for pageID {pageID}")

    print(f"Chunk done - Skipped {SKIPPED}")
    end_full = time.time()

    print(f"Time elapsed FULL: {end_full - start_full} seconds\n\tNO PARSE: {(end_full - start_full) - parsing_time} seconds")
    return abstracts


def flush(movie_ids):

    abstracts = get_clean_text(movie_ids)

    start = time.time()
    with open(output_csv, "a", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])

        for id, text in abstracts.items():
            writer.writerow({"subject": id, "text": text})
    end = time.time()

    print(f"Time elapsed WRITE: {end - start} seconds")


def reconcile() -> int:

    start = time.time()
    input_file = open(input_csv, "r", newline="", encoding="utf-8")
    output_file = open(output_csv, "r", newline="", encoding="utf-8")

    next(input_file)
    LAST_CHECKED = output_file.readlines()[-1].split(",")[0]
    current_check = input_file.readline().split(",")[1]

    index = 1

    while current_check != LAST_CHECKED:
        current_check = input_file.readline().split(",")[1].replace("\n", "")
        index += 1

    input_file.close()
    output_file.close()
    end = time.time()

    print(f"Time elapsed RECONCILE: {end - start} seconds")

    print(f"FOUND, we need to skip {index} lines")

    return index


if not Path(output_csv).is_file():
    # Initialize output CSV
    with open(output_csv, "w", newline="", encoding="utf-8") as f_out:
        writer = csv.DictWriter(f_out, fieldnames=["subject", "text"])
        writer.writeheader()


SKIP = reconcile()


# Read CSV in RAM
with open(input_csv, "r", newline="", encoding="utf-8") as input:

    # Skip already done
    for i in range(0, SKIP):
        next(input)

    reader = csv.reader(input)

    index = -1
    movie_ids = []

    for line in reader:

        index += 1

        if index == 0:
            continue

        # Save movies in map
        movie_ids.append(line[1])

        if index % CHUNK == 0:

            # Flush movies
            flush(movie_ids)
            movie_ids = []
@@ -1,65 +0,0 @@
CREATE TABLE IF NOT EXISTS Movies (
    MovieID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS WikiPageIDs (
    MovieID INTEGER PRIMARY KEY,
    PageID INTEGER UNIQUE NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

CREATE TABLE IF NOT EXISTS WikipediaAbstracts (
    MovieID INTEGER PRIMARY KEY,
    Abstract TEXT NOT NULL,
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID)
);

CREATE TABLE IF NOT EXISTS Origins (
    OriginID INTEGER PRIMARY KEY AUTOINCREMENT,
    OriginName TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Subjects (
    SubjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    SubjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS Relationships (
    RelationshipID INTEGER PRIMARY KEY AUTOINCREMENT,
    RelationshipURI TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Objects (
    ObjectID INTEGER PRIMARY KEY AUTOINCREMENT,
    ObjectURI TEXT UNIQUE NOT NULL,
    OriginID BIGINT NOT NULL,
    FOREIGN KEY(OriginID) REFERENCES Origins(OriginID)
);

CREATE TABLE IF NOT EXISTS RDFs (
    RDF_ID INTEGER PRIMARY KEY AUTOINCREMENT,
    MovieID INTEGER NOT NULL,
    SubjectID INTEGER NOT NULL,
    RelationshipID INTEGER NOT NULL,
    ObjectID INTEGER NOT NULL,
    UNIQUE(SubjectID, RelationshipID, ObjectID),
    FOREIGN KEY(MovieID) REFERENCES Movies(MovieID),
    FOREIGN KEY(SubjectID) REFERENCES Subjects(SubjectID),
    FOREIGN KEY(RelationshipID) REFERENCES Relationships(RelationshipID),
    FOREIGN KEY(ObjectID) REFERENCES Objects(ObjectID)
);

CREATE INDEX IF NOT EXISTS idx_rdf_movie_id ON RDFs(MovieID);
CREATE INDEX IF NOT EXISTS idx_rdf_subject_id ON RDFs(SubjectID);
CREATE INDEX IF NOT EXISTS idx_rdf_relationship_id ON RDFs(RelationshipID);
CREATE INDEX IF NOT EXISTS idx_rdf_object_id ON RDFs(ObjectID);
@@ -1,35 +0,0 @@
-- Insert MovieURI into Movies ; MovieID is auto incremental
INSERT INTO Movies (MovieURI) VALUES (?);

-- Get MovieID where MovieURI equals the given value
SELECT MovieID FROM Movies WHERE MovieURI = ?;

-- SetPageId
INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);

-- Get MovieId by PageID ... ( to create WikipediaAbstract)
SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;

-- SetAbstract ...
INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);

-- SetOrigin
INSERT INTO Origins (OriginName) VALUES ("dataset.csv"),("reverse.csv");

-- GetOrigin
SELECT OriginID FROM Origins WHERE OriginName = ?;

-- Subject, Relationship, Object, RDF
INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);
INSERT INTO Relationships (RelationshipURI) VALUES (?);
INSERT INTO Objects (ObjectURI, OriginID) VALUES (?,?);

SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;
SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;
SELECT ObjectID FROM Objects WHERE ObjectURI = ?;

INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);
@@ -1,26 +0,0 @@
# HOW THE DATASET IS BUILT AND POPULATED

Note: the data are taken from the CSV files in 1-hop

## CSV files composition

| CSV files         | Original structure                      | Saved AS                             |
|-------------------|-----------------------------------------|--------------------------------------|
| Wikipedia-summary | PageId / abstract                       | subject, text                        |
| Movies            | Movie URI                               | "subject"                            |
| Dataset           | Movie URI / Relationship / Object [RDF] | subject, relationship, object        |
| Movies-PageId     | Movie URI / PageId (wiki)               | "subject", "object"                  |
| Reverse           | Subject / Relationship / Movie URI      | "subject", "relationship", "object"  |

## Wanted tables schema

| Table         | Columns                                                                        |
|---------------|--------------------------------------------------------------------------------|
| Movies        | MovieID [PK], Movie URI                                                        |
| WikiPageIDs   | MovieID [PK, FK], PageId [IDX] (wiki) *(Not important for now)*                |
| Abstracts     | MovieID [PK, FK], abstract                                                     |
| Subjects      | SubjectID [PK], RDF Subject (from Dataset.csv or Reverse.csv), OriginID [FK]   |
| Relationships | RelationshipID [PK], RDF Relationship (value only, not the actual relation)    |
| Objects       | ObjectID [PK], RDF Object, OriginID [FK]                                       |
| Origins       | OriginID [PK], Origin Name                                                     |
| RDFs          | RDF_ID [PK], MovieID [FK], SubjectID [FK], RelationshipID [FK], ObjectID [FK]  |
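To make the star schema above concrete, here is a small illustrative sketch (not a file from the repository) of how the populated SQLite warehouse could be queried to rebuild full URI triples for one movie. The joins follow the tables listed above; the example movie URI is made up.

```python
import sqlite3

DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"

# Rebuild (subject URI, relationship URI, object URI) triples for one movie
# by resolving every ID column in RDFs back to its URI table.
TRIPLES_FOR_MOVIE = """
SELECT s.SubjectURI, r.RelationshipURI, o.ObjectURI
FROM RDFs AS rdf
JOIN Movies        AS m ON m.MovieID        = rdf.MovieID
JOIN Subjects      AS s ON s.SubjectID      = rdf.SubjectID
JOIN Relationships AS r ON r.RelationshipID = rdf.RelationshipID
JOIN Objects       AS o ON o.ObjectID       = rdf.ObjectID
WHERE m.MovieURI = ?;
"""

conn = sqlite3.connect(DB_NAME)
curs = conn.cursor()

# Example URI only; any MovieURI stored in the Movies table works here.
for subject, relationship, obj in curs.execute(
    TRIPLES_FOR_MOVIE, ["http://dbpedia.org/resource/Example_Film"]
):
    print(subject, relationship, obj)

conn.close()
```

This is exactly the normalisation that the population script below performs in reverse: URIs go in once, and the RDFs table only stores the integer IDs.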
@@ -1,375 +0,0 @@
import sqlite3
import csv

#####################################################################
# This file builds DatawareHouse/dataset.db from the 1-hop CSV files
# Its schema is in ./SQL_Queries/db_creation.sql
# The SQL queries used to populate it are in ./SQL_Queries/query.sql
#####################################################################

# Sometimes you may need to build a new db file; here is a little snippet for you:
# sqlite3 ./Assets/Dataset/Tmp/dataset.db < ./Scripts/DataCleaning/SQL_Queries/db_creation.sql

# --- Global configuration ---
DB_NAME = "./Assets/Dataset/DatawareHouse/dataset.db"
MOVIES_CSV = "./Assets/Dataset/1-hop/movies.csv"
PAGEID_CSV = "./Assets/Dataset/1-hop/movie-pageid.csv"
SUMMARY_CSV = "./Assets/Dataset/1-hop/wikipedia-summary.csv"
DATASET_CSV = "./Assets/Dataset/1-hop/dataset.csv"
REVERSE_CSV = "./Assets/Dataset/1-hop/reverse.csv"

MOVIES_CSV_HANDLER = open(MOVIES_CSV, "r", newline='', encoding="utf-8")
PAGEID_CSV_HANDLER = open(PAGEID_CSV, "r", newline='', encoding="utf-8")
SUMMARY_CSV_HANDLER = open(SUMMARY_CSV, "r", newline='', encoding="utf-8")
DATASET_CSV_HANDLER = open(DATASET_CSV, "r", newline='', encoding="utf-8")
REVERSE_CSV_HANDLER = open(REVERSE_CSV, "r", newline='', encoding="utf-8")

CONN = sqlite3.connect(DB_NAME)
CURS = CONN.cursor()

# MARK: SQL Definitions
# Insert MovieURI

def insertOrigin(curs: sqlite3.Cursor) -> bool:
    QUERY = "INSERT INTO Origins (OriginName) VALUES ('dataset.csv'),('reverse.csv');"
    try:
        curs.execute(QUERY)
        return True
    except sqlite3.IntegrityError:
        return False

def selectOrigin(curs: sqlite3.Cursor, originName: str) -> int | None:
    QUERY = "SELECT OriginID FROM Origins WHERE OriginName = ?;"
    curs.execute(QUERY, [originName])
    originId = curs.fetchone()
    if not originId:
        return None
    # in this case the real id is the first element of the tuple
    return originId[0]

def insertMovie(curs: sqlite3.Cursor, movieUri: str) -> bool:
    QUERY = "INSERT INTO Movies (MovieURI) VALUES (?);"
    try:
        curs.execute(QUERY, [movieUri])
        return True
    except sqlite3.IntegrityError:
        return False

def selectMovieId(curs: sqlite3.Cursor, movieUri: str) -> int | None:
    QUERY = "SELECT MovieID FROM Movies WHERE MovieURI = ?;"
    curs.execute(QUERY, [movieUri])
    movieId = curs.fetchone()
    if not movieId:
        return None
    # in this case the real id is the first element of the tuple
    return movieId[0]

def insertWikiPageId(curs: sqlite3.Cursor, movieId: int, pageId: int) -> bool:
    QUERY = "INSERT INTO WikiPageIDs (MovieID, PageID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, pageId])
        return True
    except sqlite3.IntegrityError:
        return False

def selectMovieIdFromWikiPageId(curs: sqlite3.Cursor, pageId: int) -> int | None:
    QUERY = "SELECT MovieID FROM WikiPageIDs WHERE PageID = ?;"
    curs.execute(QUERY, [pageId])
    movieId = curs.fetchone()
    if not movieId:
        return None
    # in this case the real id is the first element of the tuple
    return movieId[0]

def insertWikiAbstract(curs: sqlite3.Cursor, movieId: int, abstract: str) -> bool:
    QUERY = "INSERT INTO WikipediaAbstracts (MovieID, Abstract) VALUES (?,?);"
    try:
        curs.execute(QUERY, [movieId, abstract])
        return True
    except sqlite3.IntegrityError:
        return False

def insertSubject(curs: sqlite3.Cursor, subjectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO Subjects (SubjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [subjectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False

def insertRelationship(curs: sqlite3.Cursor, relationshipURI: str) -> bool:
    QUERY = "INSERT INTO Relationships (RelationshipURI) VALUES (?);"
    try:
        curs.execute(QUERY, [relationshipURI])
        return True
    except sqlite3.IntegrityError:
        return False

def insertObject(curs: sqlite3.Cursor, objectURI: str, originID: int) -> bool:
    QUERY = "INSERT INTO objects (ObjectURI, OriginID) VALUES (?,?);"
    try:
        curs.execute(QUERY, [objectURI, originID])
        return True
    except sqlite3.IntegrityError:
        return False

def selectSubjectId(curs: sqlite3.Cursor, subjectURI: str) -> int | None:
    QUERY = "SELECT SubjectID FROM Subjects WHERE SubjectURI = ?;"
    curs.execute(QUERY, [subjectURI])
    subjectId = curs.fetchone()
    if not subjectId:
        return None
    # in this case the real id is the first element of the tuple
    return subjectId[0]

def selectRelationshipId(curs: sqlite3.Cursor, relationshipURI: str) -> int | None:
    QUERY = "SELECT RelationshipID FROM Relationships WHERE RelationshipURI = ?;"
    curs.execute(QUERY, [relationshipURI])
    relationshipId = curs.fetchone()
    if not relationshipId:
        return None
    # in this case the real id is the first element of the tuple
    return relationshipId[0]

def selectObjectId(curs: sqlite3.Cursor, objectURI: str) -> int | None:
    QUERY = "SELECT ObjectID FROM Objects WHERE ObjectURI = ?;"
    curs.execute(QUERY, [objectURI])
    objectId = curs.fetchone()
    if not objectId:
        return None
    # in this case the real id is the first element of the tuple
    return objectId[0]

def insertRDF(
    curs: sqlite3.Cursor,
    movieId: int,
    subjectId: int,
    relationshipId: int,
    objectId: int
) -> bool:
    QUERY = "INSERT INTO RDFs (MovieID, SubjectID, RelationshipID, ObjectID) VALUES (?,?,?,?);"
    try:
        curs.execute(QUERY, [movieId, subjectId, relationshipId, objectId])
        return True
    except sqlite3.IntegrityError:
        return False

# MARK: Parsing
def parseMovies():
    CSV_READER = csv.reader(MOVIES_CSV_HANDLER)
    next(CSV_READER)
    for row in CSV_READER:
        MOVIE = row[0]
        insertMovie(CURS, MOVIE)

def parseWikiPageId():
    CSV_READER = csv.DictReader(PAGEID_CSV_HANDLER)
    for row in CSV_READER:
        MOVIE_URI = row["subject"]
        WIKI_PAGE_ID = int(row["object"])
        MOVIE_ID = selectMovieId(CURS, MOVIE_URI)

        if MOVIE_ID is None:
            print(f"The MovieUri: {MOVIE_URI} has not a MovieId ")
            continue

        insertWikiPageId(CURS, MOVIE_ID, WIKI_PAGE_ID)

def parseAbstract():
    CSV_READER = csv.DictReader(SUMMARY_CSV_HANDLER)
    for row in CSV_READER:
        WIKI_PAGE_ID = int(row["subject"])
        ABSTRACT = row["text"]
        MOVIE_ID = selectMovieIdFromWikiPageId(CURS, WIKI_PAGE_ID)

        if MOVIE_ID is None:
            print(f"The WikiPageId: {WIKI_PAGE_ID} has not a MovieId ")
            continue

        insertWikiAbstract(CURS, MOVIE_ID, ABSTRACT)

def parseRDF_Reverse():
    REVERSE_CSV_READER = csv.DictReader(REVERSE_CSV_HANDLER)
    REVERSE_ORIGIN_ID = selectOrigin(CURS, 'reverse.csv')
    total = 0

    for row in REVERSE_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]
        print(f"RDF triplets:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")
        insertSubject(CURS, SUBJECT, REVERSE_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, REVERSE_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, OBJECT)

        skip = False

        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if MOVIE_ID is None:
            print(f"No MovieId for {OBJECT}")
            skip = True

        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1

    print(total)

def parseRDF_Dataset():
    DATASET_CSV_READER = csv.DictReader(DATASET_CSV_HANDLER)
    DATASET_ORIGIN_ID = selectOrigin(CURS, 'dataset.csv')

    total = 0
    rdf_idx = 0
    for row in DATASET_CSV_READER:
        SUBJECT = row["subject"]
        RELATIONSHIP = row["relationship"]
        OBJECT = row["object"]

        rdf_idx += 1

        if rdf_idx % 100000 == 0:
            print(f"RDF number {rdf_idx}:\n\t{SUBJECT} - {RELATIONSHIP} - {OBJECT}")

        insertSubject(CURS, SUBJECT, DATASET_ORIGIN_ID)
        insertRelationship(CURS, RELATIONSHIP)
        insertObject(CURS, OBJECT, DATASET_ORIGIN_ID)

        SUBJECT_ID = selectSubjectId(CURS, SUBJECT)
        OBJECT_ID = selectObjectId(CURS, OBJECT)
        RELATIONSHIP_ID = selectRelationshipId(CURS, RELATIONSHIP)
        MOVIE_ID = selectMovieId(CURS, SUBJECT)

        skip = False

        # guard
        if SUBJECT_ID is None:
            print(f"No SubjectId for {SUBJECT}")
            skip = True

        if OBJECT_ID is None:
            print(f"No ObjectId for {OBJECT}")
            skip = True

        if RELATIONSHIP_ID is None:
            print(f"No RelationshipId for {RELATIONSHIP}")
            skip = True

        if MOVIE_ID is None:
            print(f"No MovieId for {SUBJECT}")
            skip = True

        if skip:
            continue

        if insertRDF(CURS, MOVIE_ID, SUBJECT_ID, RELATIONSHIP_ID, OBJECT_ID):
            total += 1

    print(total)

# MARK: Actual Code
# parseMovies()
# parseWikiPageId()
# parseAbstract()
# insertOrigin(CURS)
# parseRDF_Reverse()
# parseRDF_Dataset()

CONN.commit()
CONN.close()

MOVIES_CSV_HANDLER.close()
PAGEID_CSV_HANDLER.close()
SUMMARY_CSV_HANDLER.close()
DATASET_CSV_HANDLER.close()
REVERSE_CSV_HANDLER.close()

"""
The MovieUri: http://dbpedia.org/resource/1%25_(film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/10%25:_What_Makes_a_Hero%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Arabica has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Kadhal has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2011_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Love_(2012_film) has not a MovieId
The MovieUri: http://dbpedia.org/resource/100%25_Wolf has not a MovieId
The MovieUri: http://dbpedia.org/resource/Who_the_$&%25_Is_Jackson_Pollock%3F has not a MovieId
The MovieUri: http://dbpedia.org/resource/99%25:_The_Occupy_Wall_Street_Collaborative_Film has not a MovieId
The MovieUri: http://dbpedia.org/resource/99_and_44/100%25_Dead has not a MovieId
The MovieUri: http://dbpedia.org/resource/Postcards_from_the_48%25 has not a MovieId
The MovieUri: http://dbpedia.org/resource/Wool_100%25 has not a MovieId
"""

"""
The WikiPageId: 10068850 has not a MovieId
The WikiPageId: 55069615 has not a MovieId
The WikiPageId: 49510056 has not a MovieId
The WikiPageId: 4049786 has not a MovieId
The WikiPageId: 55510238 has not a MovieId
The WikiPageId: 31239628 has not a MovieId
The WikiPageId: 34757217 has not a MovieId
The WikiPageId: 64311757 has not a MovieId
The WikiPageId: 8326198 has not a MovieId
The WikiPageId: 42162164 has not a MovieId
The WikiPageId: 18502369 has not a MovieId
The WikiPageId: 58092358 has not a MovieId
The WikiPageId: 40710250 has not a MovieId
"""
docs/DBPEDIA.md: 215 lines removed

@@ -1,215 +0,0 @@
# DBPedia

## GraphIRI

This is the graph identifier (URI):

`http://dbpedia.org`

## History of queries

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    {
        SELECT ?object
        WHERE {
            ?m rdf:type dbo:Film .
            ?object ?r ?m
        }
    }
}
```

### 2 Hops

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
    {
        SELECT ?object
        WHERE {
            ?m rdf:type dbo:Film .
            ?object ?r ?m
            FILTER (?r != <http://dbpedia.org/ontology/wikiPageWikiLink>)
        }
    }
}
LIMIT 1000000
```

### 1 Hop

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    FILTER (?relationship != <http://dbpedia.org/ontology/wikiPageWikiLink>)
}
LIMIT 1000000
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>

SELECT ?subject
WHERE {
    ?subject rdf:type dbo:Film .
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject
WHERE {
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

#### Wikipedia-movie

a.k.a. the file with the Wikipedia abstract

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject , ?object
WHERE {
    ?subject foaf:primaryTopic ?object .
    ?object rdf:type dbo:Film
}
```

#### Reverse

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    ?a foaf:primaryTopic ?object
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?object rdf:type dbo:Film .
    ?a foaf:primaryTopic ?object
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink,
        foaf:primaryTopic
    ))
}
```

#### Film \ wiki page ID

```SQL
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?subject ?pageID
WHERE {
    ?subject rdf:type dbo:Film .
    ?subject dbo:wikiPageID ?pageID .
    ?subject rdfs:label ?label .
    FILTER (lang(?label) = "en")
}
```
@@ -1,3 +0,0 @@
# Development

## Data Gathering
@@ -1,108 +0,0 @@
# Resources

## Byte-Pair Encoding (BPE)

### Overview

Byte-Pair Encoding (BPE) is a simple but powerful text compression and tokenization algorithm.
Originally introduced as a data compression method, it has been widely adopted in **Natural Language Processing (NLP)** to build subword vocabularies for models such as GPT and BERT.

---

### Key Idea

BPE works by iteratively replacing the most frequent pair of symbols (initially characters) with a new symbol.
Over time, frequent character sequences (e.g., common morphemes, prefixes, suffixes) are merged into single tokens.

---

### Algorithm Steps

1. **Initialization**
   - Treat each character of the input text as a token.

2. **Find Frequent Pairs**
   - Count all adjacent token pairs in the sequence.

3. **Merge Most Frequent Pair**
   - Replace the most frequent pair with a new symbol not used in the text.

4. **Repeat**
   - Continue until no frequent pairs remain or a desired vocabulary size is reached.

---
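The four steps above are short enough to sketch directly. The following is a minimal, self-contained illustration of the merge loop; it is not the tokenizer used in this project, and the function names are invented for the example.

```python
from collections import Counter

def most_frequent_pair(tokens: list[str]) -> tuple[str, str] | None:
    # Step 2: count all adjacent token pairs.
    pairs = Counter(zip(tokens, tokens[1:]))
    if not pairs:
        return None
    pair, count = pairs.most_common(1)[0]
    return pair if count > 1 else None

def bpe_merge(text: str, max_merges: int = 10) -> tuple[list[str], list[tuple[str, str]]]:
    # Step 1: start from single characters.
    tokens = list(text)
    merges: list[tuple[str, str]] = []
    for _ in range(max_merges):
        pair = most_frequent_pair(tokens)
        if pair is None:          # Step 4: stop when no pair repeats.
            break
        merges.append(pair)
        merged, i = [], 0
        while i < len(tokens):    # Step 3: fuse every occurrence of the pair.
            if i + 1 < len(tokens) and (tokens[i], tokens[i + 1]) == pair:
                merged.append(tokens[i] + tokens[i + 1])
                i += 2
            else:
                merged.append(tokens[i])
                i += 1
        tokens = merged
    return tokens, merges

print(bpe_merge("aaabdaaabac"))
# (['aaab', 'd', 'aaab', 'a', 'c'], [('a', 'a'), ('aa', 'a'), ('aaa', 'b')])
```

Run on the string used in the worked example below, this yields the segmentation `['aaab', 'd', 'aaab', 'a', 'c']`, which matches the final `XdXac` result (with `X = aaab`), even though the intermediate merges are expressed as fused strings instead of fresh symbols.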
### Example

Suppose the data to be encoded is:

```text
aaabdaaabac
```

#### Step 1: Merge `"aa"`

Most frequent pair: `"aa"` → replace with `"Z"`

```text
ZabdZabac
Z = aa
```

---

#### Step 2: Merge `"ab"`

Most frequent pair: `"ab"` → replace with `"Y"`

```text
ZYdZYac
Y = ab
Z = aa
```

---

#### Step 3: Merge `"ZY"`

Most frequent pair: `"ZY"` → replace with `"X"`

```text
XdXac
X = ZY
Y = ab
Z = aa
```

---

At this point, no pairs occur more than once, so the process stops.

---

### Decompression

To recover the original data, replacements are applied in **reverse order**:

```text
XdXac
→ ZYdZYac
→ ZabdZabac
→ aaabdaaabac
```

---

### Advantages

- **Efficient vocabulary building**: reduces the need for massive word lists.
- **Handles rare words**: breaks them into meaningful subword units.
- **Balances character- and word-level tokenization**.

---

### Limitations

- Does not consider linguistic meaning; merges are frequency-based.
- May create tokens that are not linguistically natural.
- Vocabulary is fixed after training.
@@ -1,67 +0,0 @@
# SparQL

> [!NOTE]
> Resources taken from [this website](https://sparql.dev/)

## SQL Queries

### SELECT

```SQL
SELECT ?var1, ?var2, ...
```

### WHERE

```SQL
WHERE {
    pattern1 .
    pattern2 .
    ...
}
```

### FILTER

It is used to restrict [`WHERE`](#where) clauses

```SQL
WHERE {
    ?person <http://example.com/hasCar> ?car .
    FILTER (?car = <http://example.com/Car1>)
}
```

### OPTIONAL

It is used to fetch additional content if it exists

```SQL
SELECT ?person ?car
WHERE {
    ?person <http://example.com/hasCar> ?car .
    OPTIONAL {
        ?car <http://example.com/hasColor> ?color .
    }
}
```

### LIMIT

Limits the number of results

```SQL
LIMIT 10 -- Take only 10 results
```

## SparQL functions

### COUNT

```SQL
SELECT (COUNT(?person) AS ?count)
WHERE {
    ?person <http://example.com/hasCar> ?car .
}
```
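The project's data-gathering script pages through DBpedia with the SPARQLWrapper package (listed in requirements.txt). As a rough usage sketch, one of the patterns above can be run against the public endpoint like this; the query text and result handling are illustrative only, not the exact ones used in the scripts.

```python
import SPARQLWrapper

ENDPOINT = "https://dbpedia.org/sparql"

# Illustrative query: count DBpedia films (same shape as the COUNT pattern above).
QUERY = """
PREFIX dbo: <http://dbpedia.org/ontology/>

SELECT (COUNT(?subject) AS ?count)
WHERE {
    ?subject rdf:type dbo:Film .
}
"""

sparql = SPARQLWrapper.SPARQLWrapper(ENDPOINT)
sparql.setQuery(QUERY)
sparql.setReturnFormat(SPARQLWrapper.JSON)

# queryAndConvert returns the standard SPARQL JSON results as a dict.
result = sparql.queryAndConvert()
for row in result["results"]["bindings"]:
    print(row["count"]["value"])
```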
BIN  environment.yaml: binary file not shown
@@ -1,17 +0,0 @@
certifi==2025.8.3
charset-normalizer==3.4.3
idna==3.10
numpy==2.3.3
pandas==2.3.2
pyparsing==3.2.4
python-dateutil==2.9.0.post0
pytz==2025.2
rdflib==7.1.4
requests==2.32.5
setuptools==78.1.1
six==1.17.0
SPARQLWrapper==2.0.0
tzdata==2025.2
urllib3==2.5.0
wheel==0.45.1
Wikipedia-API==0.8.1