
import argparse
from math import floor
import sys
from time import sleep
import SPARQLWrapper
class ProgramData:
    """Runtime configuration for the SPARQL fetcher.

    Bundles every CLI-derived setting (output location, query file,
    endpoint URL, paging parameters, verbosity) so the fetch loop can
    receive them as one object.
    """

    def __init__(
        self,
        local_url,
        query_url,
        sparql_url,
        output_type,
        initial_offset,
        timeout,
        limit,
        max_pages,
        verbosity_level,
    ) -> None:
        # Destination file that fetched pages are appended to.
        self.local_url = local_url
        # Path of the file holding the SPARQL query text.
        self.query_url = query_url
        # SPARQL endpoint to query.
        self.sparql_url = sparql_url
        # Requested result serialization (e.g. SPARQLWrapper.CSV).
        self.output_type = output_type
        # Row offset to start fetching from.
        self.initial_offset = initial_offset
        # Seconds to pause between page requests.
        self.timeout = timeout
        # Page size (rows per request).
        self.limit = limit
        # Upper bound on the number of pages fetched.
        self.max_pages = max_pages
        # Count of -v flags supplied on the command line.
        self.verbosity_level = verbosity_level

    @property
    def offset(self):
        # NOTE(review): this deliberately returns the page size, not
        # ``initial_offset``; the paging arithmetic in the fetch loop
        # compensates with a page index starting at -1 — confirm intent
        # before "fixing".
        return self.limit

    @property
    def query(self):
        """Return the SPARQL query text read from ``query_url``."""
        with open(self.query_url) as handle:
            return handle.read()
# Default SPARQL endpoint queried when --url is not given.
DBPEDIA_URL = "https://dbpedia.org/sparql"
# Default result serialization requested from the endpoint.
TYPE = SPARQLWrapper.CSV
# Default pause between successive page requests, in seconds.
TIMEOUT_SECONDS = 1.5
# Default page size (rows per request).
LIMIT = int(1E4)
# Default row offset to start fetching from.
INITIAL_OFFSET = 0
# Default upper bound on the number of pages fetched.
MAX_PAGES = int(1E9)
def gather_cli_args(args: list[str]) -> ProgramData:
    """Parse command-line arguments into a :class:`ProgramData`.

    Parameters
    ----------
    args:
        Argument list to parse (``sys.argv[1:]``-style). Unknown
        arguments are silently ignored via ``parse_known_args``.

    Returns
    -------
    ProgramData
        The parsed (or default) fetcher configuration.
    """
    parser = argparse.ArgumentParser("sparql data fetcher")
    parser.add_argument("--file-path", "--file", "--output", "-o", required=True, type=str)
    parser.add_argument("--query-file", "--query", "-q", required=True, type=str)
    parser.add_argument("--url", type=str, default=DBPEDIA_URL)
    # Result serialization is now configurable (was hardcoded to
    # SPARQLWrapper.CSV, ignoring the TYPE constant and the old TODO).
    parser.add_argument("--type", type=str, default=TYPE)
    parser.add_argument("--limit", type=int, default=LIMIT)
    parser.add_argument("--timeout", type=float, default=TIMEOUT_SECONDS)
    parser.add_argument("--offset", type=int, default=INITIAL_OFFSET)
    parser.add_argument("--max-pages", type=int, default=MAX_PAGES)
    parser.add_argument("--verbose", "-v", action="count", default=0)
    parsed_args, _ = parser.parse_known_args(args)
    return ProgramData(
        parsed_args.file_path,
        parsed_args.query_file,
        parsed_args.url,
        parsed_args.type,
        parsed_args.offset,
        parsed_args.timeout,
        parsed_args.limit,
        parsed_args.max_pages,
        parsed_args.verbose,
    )
def fetch_data(DATA: ProgramData):
    """Page through the SPARQL endpoint, appending results to disk.

    Repeatedly issues ``DATA.query`` with LIMIT/OFFSET clauses and
    appends each page of results to ``DATA.local_url`` until an empty
    page arrives or the page cap is hit. Failed requests are logged and
    skipped (best-effort), then fetching continues with the next page.
    """
    # First page index. DATA.offset equals DATA.limit (see ProgramData),
    # so the effective row offset below is (page + 1) * DATA.limit and
    # the loop really starts at DATA.initial_offset rounded down to a
    # page boundary.
    page = int(floor(DATA.initial_offset / DATA.limit)) - 1
    # Hoisted out of the loop: endpoint and return format never change.
    sparql = SPARQLWrapper.SPARQLWrapper(DATA.sparql_url)
    sparql.setReturnFormat(DATA.output_type)
    # Tracks whether anything has been written yet, so the CSV header is
    # kept exactly once. (The old `page != 0` test stripped the header
    # from the FIRST page — page starts at -1 — and kept it on the
    # second, landing the header row mid-file.)
    first_write = True
    done = False  # renamed from `exit`, which shadowed the builtin
    while not done:
        print(f"Starting to get page {page}")
        current_offset = int(DATA.offset + (page * DATA.limit))
        # Was f"LIMIT {LIMIT}": using the module global ignored the
        # --limit CLI value and desynchronized the query's page size
        # from the offset arithmetic above.
        current_query = "\n".join([
            DATA.query,
            f"LIMIT {DATA.limit}",
            f"OFFSET {current_offset}",
        ])
        print(f"\nCurrent Query:\n{current_query}\n")
        sparql.setQuery(current_query)
        try:
            res = sparql.queryAndConvert()
            text = ""
            if isinstance(res, bytes):
                lines = res.decode("utf-8", "ignore").split("\n")
                # Drop the header line on every page after the first
                # successful write.
                skip = 0 if first_write else 1
                text = "\n".join(lines[skip:])
            if text == "":
                # Empty page means the result set is exhausted.
                done = True
                continue
            with open(DATA.local_url, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {DATA.local_url}")
                dataset.write(text)
            first_write = False
        except Exception as ex:
            # Best-effort: log the failure and move on to the next page.
            print(f"Something went wrong during page {page}:\n\t{ex}")
            print(f"Sleeping for {DATA.timeout}")
        page += 1
        # Was `page == MAX_PAGES - 1` with the module global: the CLI
        # --max-pages value was ignored, and strict equality could be
        # skipped entirely when starting past the cap.
        if page >= DATA.max_pages - 1:
            done = True
        # Was the TIMEOUT_SECONDS global; now honors --timeout.
        sleep(DATA.timeout)
if __name__ == "__main__":
    # argv[0] is the script name, not an argument; pass only real args
    # (previously the full sys.argv was forwarded and argv[0] was only
    # tolerated because parse_known_args discards unknowns).
    DATA = gather_cli_args(sys.argv[1:])
    fetch_data(DATA)