Renamed dir from Script to Scripts

Christian Risi
2025-09-19 08:31:00 +02:00
parent c415b175a0
commit ce3d4bf6c5
3 changed files with 0 additions and 0 deletions


@@ -0,0 +1,14 @@
import pandas as pd
# Load the CSV
df = pd.read_csv("./Assets/Dataset/1-hop/reverse.csv")
# Extract the last part of the URL in 'relationship'
df["relationship_short"] = df["relationship"].apply(lambda x: x.split("/")[-1])
# Count occurrences of each unique last part
relationship_counts = df["relationship_short"].value_counts()
# Print the counts
for rel, count in relationship_counts.items():
    print(f"{rel}: {count}")


@@ -0,0 +1,60 @@
import pandas as pd
import wikipediaapi
import csv
import time
import re

# Initialize Wikipedia API with proper user agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent='MovieAbstractScraper/1.0 (gape@example.com)'
)

input_csv = './Assets/Dataset/1-hop/wikipedia-movie.csv'
output_csv = './Assets/Dataset/1-hop/wikipedia-summary.csv'

# Function to get clean full text from Wikipedia URL
def get_clean_text(wiki_url):
    try:
        page_title = wiki_url.rsplit('/', 1)[-1]  # extract page title
        page = wiki_wiki.page(page_title)
        if page.exists():
            text = page.text
            # Remove section headers like == History ==
            text = re.sub(r'==.*?==', '', text)
            # Collapse multiple spaces and newlines into single space
            text = re.sub(r'\s+', ' ', text).strip()
            return text
    except Exception as e:
        print(f"Error fetching {wiki_url}: {e}")
    return ''

# Initialize output CSV
with open(output_csv, 'w', newline='', encoding='utf-8') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
    writer.writeheader()

# Process input CSV in chunks
chunksize = 1
for chunk in pd.read_csv(input_csv, chunksize=chunksize):
    # Clean column names in case of quotes
    chunk.columns = chunk.columns.str.replace('"', '').str.strip()

    rows_to_write = []
    for _, row in chunk.iterrows():
        wikipedia_url = row['subject']  # old subject (Wikipedia URL)
        dbpedia_url = row['object']     # old object (DBpedia URL)

        clean_text = get_clean_text(wikipedia_url)
        rows_to_write.append({
            'subject': dbpedia_url,
            'text': clean_text
        })
        time.sleep(0.1)  # polite delay

    # Append to output CSV
    with open(output_csv, 'a', newline='', encoding='utf-8') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=['subject', 'text'])
        writer.writerows(rows_to_write)

    print(f"Processed {len(chunk)} rows, appended to {output_csv}")

Scripts/fetchdata.py Normal file

@@ -0,0 +1,99 @@
from math import floor
from time import sleep

import SPARQLWrapper
import requests

BASE_URL = "https://dbpedia.org/sparql"
TYPE = SPARQLWrapper.CSV
TIMEOUT_SECONDS = 1.5
LIMIT = int(1E4)
OFFSET = LIMIT
INITIAL_OFFSET = 15200000
MAX_PAGES = int(1E9)

# Missing page 13220000

FILE_URI = "./Assets/Dataset/1-hop/dataset.csv"

QUERY = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbp: <http://dbpedia.org/property/>
PREFIX dbr: <http://dbpedia.org/resource/>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>

SELECT ?subject, ?relationship, ?object
WHERE {
    ?subject ?relationship ?object .
    ?subject rdf:type dbo:Film .
    ?a foaf:primaryTopic ?subject .
    FILTER (?relationship NOT IN (
        dbo:wikiPageRedirects,
        dbo:wikiPageExternalLink,
        dbo:wikiPageWikiLink
    ))
}"""


def main():
    exit = False
    page = int(floor(INITIAL_OFFSET / LIMIT)) - 1

    while not exit:
        print(f"Starting to get page {page}")

        CURRENT_OFFSET = int(OFFSET + (page * LIMIT))

        sparql = SPARQLWrapper.SPARQLWrapper(BASE_URL)
        sparql.setReturnFormat(TYPE)

        CURRENT_PAGE_QUERY = "\n".join([
            QUERY,
            f"LIMIT {LIMIT}",
            f"OFFSET {CURRENT_OFFSET}"
        ])

        print(f"\nCurrent Query:\n{CURRENT_PAGE_QUERY}\n")

        sparql.setQuery(CURRENT_PAGE_QUERY)

        try:
            res = sparql.queryAndConvert()

            text = ""
            if type(res) == bytes:
                # Skip the CSV header line on every page after the first
                initial_offset = 0
                if page != 0:
                    initial_offset = 1

                lines = res.decode("utf-8", "ignore").split("\n")
                text = "\n".join(lines[initial_offset:])

            if text == "":
                exit = True
                continue

            with open(FILE_URI, "a+", encoding="utf-8") as dataset:
                print(f"Writing page {page} on {FILE_URI}")
                dataset.write(text)

        except Exception as ex:
            print(f"Something went wrong during page {page}:\n\t{ex}")

        print(f"Sleeping for {TIMEOUT_SECONDS}")
        page += 1

        if page == MAX_PAGES - 1:
            exit = True

        sleep(TIMEOUT_SECONDS)


main()
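For reference, the resume arithmetic in main() checks out against the constants above (a worked example, not new behaviour):

LIMIT = int(1E4)              # 10000 rows per page
OFFSET = LIMIT                # 10000
INITIAL_OFFSET = 15200000

page = int(INITIAL_OFFSET / LIMIT) - 1     # 1519
current_offset = OFFSET + page * LIMIT     # 10000 + 15190000 = 15200000
assert current_offset == INITIAL_OFFSET    # crawl resumes exactly at INITIAL_OFFSET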