WIP: add test notebook clean_relationship.ipynb

Creates a first version of the cleaning pipeline.
2025-09-25 16:28:24 +02:00
parent 0bc7f4b227
commit 3eec49ffa5
1 changed files with 186 additions and 0 deletions
--- a/Scripts/DataCleaning/clean_relationship.ipynb
+++ b/Scripts/DataCleaning/clean_relationship.ipynb
@@ -0,0 +1,186 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b9081b7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Pipeline step that removes unwanted relationships from the RDF data according to different rules\n",
+    "import pandas as pd\n",
+    "import sqlite3\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
+    "\n",
+    "def get_RDF() -> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    QUERY = \"SELECT * FROM RDFs \" \\\n",
+    "    \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
+    "    \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
+    "    \"INNER JOIN Objects USING (ObjectID);\"\n",
+    "    RDF = pd.read_sql_query(QUERY, CONN)\n",
+    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
+    "    RDF = RDF.dropna()\n",
+    "    \"\"\"\n",
+    "    Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
+    "    Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
+    "    Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
+    "    RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
+    "\n",
+    "    # drop '' values \n",
+    "    Subjects = Subjects.replace('', np.nan)# .dropna()\n",
+    "    Relationships = Relationships.replace('', np.nan)# .dropna()\n",
+    "    Objects = Objects.replace('', np.nan)# .dropna()\n",
+    "\n",
+    "    # join RDF with its components\n",
+    "    RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
+    "    RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
+    "    RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
+    "    RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
+    "    return RDF\n",
+    "\n",
+    "\n",
+    "#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
+    "\n",
+    "def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
+    "    return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
+    "\n",
+    "\n",
+    "\n",
+    "RDF = get_RDF()\n",
+    "# RDF = RDF.dropna()\n",
+    "# print(RDF)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "644690bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
+    "    counts = RDF[\"RelationshipURI\"].value_counts() \n",
+    "    RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
+    "    RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
+    "    # counts is a series as key: relationship, value: count\n",
+    "    # counts = counts[counts > count_treshold]\n",
+    "    # relationships = counts.index\n",
+    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
+    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
+    "    return RDF\n",
+    "\n",
+    "RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
+    "# print(new_RDF)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34525be6",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                                                 SubjectURI  \\\n",
+      "0             http://dbpedia.org/resource/Nights_of_Cabiria   \n",
+      "1         http://dbpedia.org/resource/California_Science...   \n",
+      "2                 http://dbpedia.org/resource/China_Captain   \n",
+      "3         http://dbpedia.org/resource/Caravan_of_Courage...   \n",
+      "4                http://dbpedia.org/resource/WHIH_Newsfront   \n",
+      "...                                                     ...   \n",
+      "12725500   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
+      "12725501   http://dbpedia.org/resource/I_Will_Follow_(film)   \n",
+      "12725502  http://dbpedia.org/resource/I_Witnessed_Genoci...   \n",
+      "12725503  http://dbpedia.org/resource/I_Woke_Up_Early_th...   \n",
+      "12725504           http://dbpedia.org/resource/I_Won't_Play   \n",
+      "\n",
+      "                                       RelationshipURI  \\\n",
+      "0          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "1          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "2          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "3          http://www.w3.org/2002/07/owl#differentFrom   \n",
+      "4         http://www.w3.org/2000/01/rdf-schema#seeAlso   \n",
+      "...                                                ...   \n",
+      "12725500          http://dbpedia.org/ontology/producer   \n",
+      "12725501          http://dbpedia.org/ontology/producer   \n",
+      "12725502          http://dbpedia.org/ontology/producer   \n",
+      "12725503          http://dbpedia.org/ontology/producer   \n",
+      "12725504          http://dbpedia.org/ontology/producer   \n",
+      "\n",
+      "                                                  ObjectURI  MovieID  \\\n",
+      "0                       http://dbpedia.org/resource/Cabiria       26   \n",
+      "1         http://dbpedia.org/resource/California_Academy...      185   \n",
+      "2                 http://dbpedia.org/resource/Captain_China      614   \n",
+      "3         http://dbpedia.org/resource/Caravan_of_Courage...      740   \n",
+      "4         http://dbpedia.org/resource/Captain_America:_C...      594   \n",
+      "...                                                     ...      ...   \n",
+      "12725500           http://dbpedia.org/resource/Ava_DuVernay   145854   \n",
+      "12725501           http://dbpedia.org/resource/Molly_Mayeux   145854   \n",
+      "12725502        http://dbpedia.org/resource/Headlines_Today   145861   \n",
+      "12725503             http://dbpedia.org/resource/Billy_Zane   145862   \n",
+      "12725504    http://dbpedia.org/resource/Gordon_Hollingshead   145864   \n",
+      "\n",
+      "          RelationshipFreq  MovieFreq  \n",
+      "0                     2132        216  \n",
+      "1                     2132        264  \n",
+      "2                     2132         66  \n",
+      "3                     2132        131  \n",
+      "4                     1653        133  \n",
+      "...                    ...        ...  \n",
+      "12725500             80077         95  \n",
+      "12725501             80077         95  \n",
+      "12725502             80077         41  \n",
+      "12725503             80077         98  \n",
+      "12725504             80077         91  \n",
+      "\n",
+      "[12725505 rows x 6 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
+    "    counts = RDF[\"MovieID\"].value_counts() \n",
+    "    RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
+    "    RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
+    "    RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
+    "    # counts is a series as key: relationship, value: count\n",
+    "    # counts = counts[counts > count_treshold]\n",
+    "    # relationships = counts.index\n",
+    "    # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
+    "    # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
+    "    return RDF\n",
+    "\n",
+    "RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
+    "print(RDF)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "deep_learning",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}