def get_RDF(conn: "sqlite3.Connection | None" = None) -> pd.DataFrame:
    """Load the RDF triples joined with their subject, object and relationship rows.

    Reads the ``Subjects``, ``Objects``, ``Relationships`` and ``RDFs`` tables
    in full, turns empty strings in the three component tables into NaN, and
    inner-joins ``RDFs`` with each component on ``SubjectID``, ``ObjectID``
    and ``RelationshipID`` respectively.

    Args:
        conn: Open SQLite connection to read from. When omitted, the
            module-level ``CONN`` is used, which keeps existing
            ``get_RDF()`` calls working unchanged.

    Returns:
        A DataFrame with exactly the columns ``SubjectURI``,
        ``RelationshipURI``, ``ObjectURI`` and ``MovieID``. Rows whose
        URI was an empty string are kept, with NaN in that column.
    """
    if conn is None:
        conn = CONN

    subjects = pd.read_sql_query('SELECT * FROM Subjects;', conn)
    objects = pd.read_sql_query('SELECT * FROM Objects;', conn)
    relationships = pd.read_sql_query('SELECT * FROM Relationships;', conn)
    rdf = pd.read_sql_query('SELECT * FROM RDFs;', conn)

    # Mark '' values as missing. They are NOT dropped here: the caller decides
    # whether to call dropna() on the result.
    subjects = subjects.replace('', np.nan)
    relationships = relationships.replace('', np.nan)
    objects = objects.replace('', np.nan)

    # Join RDF with its components (inner joins on the shared ID columns).
    rdf = (
        rdf.merge(subjects, on="SubjectID")
        .merge(objects, on="ObjectID")
        .merge(relationships, on="RelationshipID")
    )
    return rdf[["SubjectURI", "RelationshipURI", "ObjectURI", "MovieID"]]
def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:
    """Return the rows of ``RDF`` whose ``RelationshipURI`` is not ``uri``.

    Args:
        RDF: Frame with a ``RelationshipURI`` column.
        uri: Relationship URI to remove.

    Returns:
        The filtered frame; ``RDF`` itself is not modified. Rows with a
        missing (NaN) ``RelationshipURI`` compare unequal to ``uri`` and are
        therefore kept.
    """
    return RDF[RDF["RelationshipURI"] != uri]


def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:
    """Keep only rows whose relationship occurs at least ``count_treshold`` times.

    Args:
        RDF: Frame with a ``RelationshipURI`` column.
        count_treshold: Minimum (inclusive) number of rows a relationship
            must have in ``RDF`` for its rows to be kept.

    Returns:
        A new frame with the surviving rows and an extra
        ``RelationshipFreq`` column holding each row's relationship count.
        Rows with a missing ``RelationshipURI`` get no count and are dropped.
    """
    # value_counts() yields a Series mapping relationship -> number of rows.
    counts = RDF["RelationshipURI"].value_counts()
    freq = RDF["RelationshipURI"].map(counts)

    # assign() works on a copy, so the caller's frame is left untouched and no
    # SettingWithCopyWarning is raised when RDF is itself a filtered slice.
    return RDF.assign(RelationshipFreq=freq).loc[freq >= count_treshold]
\n", + "12725504 http://dbpedia.org/resource/I_Won't_Play \n", + "\n", + " RelationshipURI \\\n", + "0 http://www.w3.org/2002/07/owl#differentFrom \n", + "1 http://www.w3.org/2002/07/owl#differentFrom \n", + "2 http://www.w3.org/2002/07/owl#differentFrom \n", + "3 http://www.w3.org/2002/07/owl#differentFrom \n", + "4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n", + "... ... \n", + "12725500 http://dbpedia.org/ontology/producer \n", + "12725501 http://dbpedia.org/ontology/producer \n", + "12725502 http://dbpedia.org/ontology/producer \n", + "12725503 http://dbpedia.org/ontology/producer \n", + "12725504 http://dbpedia.org/ontology/producer \n", + "\n", + " ObjectURI MovieID \\\n", + "0 http://dbpedia.org/resource/Cabiria 26 \n", + "1 http://dbpedia.org/resource/California_Academy... 185 \n", + "2 http://dbpedia.org/resource/Captain_China 614 \n", + "3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n", + "4 http://dbpedia.org/resource/Captain_America:_C... 594 \n", + "... ... ... \n", + "12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n", + "12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n", + "12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n", + "12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n", + "12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n", + "\n", + " RelationshipFreq MovieFreq \n", + "0 2132 216 \n", + "1 2132 264 \n", + "2 2132 66 \n", + "3 2132 131 \n", + "4 1653 133 \n", + "... ... ... 
def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:
    """Keep only rows whose movie has a row count inside ``[min, max)``.

    Args:
        RDF: Frame with a ``MovieID`` column.
        min_treshold: Minimum (inclusive) number of rows a movie must have.
        max_treshold: Maximum (exclusive) number of rows a movie may have.

    Returns:
        A new frame with the surviving rows and an extra ``MovieFreq`` column
        holding the number of rows in ``RDF`` that share the row's
        ``MovieID``. Rows with a missing ``MovieID`` get no count and are
        dropped.
    """
    # value_counts() yields a Series mapping MovieID -> number of rows.
    counts = RDF["MovieID"].value_counts()
    freq = RDF["MovieID"].map(counts)

    # Half-open interval: the lower bound is inclusive, the upper exclusive.
    keep = (freq >= min_treshold) & (freq < max_treshold)

    # assign() works on a copy, so the caller's frame is left untouched and no
    # SettingWithCopyWarning is raised when RDF is itself a filtered slice.
    return RDF.assign(MovieFreq=freq).loc[keep]