WIP: added test file clean_relationship.ipynb
to create a first cleaning pipeline
This commit is contained in:
parent
0bc7f4b227
commit
3eec49ffa5
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
186
Scripts/DataCleaning/clean_relationship.ipynb
Normal file
@ -0,0 +1,186 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b9081b7c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# This notebook removes unwanted relationships from the RDF pipeline according to different rules\n",
|
||||
"import pandas as pd\n",
|
||||
"import sqlite3\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"CONN = sqlite3.connect('../../Assets/Dataset/Tmp/dataset2.db')\n",
|
||||
"\n",
|
||||
"def get_RDF() -> pd.DataFrame:\n",
|
||||
" \"\"\"\n",
|
||||
" QUERY = \"SELECT * FROM RDFs \" \\\n",
|
||||
" \"INNER JOIN Subjects USING (SubjectID) \" \\\n",
|
||||
" \"INNER JOIN Relationships USING (RelationshipID) \" \\\n",
|
||||
" \"INNER JOIN Objects USING (ObjectID);\"\n",
|
||||
" RDF = pd.read_sql_query(QUERY, CONN)\n",
|
||||
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\"]]\n",
|
||||
" RDF = RDF.dropna()\n",
|
||||
" \"\"\"\n",
|
||||
" Subjects = pd.read_sql_query('SELECT * FROM Subjects;', CONN)\n",
|
||||
" Objects = pd.read_sql_query('SELECT * FROM Objects;', CONN)\n",
|
||||
" Relationships = pd.read_sql_query('SELECT * FROM Relationships;', CONN)\n",
|
||||
" RDF = pd.read_sql_query('SELECT * FROM RDFs;', CONN)\n",
|
||||
"\n",
|
||||
" # replace '' values with NaN (rows are not dropped: the .dropna() calls are commented out)\n",
|
||||
" Subjects = Subjects.replace('', np.nan)# .dropna()\n",
|
||||
" Relationships = Relationships.replace('', np.nan)# .dropna()\n",
|
||||
" Objects = Objects.replace('', np.nan)# .dropna()\n",
|
||||
"\n",
|
||||
" # join RDF with its components\n",
|
||||
" RDF = RDF.merge(Subjects, left_on=\"SubjectID\", right_on=\"SubjectID\")\n",
|
||||
" RDF = RDF.merge(Objects, left_on=\"ObjectID\", right_on=\"ObjectID\")\n",
|
||||
" RDF = RDF.merge(Relationships, left_on=\"RelationshipID\", right_on=\"RelationshipID\")\n",
|
||||
" RDF = RDF[[\"SubjectURI\", \"RelationshipURI\", \"ObjectURI\", \"MovieID\"]]\n",
|
||||
" return RDF\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"#def delete_relationship_by_uri(RDF: pd.DataFrame, )\n",
|
||||
"\n",
|
||||
"def delete_relationship_by_uri(RDF: pd.DataFrame, uri: str) -> pd.DataFrame:\n",
|
||||
" return RDF[RDF[\"RelationshipURI\"]!= uri]\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"RDF = get_RDF()\n",
|
||||
"# RDF = RDF.dropna()\n",
|
||||
"# print(RDF)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "644690bb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def filter_by_frequence_relationship_uri(RDF: pd.DataFrame, count_treshold) -> pd.DataFrame:\n",
|
||||
" counts = RDF[\"RelationshipURI\"].value_counts() \n",
|
||||
" RDF[\"RelationshipFreq\"] = RDF[\"RelationshipURI\"].map(counts)\n",
|
||||
" RDF = RDF[RDF[\"RelationshipFreq\"] >= count_treshold]\n",
|
||||
" # counts is a Series mapping each RelationshipURI (index) to its number of rows\n",
|
||||
" # counts = counts[counts > count_treshold]\n",
|
||||
" # relationships = counts.index\n",
|
||||
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||
" return RDF\n",
|
||||
"\n",
|
||||
"RDF = filter_by_frequence_relationship_uri(RDF, 1)\n",
|
||||
"# print(new_RDF)\n",
|
||||
"\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "34525be6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" SubjectURI \\\n",
|
||||
"0 http://dbpedia.org/resource/Nights_of_Cabiria \n",
|
||||
"1 http://dbpedia.org/resource/California_Science... \n",
|
||||
"2 http://dbpedia.org/resource/China_Captain \n",
|
||||
"3 http://dbpedia.org/resource/Caravan_of_Courage... \n",
|
||||
"4 http://dbpedia.org/resource/WHIH_Newsfront \n",
|
||||
"... ... \n",
|
||||
"12725500 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||
"12725501 http://dbpedia.org/resource/I_Will_Follow_(film) \n",
|
||||
"12725502 http://dbpedia.org/resource/I_Witnessed_Genoci... \n",
|
||||
"12725503 http://dbpedia.org/resource/I_Woke_Up_Early_th... \n",
|
||||
"12725504 http://dbpedia.org/resource/I_Won't_Play \n",
|
||||
"\n",
|
||||
" RelationshipURI \\\n",
|
||||
"0 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"1 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"2 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"3 http://www.w3.org/2002/07/owl#differentFrom \n",
|
||||
"4 http://www.w3.org/2000/01/rdf-schema#seeAlso \n",
|
||||
"... ... \n",
|
||||
"12725500 http://dbpedia.org/ontology/producer \n",
|
||||
"12725501 http://dbpedia.org/ontology/producer \n",
|
||||
"12725502 http://dbpedia.org/ontology/producer \n",
|
||||
"12725503 http://dbpedia.org/ontology/producer \n",
|
||||
"12725504 http://dbpedia.org/ontology/producer \n",
|
||||
"\n",
|
||||
" ObjectURI MovieID \\\n",
|
||||
"0 http://dbpedia.org/resource/Cabiria 26 \n",
|
||||
"1 http://dbpedia.org/resource/California_Academy... 185 \n",
|
||||
"2 http://dbpedia.org/resource/Captain_China 614 \n",
|
||||
"3 http://dbpedia.org/resource/Caravan_of_Courage... 740 \n",
|
||||
"4 http://dbpedia.org/resource/Captain_America:_C... 594 \n",
|
||||
"... ... ... \n",
|
||||
"12725500 http://dbpedia.org/resource/Ava_DuVernay 145854 \n",
|
||||
"12725501 http://dbpedia.org/resource/Molly_Mayeux 145854 \n",
|
||||
"12725502 http://dbpedia.org/resource/Headlines_Today 145861 \n",
|
||||
"12725503 http://dbpedia.org/resource/Billy_Zane 145862 \n",
|
||||
"12725504 http://dbpedia.org/resource/Gordon_Hollingshead 145864 \n",
|
||||
"\n",
|
||||
" RelationshipFreq MovieFreq \n",
|
||||
"0 2132 216 \n",
|
||||
"1 2132 264 \n",
|
||||
"2 2132 66 \n",
|
||||
"3 2132 131 \n",
|
||||
"4 1653 133 \n",
|
||||
"... ... ... \n",
|
||||
"12725500 80077 95 \n",
|
||||
"12725501 80077 95 \n",
|
||||
"12725502 80077 41 \n",
|
||||
"12725503 80077 98 \n",
|
||||
"12725504 80077 91 \n",
|
||||
"\n",
|
||||
"[12725505 rows x 6 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def filter_by_frequence_movie_id(RDF: pd.DataFrame, min_treshold, max_treshold) -> pd.DataFrame:\n",
|
||||
" counts = RDF[\"MovieID\"].value_counts() \n",
|
||||
" RDF[\"MovieFreq\"] = RDF[\"MovieID\"].map(counts)\n",
|
||||
" RDF = RDF[RDF[\"MovieFreq\"] >= min_treshold]\n",
|
||||
" RDF = RDF[RDF[\"MovieFreq\"] < max_treshold]\n",
|
||||
" # counts is a Series mapping each MovieID (index) to its number of rows\n",
|
||||
" # counts = counts[counts > count_treshold]\n",
|
||||
" # relationships = counts.index\n",
|
||||
" # RDF = RDF[RDF[\"RelationshipURI\"].isin(relationships)]\n",
|
||||
" # RDF = RDF.groupby(\"RelationshipURI\").filter(lambda x: len(x) >= count_treshold)\n",
|
||||
" return RDF\n",
|
||||
"\n",
|
||||
"RDF = filter_by_frequence_movie_id(RDF, 1, 1500)\n",
|
||||
"print(RDF)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "deep_learning",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.7"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user