Update pattern mining
This commit is contained in:
parent
ada69748ce
commit
6d6969d60c
62
cheese.ipynb
62
cheese.ipynb
@ -82,12 +82,11 @@
|
|||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
"id": "a0a77563-518e-4808-b744-9fc0c76763fe",
|
"id": "5d76fde3-8c65-4b50-a097-6dd81a68c1ca",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"print(len(data[pd.isnull(data[\"calcium_content\"])]))\n",
|
"data.describe().T.plot(kind='bar')"
|
||||||
"print(len(data[pd.isnull(data[\"fat_content\"])]))"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -95,7 +94,7 @@
|
|||||||
"id": "4590cffd-d4a9-4e15-8fd5-cbb22f048300",
|
"id": "4590cffd-d4a9-4e15-8fd5-cbb22f048300",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"source": [
|
"source": [
|
||||||
"Since those two columns have too much null values, we choose to remove them. \n",
|
"Since the `calcium_content` and `fat_content` columns have too many null values, we chose to remove them. \n",
|
||||||
"Similarly, we removed other columns we are not interested in: "
|
"Similarly, we removed other columns we are not interested in: "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -335,17 +334,6 @@
|
|||||||
" return df.copy()"
|
" return df.copy()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "fda6aaad-7b1e-4daa-8d28-cd049df9cec2",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"data_features=filter_df(data)\n",
|
|
||||||
"data_features"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "markdown",
|
"cell_type": "markdown",
|
||||||
"id": "d1eb67d7-d16b-4b93-8486-582830ac3903",
|
"id": "d1eb67d7-d16b-4b93-8486-582830ac3903",
|
||||||
@ -389,14 +377,6 @@
|
|||||||
" return list(c[0] for c in data_colors), list(c[1] for c in data_colors), list(c[2] for c in data_colors)"
|
" return list(c[0] for c in data_colors), list(c[1] for c in data_colors), list(c[2] for c in data_colors)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "471728e0-5543-4afd-bf54-d21bd49dda75",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
@ -418,11 +398,9 @@
|
|||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": null,
|
"id": "979b9eef-9ca2-4299-a4e0-e8d3813f45c6",
|
||||||
"id": "24e7ff6e-c308-4cc8-aeac-eeb372f4c479",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"In this part, we accomplished two things for the classification: creating a decision tree on the database and, given a cheese and its characteristics, finding where it originates from. \n",
|
"In this part, we accomplished two things for the classification: creating a decision tree on the database and, given a cheese and its characteristics, finding where it originates from. \n",
|
||||||
"\n"
|
"\n"
|
||||||
@ -540,8 +518,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])\n",
|
"yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])"
|
||||||
"\n"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -561,7 +538,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"unused_columns = {\"vegetarian\", \"vegan\", \"cheese\", \"region\", \"color\", \"location\", \"latitude\", \"longitude\", \"country\",\"color_r\",\"color_g\",\"color_b\"}\n",
|
"unused_columns = {\"vegetarian\", \"vegan\", \"cheese\", \"region\", \"color\", \"location\", \"latitude\", \"longitude\", \"country\",\"color_r\",\"color_g\",\"color_b\"}\n",
|
||||||
"data_features_only=data_features.drop(columns=list(unused_columns.intersection(data_features.columns)))\n",
|
"data_features_only=data_features.drop(columns=list(unused_columns.intersection(data_features.columns)))\n",
|
||||||
"data_features_only.shape[1]"
|
"print(\"Number of features:\", data_features_only.shape[1])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -595,7 +572,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"assoc_rules = association_rules(frequent_itemsets, min_threshold=.5)\n",
|
"assoc_rules = association_rules(frequent_itemsets, min_threshold=.5)\n",
|
||||||
"\n",
|
"assoc_rules=assoc_rules.sort_values(by=['confidence'], ascending=False)\n",
|
||||||
"display(HTML(assoc_rules.to_html()))"
|
"display(HTML(assoc_rules.to_html()))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@ -606,21 +583,32 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"assoc_rules[assoc_rules[\"consequents\"].astype(str).str.contains(\"cow\")]"
|
"assoc_rules[assoc_rules[\"antecedents\"].astype(str).str.contains(\"rich\")]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "markdown",
|
||||||
"execution_count": null,
|
"id": "84e2f426-8077-46c7-bc7e-357e631972d2",
|
||||||
"id": "78ef08e7-1436-440f-b035-8b480af1cc7b",
|
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
"source": [
|
||||||
"For Pattern Mining, we only kept relevant columns (binary attributes) thus dropping RGB colors and any location based information, keeping only information relevant to the final cheese itself.\n",
|
"For Pattern Mining, we only kept relevant columns (binary attributes) thus dropping RGB colors and any location based information, keeping only information relevant to the final cheese itself.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"We applied the apriori algorithm for frequent itemsets and searched for association rules.\n",
|
"We applied the apriori algorithm for frequent itemsets and searched for association rules.\n",
|
||||||
"\n",
|
"\n",
|
||||||
"Est-ce que les fromages artisanaux ont souvent plus de \"goûts\" que les autres ?\n"
|
"If we look at the association rules with the highest confidence, we can infer the following statements (which we then verified to be true):\n",
|
||||||
|
"- cheddar is primarily a cow cheese\n",
|
||||||
|
"- "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "104b476d-5531-40e7-8bf6-987f00a8f5c1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"data_f=text_to_boolean(data)\n",
|
||||||
|
"data_f[(data_f[\"bloomy\"] == True)]"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
|
Loading…
x
Reference in New Issue
Block a user