Update pattern mining
This commit is contained in:
parent
ada69748ce
commit
6d6969d60c
62
cheese.ipynb
62
cheese.ipynb
@ -82,12 +82,11 @@
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a0a77563-518e-4808-b744-9fc0c76763fe",
|
||||
"id": "5d76fde3-8c65-4b50-a097-6dd81a68c1ca",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(len(data[pd.isnull(data[\"calcium_content\"])]))\n",
|
||||
"print(len(data[pd.isnull(data[\"fat_content\"])]))"
|
||||
"data.describe().T.plot(kind='bar')"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -95,7 +94,7 @@
|
||||
"id": "4590cffd-d4a9-4e15-8fd5-cbb22f048300",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Since those two columns have too much null values, we choose to remove them. \n",
|
||||
"Since the `calcium_content` and `fat_content` columns have too many null values, we chose to remove them. \n",
|
||||
"Similarly, we removed other columns we are not interested in: "
|
||||
]
|
||||
},
|
||||
@ -335,17 +334,6 @@
|
||||
" return df.copy()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "fda6aaad-7b1e-4daa-8d28-cd049df9cec2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_features=filter_df(data)\n",
|
||||
"data_features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d1eb67d7-d16b-4b93-8486-582830ac3903",
|
||||
@ -389,14 +377,6 @@
|
||||
" return list(c[0] for c in data_colors), list(c[1] for c in data_colors), list(c[2] for c in data_colors)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "471728e0-5543-4afd-bf54-d21bd49dda75",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
@ -418,11 +398,9 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "24e7ff6e-c308-4cc8-aeac-eeb372f4c479",
|
||||
"cell_type": "markdown",
|
||||
"id": "979b9eef-9ca2-4299-a4e0-e8d3813f45c6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"In this part, we managed to do two things for the classification: create a decision tree on the database and, given a cheese and its characteristics, find where it originates from. \n",
|
||||
"\n"
|
||||
@ -540,8 +518,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])\n",
|
||||
"\n"
|
||||
"yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -561,7 +538,7 @@
|
||||
"source": [
|
||||
"unused_columns = {\"vegetarian\", \"vegan\", \"cheese\", \"region\", \"color\", \"location\", \"latitude\", \"longitude\", \"country\",\"color_r\",\"color_g\",\"color_b\"}\n",
|
||||
"data_features_only=data_features.drop(columns=list(unused_columns.intersection(data_features.columns)))\n",
|
||||
"data_features_only.shape[1]"
|
||||
"print(\"Number of features:\", data_features_only.shape[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -595,7 +572,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"assoc_rules = association_rules(frequent_itemsets, min_threshold=.5)\n",
|
||||
"\n",
|
||||
"assoc_rules=assoc_rules.sort_values(by=['confidence'], ascending=False)\n",
|
||||
"display(HTML(assoc_rules.to_html()))"
|
||||
]
|
||||
},
|
||||
@ -606,21 +583,32 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"assoc_rules[assoc_rules[\"consequents\"].astype(str).str.contains(\"cow\")]"
|
||||
"assoc_rules[assoc_rules[\"antecedents\"].astype(str).str.contains(\"rich\")]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "78ef08e7-1436-440f-b035-8b480af1cc7b",
|
||||
"cell_type": "markdown",
|
||||
"id": "84e2f426-8077-46c7-bc7e-357e631972d2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"For Pattern Mining, we only kept the relevant columns (binary attributes), thus dropping RGB colors and any location-based information and keeping only information relevant to the final cheese itself.\n",
|
||||
"\n",
|
||||
"We applied the apriori algorithm for frequent itemsets and searched for association rules.\n",
|
||||
"\n",
|
||||
"Est-ce que les fromages artisanaux ont souvent plus de \"goûts\" que les autres ?\n"
|
||||
"If we observe the association rules with the highest degree of confidence, we can infer the following statements (which we then verified to be true):\n",
|
||||
"- cheddar is primarily a cow cheese\n",
|
||||
"- "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "104b476d-5531-40e7-8bf6-987f00a8f5c1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data_f=text_to_boolean(data)\n",
|
||||
"data_f[(data_f[\"bloomy\"] == True)]"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
Loading…
x
Reference in New Issue
Block a user