From ada69748ce10a849b71968f17c5f70aae0bdfb8b Mon Sep 17 00:00:00 2001 From: augustin64 Date: Tue, 1 Apr 2025 09:30:28 +0200 Subject: [PATCH 1/2] Merge changes --- cheese.ipynb | 64 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/cheese.ipynb b/cheese.ipynb index df8e66c..fe0d268 100644 --- a/cheese.ipynb +++ b/cheese.ipynb @@ -335,6 +335,17 @@ " return df.copy()" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "fda6aaad-7b1e-4daa-8d28-cd049df9cec2", + "metadata": {}, + "outputs": [], + "source": [ + "data_features=filter_df(data)\n", + "data_features" + ] + }, { "cell_type": "markdown", "id": "d1eb67d7-d16b-4b93-8486-582830ac3903", @@ -407,9 +418,11 @@ ] }, { - "cell_type": "markdown", - "id": "48cbb634-6754-4956-a945-539d329812ef", + "cell_type": "code", + "execution_count": null, + "id": "24e7ff6e-c308-4cc8-aeac-eeb372f4c479", "metadata": {}, + "outputs": [], "source": [ "In this part, we achieved to do two things for the classification: create a decision tree on the database and, given a cheese and its characteristics, find where it originates from. \n", "\n" @@ -507,8 +520,27 @@ "metadata": {}, "source": [ "Not good, even quite bad. \n", - "We cannot find the region a cheese originates from given its characteristic. \n", - "\n", + "In short, it seems that we cannot find the region a cheese originates from given its characteristic. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fd507d0-1a68-4cd7-a12e-12c9ab1061e3", + "metadata": {}, + "outputs": [], + "source": [ + "model.predict(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9faf2aee-84f5-4633-b3de-039af42d31d3", + "metadata": {}, + "outputs": [], + "source": [ + "yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])\n", "\n" ] }, @@ -578,9 +610,11 @@ ] }, { - "cell_type": "markdown", - "id": "84bf779a-36d0-4aa2-b3a2-0da9bb25fc01", + "cell_type": "code", + "execution_count": null, + "id": "78ef08e7-1436-440f-b035-8b480af1cc7b", "metadata": {}, + "outputs": [], "source": [ "For Pattern Mining, we only kept relevant columns (binary attributes) thus dropping RGB colors and any location based information, keeping only information relevant to the final cheese itself.\n", "\n", @@ -588,24 +622,6 @@ "\n", "Est-ce que les fromages artisanaux ont souvent plus de \"goûts\" que les autres ?\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ce4db7c9-8049-4838-af30-b9fe2bca2925", - "metadata": {}, - "outputs": [], - "source": [ - "len(data_features[data_features[\"pecorino\"] == True]), len(data_features[data_features[\"pecorino\"] == False])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7d9f17c2-6c42-4f24-b0d0-e8640a661801", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From 6d6969d60cf151787c2854b4183625033bc58fb6 Mon Sep 17 00:00:00 2001 From: augustin64 Date: Tue, 1 Apr 2025 10:18:28 +0200 Subject: [PATCH 2/2] Update pattern mining --- cheese.ipynb | 62 +++++++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 37 deletions(-) diff --git a/cheese.ipynb b/cheese.ipynb index fe0d268..57b8706 100644 --- a/cheese.ipynb +++ b/cheese.ipynb @@ -82,12 +82,11 @@ { "cell_type": "code", "execution_count": null, - "id": "a0a77563-518e-4808-b744-9fc0c76763fe", + "id": 
"5d76fde3-8c65-4b50-a097-6dd81a68c1ca", "metadata": {}, "outputs": [], "source": [ - "print(len(data[pd.isnull(data[\"calcium_content\"])]))\n", - "print(len(data[pd.isnull(data[\"fat_content\"])]))" + "data.describe().T.plot(kind='bar')" ] }, { @@ -95,7 +94,7 @@ "id": "4590cffd-d4a9-4e15-8fd5-cbb22f048300", "metadata": {}, "source": [ - "Since those two columns have too much null values, we choose to remove them. \n", + "Since the `calcium_content` and `fat_content` columns have too many null values, we chose to remove them. \n", "Similarly, we removed other columns we are not interested in: " ] }, @@ -335,17 +334,6 @@ " return df.copy()" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "fda6aaad-7b1e-4daa-8d28-cd049df9cec2", - "metadata": {}, - "outputs": [], - "source": [ - "data_features=filter_df(data)\n", - "data_features" - ] - }, { "cell_type": "markdown", "id": "d1eb67d7-d16b-4b93-8486-582830ac3903", @@ -389,14 +377,6 @@ " return list(c[0] for c in data_colors), list(c[1] for c in data_colors), list(c[2] for c in data_colors)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "471728e0-5543-4afd-bf54-d21bd49dda75", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "code", "execution_count": null, @@ -418,11 +398,9 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "24e7ff6e-c308-4cc8-aeac-eeb372f4c479", + "cell_type": "markdown", + "id": "979b9eef-9ca2-4299-a4e0-e8d3813f45c6", "metadata": {}, - "outputs": [], "source": [ "In this part, we achieved to do two things for the classification: create a decision tree on the database and, given a cheese and its characteristics, find where it originates from. 
\n", "\n" @@ -540,8 +518,7 @@ "metadata": {}, "outputs": [], "source": [ - "yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])\n", - "\n" + "yprime=pd.DataFrame(model.predict(X),columns=[\"latitude\",\"longitude\"])" ] }, { @@ -561,7 +538,7 @@ "source": [ "unused_columns = {\"vegetarian\", \"vegan\", \"cheese\", \"region\", \"color\", \"location\", \"latitude\", \"longitude\", \"country\",\"color_r\",\"color_g\",\"color_b\"}\n", "data_features_only=data_features.drop(columns=list(unused_columns.intersection(data_features.columns)))\n", - "data_features_only.shape[1]" + "print(\"Number of features:\", data_features_only.shape[1])" ] }, { @@ -595,7 +572,7 @@ "outputs": [], "source": [ "assoc_rules = association_rules(frequent_itemsets, min_threshold=.5)\n", - "\n", + "assoc_rules=assoc_rules.sort_values(by=['confidence'], ascending=False)\n", "display(HTML(assoc_rules.to_html()))" ] }, @@ -606,21 +583,32 @@ "metadata": {}, "outputs": [], "source": [ - "assoc_rules[assoc_rules[\"consequents\"].astype(str).str.contains(\"cow\")]" + "assoc_rules[assoc_rules[\"antecedents\"].astype(str).str.contains(\"rich\")]" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "78ef08e7-1436-440f-b035-8b480af1cc7b", + "cell_type": "markdown", + "id": "84e2f426-8077-46c7-bc7e-357e631972d2", "metadata": {}, - "outputs": [], "source": [ "For Pattern Mining, we only kept relevant columns (binary attributes) thus dropping RGB colors and any location based information, keeping only information relevant to the final cheese itself.\n", "\n", "We applied the apriori algorithm for frequent itemsets and searched for association rules.\n", "\n", - "Est-ce que les fromages artisanaux ont souvent plus de \"goûts\" que les autres ?\n" + "If we observe the association rules with the highest degree of confidence, we can infer the following statements (which we then verified to be true):\n", + "- cheddar is primarily a cow's milk cheese\n", + "- " + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "104b476d-5531-40e7-8bf6-987f00a8f5c1", + "metadata": {}, + "outputs": [], + "source": [ + "data_f=text_to_boolean(data)\n", + "data_f[(data_f[\"bloomy\"] == True)]" ] } ],