# DM project: cheese

In [None]:
import json
import os
import random
import sys
import time

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import tqdm.notebook as tqdm
from geopy.geocoders import Nominatim
from IPython.display import display, HTML
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
from sklearn import tree
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

We use the following dataset from Kaggle: [Cheese: 248 different types of cheese with various characteristics](https://www.kaggle.com/datasets/joebeachcapital/cheese). 

In [None]:
# Load the Kaggle cheese dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="cheeses.csv")
# Leaving the frame as the last expression of the cell renders it in the notebook.
data

## Cleaning and pre-processing

In [None]:
# Distinct values found in the "color" column (missing entries show up as nan).
print(set(data["color"]))
# Show the rows whose color is missing. A bare expression in the middle of a
# cell is evaluated and then thrown away, so it has to be displayed explicitly.
display(data[data["color"].isna()])
data

In [None]:
# Print how many rows have no value in each of the two nutritional columns.
for column in ("calcium_content", "fat_content"):
    print(int(data[column].isna().sum()))

Since these two columns contain too many missing values, we remove them. 
Similarly, we remove the other columns we are not interested in: 

In [None]:
# Columns that are not used in the rest of the analysis.
unused_columns = ["alt_spellings", "producers", "calcium_content", "url", "fat_content", "synonyms"]
# Remove, in place, whichever of them are still present, so the cell can be
# re-run without failing on columns that are already gone.
data.drop(columns=[name for name in unused_columns if name in data.columns], inplace=True)
data

Now, we are interested in having only one column representing the location for each cheese. 

In [None]:
# Keep only the rows that have at least one of "country" / "region" filled in.
data = data[data[["country", "region"]].notna().any(axis=1)]
# Replace the remaining missing country/region values by empty strings so
# that the two columns can be concatenated later.
data = data.fillna(value={"country": "", "region": ""})
print(f"{len(data)} rows remaining")
data

We removed 6 rows for which we could not find a suitable location. 

In [None]:
# Collapse the compound United Kingdom labels to a single nation name.
# The replacements are applied one after the other, in this order, so each
# mask is computed on the already-updated "country" column.
uk_nations = [
    (("England, Great Britain, United Kingdom", "England, United Kingdom"), "England"),
    (("Scotland",), "Scotland"),
    (("Great Britain, United Kingdom, Wales", "United Kingdom, Wales"), "Wales"),
]
for patterns, nation in uk_nations:
    matches = [data["country"].str.contains(pattern) for pattern in patterns]
    selected = matches[0]
    for extra in matches[1:]:
        selected = selected | extra
    data.loc[selected, "country"] = nation

We simplify some country values (the nations of the United Kingdom) so that the location can be resolved more easily. 

In [None]:
# Discard the cheeses attributed to several countries ("A, B" or "A and B"):
# they cannot be mapped to a single location.
multi_country = data["country"].str.contains(",") | data["country"].str.contains(" and ")
data = data[~multi_country]
# reset_index returns a new frame, so its result has to be assigned to take
# effect. drop=True discards the old labels instead of storing them in an
# extra "index" column, which would otherwise end up among the features.
data = data.reset_index(drop=True)
data

We removed 41 cheeses because they can come from several countries. 

In [None]:
# Build one "region, country" string per cheese.
data["location"] = data["region"].str.cat(data["country"], sep=", ")
data

### Converting the locations to GPS coordinates

In order to have more numeric data to apply a classification algorithm, we transform the location to GPS coordinates and the color to RGB. 

In [None]:
def str_to_gps(loc):
    """Geocode a location string with Nominatim.

    The query is first reduced to the first and last comma-separated parts
    of ``loc`` (removing the details in between gives fewer errors while
    fetching the GPS coordinates). If nothing is found for that query, the
    last part alone is tried.

    Args:
        loc: comma-separated location string.

    Returns:
        A ``(latitude, longitude)`` tuple.

    Raises:
        AttributeError: if neither query returns a result. The exception
            type is the one the previous implementation raised (reading
            ``.latitude`` on ``None``), so existing ``except AttributeError``
            handlers keep working.
    """
    parts = loc.split(",")
    # One client is enough for both attempts.
    geocoder = Nominatim(user_agent="dmProject")
    queries = (",".join([parts[0], parts[-1]]), parts[-1])
    for query in queries:
        res = geocoder.geocode(query)
        if res is not None:
            return (res.latitude, res.longitude)
    raise AttributeError(f"no geocoding result for {loc!r}")
def get_locations(backup_file, locations=None):
    """Return a mapping from location string to GPS coordinates, cached on disk.

    If ``backup_file`` exists, its JSON content is returned and no request
    is made. Otherwise every location is geocoded with ``str_to_gps``, the
    successful results are written to ``backup_file`` and returned.

    Args:
        backup_file: path of the JSON cache file.
        locations: iterable of location strings to geocode. Defaults to the
            notebook-level ``locs`` variable, which is what this function
            always used before the parameter existed.

    Returns:
        A dict mapping each location to its coordinates: ``[lat, lon]`` lists
        when read from the cache, ``(lat, lon)`` tuples when freshly
        geocoded. Locations that could not be geocoded are absent.
    """
    if os.path.isfile(backup_file):
        with open(backup_file) as f:
            return json.load(f)
    if locations is None:
        locations = locs
    errors = set()
    locations_to_gps = {}
    for loc in tqdm.tqdm(locations):
        time.sleep(1)  # We don't want to overload the Nominatim server which will stop responding
        try:
            locations_to_gps[loc] = str_to_gps(loc)
        except AttributeError:
            # str_to_gps found no result for this location: report it and go on.
            errors.add(loc)
            print(loc, file=sys.stderr)
        else:
            print(loc, locations_to_gps[loc])
    if errors:
        print(f"{len(errors)} location(s) could not be geocoded", file=sys.stderr)
    with open(backup_file, "w") as f:
        json.dump(locations_to_gps, f)
    return locations_to_gps

In [None]:
# Geocode every distinct location (or reload the cached result), then attach
# the coordinates of each cheese as two new columns.
locs = set(data["location"])
locations_to_gps = get_locations("locations_to_gps.json")
coordinates = [locations_to_gps[place] for place in data["location"]]
latitudes = [point[0] for point in coordinates]
longitudes = [point[1] for point in coordinates]
data["latitude"] = latitudes
data["longitude"] = longitudes

In [None]:
# Plot every cheese on a world map, coloured by milk type.
fig = px.scatter_map(
    data,
    lat="latitude",
    lon="longitude",
    hover_name="cheese",
    hover_data=["cheese"],
    color="milk",
    zoom=1.5,
    height=800,
    width=1400,
)

# scatter_map draws on the "map" subplot, so the tile style is set through
# "map_style"; "mapbox_style" configures the separate "mapbox" subplot, which
# this figure does not use, and therefore had no visible effect.
fig.update_layout(
    map_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show();

In [None]:
def filter_df(df, cols=None):
    """One-hot encode columns that hold comma-separated attributes.

    Every cell of the columns in ``cols`` is split on commas. Each distinct
    trimmed token becomes a boolean column telling whether the row carries
    that attribute in any of ``cols`` (the same attribute can appear in
    different columns). The columns in ``cols`` are removed from the result.

    Args:
        df: input frame; it is not modified.
        cols: names of the columns to encode. Defaults to ``milk``,
            ``color``, ``type``, ``texture``, ``flavor``, ``aroma``,
            ``family`` and ``rind``.

    Returns:
        A new DataFrame made of the remaining columns of ``df`` followed by
        one boolean column per attribute, in sorted attribute order.
    """
    if cols is None:
        cols = ["milk", "color", "type", "texture", "flavor", "aroma", "family", "rind"]

    # Attributes carried by each row, pooled over all the encoded columns.
    row_attrs = [set() for _ in range(len(df))]
    for col in cols:
        for attrs, cell in zip(row_attrs, df[col]):
            if pd.isna(cell):  # missing value: nothing to add for this row
                continue
            attrs.update(token.strip() for token in cell.split(","))

    # Sorted so that the column order is the same on every run; iterating a
    # set of strings directly gives an order that can change between runs.
    attributes = sorted(set().union(*row_attrs))

    # Build all the indicator columns in one go rather than inserting them
    # one at a time into the frame.
    indicators = pd.DataFrame(
        {attr: [attr in attrs for attrs in row_attrs] for attr in attributes},
        index=df.index,
        dtype=bool,
    )

    # Drop the encoded columns, and any remaining column whose name is also an
    # attribute (the indicator replaces it).
    dropped = [name for name in df.columns if name in cols or name in indicators.columns]
    return pd.concat([df.drop(columns=dropped), indicators], axis=1)

In [None]:
# One-hot encode the descriptive columns of every cheese.
data_features = filter_df(df=data)
data_features

## Classification

Transform: the color to RGB; the location to GPS coordinates.  
Question 1: is the color alone enough to tell where a cheese comes from?  
Question 2: does it work if we also add the type?  
Question 3: what about the taste characteristics?

In [None]:
# "region" is deliberately kept in data_features at this point.
# DataFrame.drop returns a new frame, so an unassigned
# data_features.drop(columns=["region"]) would change nothing, and the cells
# below drop "region" themselves when they build their inputs.
data_features

In [None]:
# Predict the country of a cheese from its one-hot attributes with a shallow
# decision tree, then draw the fitted tree.
Y = LabelEncoder().fit_transform(data_features["country"])
X = data_features.drop(
    columns=["cheese", "country", "region", "vegetarian", "location", "latitude", "longitude"]
)
# Fixed seeds make the split, and therefore the plotted tree, reproducible
# from one run of the notebook to the next.
data_train, data_test, target_train, target_test = train_test_split(X, Y, random_state=0)
c = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
c = c.fit(data_train, target_train)
plt.figure(figsize=(100, 150))
ax = plt.subplot()

# feature_names is documented as a list of str; passing a plain list avoids
# depending on how a pandas Index is validated by the installed release.
tree.plot_tree(c, ax=ax, filled=True, feature_names=list(X.columns));


## Pattern Mining

In [None]:
# Keep only the attribute columns for pattern mining: every identifying,
# geographic or dietary column that is present gets dropped.
unused_columns = {"vegetarian", "vegan", "cheese", "region", "color", "location", "latitude", "longitude", "country"}
columns_to_drop = [name for name in data_features.columns if name in unused_columns]
data_features_only = data_features.drop(columns=columns_to_drop)
# Number of remaining feature columns.
len(data_features_only.columns)

We have $164$ features in our data, which is very large compared to the number of rows. We therefore choose a min_support of $0.1$ for the apriori algorithm used for pattern mining. 

In [None]:
# Mine the itemsets whose support is at least 0.1 and render them as an HTML table.
frequent_itemsets = apriori(data_features_only, min_support=0.1, use_colnames=True)
itemsets_html = frequent_itemsets.to_html()
display(HTML(itemsets_html))

In [None]:
# Derive the association rules whose confidence is at least 0.5 from the
# frequent itemsets, and render them as an HTML table.
assoc_rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules_html = assoc_rules.to_html()
display(HTML(rules_html))