# DM project: cheese

In [None]:
import os
import time
import json
import random

from matplotlib import colors
import matplotlib.pyplot as plt
import plotly.express as px
import tqdm.notebook as tqdm
from IPython.display import display, HTML

from geopy.geocoders import Nominatim


import pandas as pd

from mlxtend.frequent_patterns import apriori, association_rules

from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

For this project, we chose to study cheeses. We retrieved a [dataset from Kaggle](https://www.kaggle.com/datasets/joebeachcapital/cheese) that gives several characteristics for more than $1000$ cheeses. We have information about the origin, the milk, types, texture, rind, flavor, etc. of these cheeses. 

In [None]:
# Load the cheese dataset from a CSV file located in the working directory.
data = pd.read_csv("cheeses.csv")
# Bare expression so the notebook renders the frame as the cell output.
data

## I.  Cleaning and pre-processing

We carried out several tasks during cleaning and preprocessing: 
- Dropping some rows that did not have enough data
-  Dropping some columns that had too many ```NaN``` values (e.g. ```fat_content```)
-  Converting locations (given as region and country name) to GPS coordinates to use them in linear regression
-  Doing the same for cheese colors: we converted them to RGB. 
-  Some characteristics of cheeses were given as lists of adjectives; we chose to put them as booleans into separate columns to ease the processing. 


In [None]:
# Print, for each nutrient column, how many rows have no value.
for nutrient_column in ("calcium_content", "fat_content"):
    rows_without_value = data[data[nutrient_column].isnull()]
    print(len(rows_without_value))

Since those two columns have too many null values, we chose to remove them. 
Similarly, we removed other columns we are not interested in: 

In [None]:
# Columns that are too sparse or not useful for the analysis.
unused_columns = [
    "fat_content",
    "calcium_content",
    "alt_spellings",
    "producers",
    "url",
    "synonyms",
]
# Only drop the columns that are still there, so the cell can be re-run safely.
still_present = [name for name in unused_columns if name in data.columns]
data.drop(columns=still_present, inplace=True)
data

### Converting the locations to GPS coordinates



Now, we are interested in having only one column representing the location for each cheese. 

In [None]:
# Discard the rows where both the country and the region are missing, then
# replace the remaining missing country/region values with an empty string so
# the two columns can be concatenated later.
data = (
    data.dropna(subset=["country", "region"], how="all")
    .fillna(value={"country": "", "region": ""})
)
print(f"{len(data)} rows remaining")
data

We removed 6 rows for which we could not find a suitable location. 

In [None]:
# Collapse the verbose United Kingdom labels into the constituent nation.
# The rules are applied in order, each one on the result of the previous one.
uk_nation_rules = (
    ("England, Great Britain, United Kingdom|England, United Kingdom", "England"),
    ("Scotland", "Scotland"),
    ("Great Britain, United Kingdom, Wales|United Kingdom, Wales", "Wales"),
)
for label_pattern, nation in uk_nation_rules:
    matching_rows = data["country"].str.contains(label_pattern)
    data.loc[matching_rows, "country"] = nation

We rename some countries so that their location can be found more easily. 

In [None]:
# Discard the cheeses attributed to several countries: their country field
# either lists the countries separated by commas or joins them with " and ".
multi_country = (
    data["country"].str.contains(",", regex=False)
    | data["country"].str.contains(" and ", regex=False)
)
# reset_index returns a new frame, so its result has to be assigned back.
# drop=True discards the old labels instead of turning them into an "index"
# column that would later be treated as a feature.
data = data[~multi_country].reset_index(drop=True)
data

We removed 41 cheeses because they can come from several countries. 

In [None]:
# Build a single "region, country" string per cheese. Rows whose region was
# filled with an empty string end up with a location starting with ", ".
data["location"]=data["region"]+", "+data["country"]

In order to have more numeric data to apply a classification algorithm, we transform the location to GPS coordinates and the color to RGB. 

In [None]:
def str_to_gps(loc):
    """Geocode a comma-separated location string with Nominatim.

    The first and last comma-separated parts of ``loc`` are looked up first
    (removing the intermediate details gives fewer errors while fetching the
    GPS coordinates). If that lookup yields no usable result, the last part
    alone is looked up instead.

    Returns a ``(latitude, longitude)`` tuple. An ``AttributeError`` escapes
    when the fallback lookup has no usable result either.
    """
    # In the real world, we would have used a real (non-free) API to compute
    # all this. This one is free and does not give so bad results.
    parts = loc.split(",")
    geocoder = Nominatim(user_agent="dmProject")
    short_query = ",".join([parts[0], parts[-1]])
    try:
        place = geocoder.geocode(short_query)
        coordinates = (place.latitude, place.longitude)
    except AttributeError:
        # No result for the short query: retry with the last part only.
        place = geocoder.geocode(parts[-1])
        coordinates = (place.latitude, place.longitude)
    return coordinates
        
def get_locations(backup_file, locations=None):
    """Return a mapping from location string to GPS coordinates.

    If ``backup_file`` exists, its JSON content is returned as is and no
    request is made. Otherwise every location is geocoded with
    ``str_to_gps``, the resulting mapping is written to ``backup_file`` as
    JSON and then returned.

    Parameters:
        backup_file: path of the JSON cache file.
        locations: iterable of location strings to geocode. When omitted,
            the notebook-level ``locs`` collection is used.

    Notes:
        - Locations for which ``str_to_gps`` raises ``AttributeError`` are
          reported on stderr and left out of the mapping.
        - Values read back from the cache are JSON arrays (lists), whereas
          freshly computed values are tuples.
    """
    # Local import: `sys` is needed for stderr and is not provided by the
    # notebook's import cell.
    import sys

    if os.path.isfile(backup_file):
        with open(backup_file, encoding="utf-8") as f:
            return json.load(f)

    if locations is None:
        locations = locs  # notebook-level set of location strings

    locations_to_gps = {}
    for loc in tqdm.tqdm(locations):
        time.sleep(1)  # We don't want to overload the Nominatim server which will stop responding
        try:
            locations_to_gps[loc] = str_to_gps(loc)
        except AttributeError:
            print(loc, file=sys.stderr)
        else:
            print(loc, locations_to_gps[loc])

    with open(backup_file, "w", encoding="utf-8") as f:
        json.dump(locations_to_gps, f)
    return locations_to_gps

In [None]:
# Distinct locations to geocode (read by get_locations), then the mapping
# from location string to coordinates.
locs = set(data["location"])
locations_to_gps = get_locations("locations_to_gps.json")
# Look up the coordinates of every row, in row order.
row_coordinates = [locations_to_gps[place] for place in data.location]
latitudes = [pair[0] for pair in row_coordinates]
longitudes = [pair[1] for pair in row_coordinates]
data["latitude"] = latitudes
data["longitude"] = longitudes

In [None]:
# Plot every cheese at its geocoded position, coloured by milk type.
fig = px.scatter_map(
    data,
    lat="latitude",
    lon="longitude",
    hover_name="cheese",
    hover_data=["cheese"],
    color="milk",
    zoom=1.5,
    height=800,
    width=1400,
)

# scatter_map draws on the figure's `map` subplot, so the tile style has to be
# set through `map_style`; `mapbox_style` configures a different subplot type
# and would leave this figure's style unchanged.
fig.update_layout(
    map_style="open-street-map",
    margin={"r": 0, "t": 0, "l": 0, "b": 0},
)
fig.show();

### Converting the text data to boolean values

We want to transform the many characteristics of the cheeses to boolean values, to be able to use them as numeric data. 

In [None]:
def text_to_boolean(df, cols=None):
    """Replace comma-separated text columns by one boolean column per attribute.

    Every cell of the columns in ``cols`` is split on commas and each piece is
    stripped of surrounding whitespace. The pieces found anywhere in ``cols``
    form the attribute set (the same attribute may appear in several
    columns). The returned frame contains the columns of ``df`` that are not
    in ``cols``, followed by one boolean column per attribute, in sorted
    order, telling whether the row has that attribute in any of ``cols``.

    Parameters:
        df: input frame; it is not modified.
        cols: names of the text columns to expand. Defaults to the cheese
            description columns listed below.

    Returns:
        A new DataFrame with the same index as ``df``.

    Notes:
        - Cells that are not strings (e.g. NaN) contribute no attribute.
        - An attribute named like an existing column replaces that column;
          an attribute named like one of ``cols`` is dropped with it.
    """
    if cols is None:
        cols = ["milk",
                "color",
                "type", "texture", "flavor", "aroma", "family", "rind"]
    cols = list(cols)

    # Attributes carried by each row, merged over all the text columns.
    row_attrs = [set() for _ in range(len(df))]
    for col in cols:
        for attrs, cell in zip(row_attrs, df[col]):
            if isinstance(cell, str):
                attrs.update(piece.strip() for piece in cell.split(","))

    # Sorted so that the column order does not depend on set iteration order.
    attributes = sorted(set().union(*row_attrs))

    # Build all the boolean columns at once instead of inserting them one by
    # one, which fragments the frame.
    flags = pd.DataFrame(
        {attr: [attr in attrs for attrs in row_attrs] for attr in attributes},
        index=df.index,
    )
    flags = flags.drop(columns=[name for name in flags.columns if name in cols])

    replaced = set(cols).union(attributes)
    kept = df.drop(columns=[name for name in df.columns if name in replaced])
    return pd.concat([kept, flags], axis=1)

In [None]:
# Expand the text columns into boolean attribute columns. The conversion
# function defined in this notebook is text_to_boolean; no filter_df exists.
data_features = text_to_boolean(data)
data_features

Similarly, we convert the colors to their RGB representations. 

In [None]:
def color_columns(data):
    """
    Build the red, green and blue columns matching the "color" column of data.

    Each color name is mapped to an approximate hex code, converted to an RGB
    triple with matplotlib. Values that are not in the table (including
    missing values) are mapped to (0, 0, 0).

    Returns three lists (red, green, blue), one entry per row of data.
    """
    color_to_hex = {
        'blue': "#50564B",
        'blue-grey': "#504E4A",
        'brown': "#D19651",
        'brownish yellow': "#E5CD80",
        'cream': "#D9D9CE",
        'golden orange': "#D0915D",
        'golden yellow': "#DCBE9A",
        'green': "#6AA57F",
        'ivory': "#E8C891",
        'orange': "#C7980D",
        'pale white': "#DAD5C2",
        'pale yellow': "#F3D7B1",
        'pink and white': "#C0AB94",
        'red': "#984F18",
        'straw': "#F8EAC6",
        'white': "#F8F8F8",
        'yellow': "#EBD88B",
    }
    fallback = (0, 0, 0)
    rgb_by_name = {name: colors.to_rgb(code) for name, code in color_to_hex.items()}
    triples = [rgb_by_name.get(name, fallback) for name in data["color"]]
    reds = [triple[0] for triple in triples]
    greens = [triple[1] for triple in triples]
    blues = [triple[2] for triple in triples]
    return reds, greens, blues

In [None]:
# Store the three color channels, then expand the text columns into booleans.
channel_names = ("color_r", "color_g", "color_b")
for channel_name, channel_values in zip(channel_names, color_columns(data)):
    data[channel_name] = channel_values
data_features = text_to_boolean(data)
data_features

## II. Classification

In [None]:
In this part, we did two things for the classification: we built a decision tree on the database and, given a cheese and its characteristics, we tried to find where it originates from. 



### II.A Decision tree

In [None]:
# Encode the country names as integer class labels.
Y = LabelEncoder().fit_transform(data_features["country"])
# Features: everything except the cheese name, the target and the columns
# that describe the location (which would give the country away).
X = data_features.drop(
    columns=["cheese", "country", "region", "vegetarian", "location", "latitude", "longitude"]
)
# Seed the split as well as the tree, otherwise the plotted tree changes on
# every run even though the classifier itself is seeded.
data_train, data_test, target_train, target_test = train_test_split(
    X, Y, random_state=0)
c = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
c = c.fit(data_train, target_train)
plt.figure(figsize=(150, 100))
ax = plt.subplot()

# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
tree.plot_tree(c, ax=ax, filled=True, feature_names=list(X.columns));

We built a decision tree for our cheese database. 
We noticed that the most relevant features, those used by the decision tree, relate to the texture and the taste of the cheeses (rindless, bloomy, soft, tangy), rather than to the animal milk used. 


### Linear regression: find location depending on the cheese characteristics

We try to do a linear regression over the data to see whether, given a cheese, we can guess where it originates from. We are going to see that it does not work very well: each regression model has an $R^2$ coefficient of less than $0.3$, which is very poor. 


In [None]:
# Remove the identifier, location-text and diet columns before the regression.
# errors="ignore" skips the columns that are already gone (e.g. when the cell
# is re-run) without hiding unrelated failures the way a bare `except` does.
data_features.drop(
    columns=["cheese", "country", "region", "location", "vegetarian", "vegan"],
    errors="ignore",
    inplace=True,
)
data_features

In [None]:
# Predictors: every feature except the coordinates.
X = data_features.drop(columns=["latitude", "longitude"])
# Targets, in (longitude, latitude) order.
y = data_features[["longitude", "latitude"]]

In [None]:
# Fit each linear model on the full data and print its score on that same data.
# `model` stays bound to the last regressor once the loop is over.
regressors = (LinearRegression(), Ridge(), Lasso(), ElasticNet())
for model in regressors:
    fitted = model.fit(X, y)
    print(fitted.score(X, y))

Not good, even quite bad. 
In short, it seems that we cannot find the region a cheese originates from given its characteristics. 

In [None]:
# `model` is whatever the fitting loop left bound last (the ElasticNet
# instance); show its predictions for the data it was fitted on.
model.predict(X)

In [None]:
# The model was fitted on y, whose columns are (longitude, latitude); label the
# predictions with those same columns so the two coordinates are not swapped.
yprime = pd.DataFrame(model.predict(X), columns=list(y.columns))



## III. Pattern Mining

In [None]:
# Columns that are not binary cheese attributes and must be left out of the
# pattern mining.
unused_columns = {
    "vegetarian", "vegan", "cheese", "region", "color", "location",
    "latitude", "longitude", "country", "color_r", "color_g", "color_b",
}
# Drop only those that are actually present in the frame.
columns_to_drop = [name for name in data_features.columns if name in unused_columns]
data_features_only = data_features.drop(columns=columns_to_drop)
data_features_only.shape[1]

We have $164$ features in our data, which is very large compared to the number of rows of our data. So, we choose a min_support of $0.05$ during the apriori algorithm for pattern mining. 

In [None]:
# Mine the itemsets whose support is at least 0.05, reported with the column
# names rather than column indices.
frequent_itemsets = apriori(data_features_only,min_support=.05, use_colnames=True)
# Render the whole table as HTML in the notebook output.
display(HTML(frequent_itemsets.to_html()))

In [None]:
# Derive association rules from the frequent itemsets, keeping those whose
# metric reaches 0.5 (the metric is left at the library default).
assoc_rules = association_rules(frequent_itemsets, min_threshold=.5)

# Render the whole table as HTML in the notebook output.
display(HTML(assoc_rules.to_html()))

In [None]:
# Keep the rules whose consequent, once converted to text, contains "cow".
# This is a substring match on the string form of the consequent.
assoc_rules[assoc_rules["consequents"].astype(str).str.contains("cow")]

In [None]:
For Pattern Mining, we only kept relevant columns (binary attributes) thus dropping RGB colors and any location based information, keeping only information relevant to the final cheese itself.

We applied the apriori algorithm for frequent itemsets and searched for association rules.

Est-ce que les fromages artisanaux ont souvent plus de "goûts" que les autres ?
