import gc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
from IPython.core.interactiveshell import InteractiveShell
%matplotlib inline
InteractiveShell.ast_node_interactivity = "all"
colour = sns.color_palette()
warnings.filterwarnings('ignore')
gc.collect()
0
### Define column names, and column datatypes. ###
prod_cols = ['product_id','product_name','aisle_id','department_id']
prod_dtypes= {'product_id':str,'product_name':str,'aisle_id':str,'department_id':str}
order_cols = ['order_id','user_id','eval_set','order_number','order_dow',
'order_hour_of_day','days_since_prior_order']
order_dtypes = {'order_id':str,'user_id':str,'eval_set':str,'order_number':str,
'order_dow':str,'order_hour_of_day':str,'days_since_prior_order':str}
order_prods_cols = ['order_id','product_id','add_to_cart_order','reordered']
order_prods_dtypes = {'order_id':str,'product_id':str,'add_to_cart_order':str,'reordered':str}
department_names = ["department_id", "department"]
department_dtypes = {"department_id":str, "department":str}
aisle_names = ["aisle_id", "aisle"]
aisle_dtypes = {"aisle_id":str, "aisle":str}
### Paths to the various files ###
prod_path = '/home/serena/workspace/Jupyter/Instacart/products.csv'
order_path = '/home/serena/workspace/Jupyter/Instacart/orders.csv'
order_prods_path = '/home/serena/workspace/Jupyter/Instacart/order_products__train.csv'
order_prods_prior_path = '/home/serena/workspace/Jupyter/Instacart/order_products__prior.csv'
department_path = '/home/serena/workspace/Jupyter/Instacart/departments/departments.csv'
aisle_path = '/home/serena/workspace/Jupyter/Instacart/aisles/aisles.csv'
### Read the files to dataframes ###
prods = pd.read_csv(prod_path, names=prod_cols, dtype=prod_dtypes, na_filter=False)
orders = pd.read_csv(order_path, names=order_cols, dtype=order_dtypes, na_filter=False)
order_prods = pd.read_csv(order_prods_path, names=order_prods_cols, dtype=order_prods_dtypes, na_filter=False)
order_prods_prior = pd.read_csv(order_prods_prior_path, names=order_prods_cols, dtype=order_prods_dtypes, na_filter=False)
departments = pd.read_csv(department_path, names=department_names, dtype=department_dtypes)
prods = prods.iloc[1:]
orders = orders.iloc[1:]
order_prods = order_prods.iloc[1:]
order_prods_prior = order_prods_prior.iloc[1:]
departments = departments.iloc[1:]
# Combine order_prods and order_prods_prior
# order_prods contains the most recent orders for a shopper
# order_prods_prior contains earlier orders
order_prods_all = pd.concat([order_prods, order_prods_prior], axis=0)
### View the first rows of each file ###
orders.head()
prods.head()
order_prods.head()
order_prods_prior.head()
order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | |
---|---|---|---|---|---|---|---|
1 | 2539329 | 1 | prior | 1 | 2 | 08 | |
2 | 2398795 | 1 | prior | 2 | 3 | 07 | 15.0 |
3 | 473747 | 1 | prior | 3 | 3 | 12 | 21.0 |
4 | 2254736 | 1 | prior | 4 | 4 | 07 | 29.0 |
5 | 431534 | 1 | prior | 5 | 4 | 15 | 28.0 |
product_id | product_name | aisle_id | department_id | |
---|---|---|---|---|
1 | 1 | Chocolate Sandwich Cookies | 61 | 19 |
2 | 2 | All-Seasons Salt | 104 | 13 |
3 | 3 | Robust Golden Unsweetened Oolong Tea | 94 | 7 |
4 | 4 | Smart Ones Classic Favorites Mini Rigatoni Wit... | 38 | 1 |
5 | 5 | Green Chile Anytime Sauce | 5 | 13 |
order_id | product_id | add_to_cart_order | reordered | |
---|---|---|---|---|
1 | 1 | 49302 | 1 | 1 |
2 | 1 | 11109 | 2 | 1 |
3 | 1 | 10246 | 3 | 0 |
4 | 1 | 49683 | 4 | 0 |
5 | 1 | 43633 | 5 | 1 |
order_id | product_id | add_to_cart_order | reordered | |
---|---|---|---|---|
1 | 2 | 33120 | 1 | 1 |
2 | 2 | 28985 | 2 | 1 |
3 | 2 | 9327 | 3 | 0 |
4 | 2 | 45918 | 4 | 1 |
5 | 2 | 30035 | 5 | 0 |
### View the departments dataframe.###
departments
department_id | department | |
---|---|---|
1 | 1 | frozen |
2 | 2 | other |
3 | 3 | bakery |
4 | 4 | produce |
5 | 5 | alcohol |
6 | 6 | international |
7 | 7 | beverages |
8 | 8 | pets |
9 | 9 | dry goods pasta |
10 | 10 | bulk |
11 | 11 | personal care |
12 | 12 | meat seafood |
13 | 13 | pantry |
14 | 14 | breakfast |
15 | 15 | canned goods |
16 | 16 | dairy eggs |
17 | 17 | household |
18 | 18 | babies |
19 | 19 | snacks |
20 | 20 | deli |
21 | 21 | missing |
### Size of the tables: (number of rows, number of columns) ###
# E.g. order_prods has 1384618 rows, and 4 columns: order_id, product_id, add_to_cart_order, reordered
print("The order_prods size is : ", order_prods.shape)
print("The order_prods_prior size is : ", order_prods_prior.shape)
print("The order_products_all size is : ", order_prods_all.shape)
The order_prods size is : (1384617, 4) The order_prods_prior size is : (32434489, 4) The order_products_all size is : (33819106, 4)
order_data = order_prods_prior.merge(prods, on = "product_id", how = "left")
final_order_data = order_data.merge(orders, on = "order_id", how = "left")
final_order_data.head()
order_id | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 33120 | 1 | 1 | Organic Egg Whites | 86 | 16 | 202279 | prior | 3 | 5 | 09 | 8.0 |
1 | 2 | 28985 | 2 | 1 | Michigan Organic Kale | 83 | 4 | 202279 | prior | 3 | 5 | 09 | 8.0 |
2 | 2 | 9327 | 3 | 0 | Garlic Powder | 104 | 13 | 202279 | prior | 3 | 5 | 09 | 8.0 |
3 | 2 | 45918 | 4 | 1 | Coconut Butter | 19 | 13 | 202279 | prior | 3 | 5 | 09 | 8.0 |
4 | 2 | 30035 | 5 | 0 | Natural Sweetener | 17 | 13 | 202279 | prior | 3 | 5 | 09 | 8.0 |
### Count the number of vegetarians. ###
# First look at "not vegetarians"- these are people who have purchased from the meat seafood category.
not_vegetarians = list(final_order_data[final_order_data.department_id == "12"].user_id.unique())
print("There are %s shoppers who have purchased meat or seafood." %len(not_vegetarians))
print("So there are %s shoppers who are vegetarian." % (final_order_data.user_id.nunique()- len(not_vegetarians)))
print("As a reminder, there are %s shoppers in the database." %final_order_data.user_id.nunique())
There are 113891 shoppers who have purchased meat or seafood. So there are 92318 shoppers who are vegetarian. As a reminder, there are 206209 shoppers in the database.
### Create veg column. ###
final_order_data["veg"] = np.where(final_order_data["user_id"].isin(not_vegetarians), 0, 1)
### Convert values to numeric ##
final_order_data["order_dow"] = pd.to_numeric(final_order_data["order_dow"])
final_order_data["order_hour_of_day"] = pd.to_numeric(final_order_data["order_hour_of_day"])
final_order_data["order_number"] = pd.to_numeric(final_order_data["order_number"])
final_order_data["days_since_prior_order"] = pd.to_numeric(final_order_data["days_since_prior_order"])
final_order_data["add_to_cart_order"] = pd.to_numeric(final_order_data["days_since_prior_order"])
final_order_data["reordered"] = pd.to_numeric(final_order_data["reordered"])
final_order_data.head()
order_id | product_id | add_to_cart_order | reordered | product_name | aisle_id | department_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order | veg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2 | 33120 | 8.0 | 1 | Organic Egg Whites | 86 | 16 | 202279 | prior | 3 | 5 | 9 | 8.0 | 1 |
1 | 2 | 28985 | 8.0 | 1 | Michigan Organic Kale | 83 | 4 | 202279 | prior | 3 | 5 | 9 | 8.0 | 1 |
2 | 2 | 9327 | 8.0 | 0 | Garlic Powder | 104 | 13 | 202279 | prior | 3 | 5 | 9 | 8.0 | 1 |
3 | 2 | 45918 | 8.0 | 1 | Coconut Butter | 19 | 13 | 202279 | prior | 3 | 5 | 9 | 8.0 | 1 |
4 | 2 | 30035 | 8.0 | 0 | Natural Sweetener | 17 | 13 | 202279 | prior | 3 | 5 | 9 | 8.0 | 1 |
def count_plot(data, col):
f, axes = plt.subplots(2, 1, figsize=(16,12), sharex=True)
plt.figure(figsize=(20,8))
g1 = sns.countplot(x=col, data=data[data.veg==1], ax=axes[0])
g1.set_title("vegetarian")
g2 = sns.countplot(x=col, data=data[data.veg==0], ax=axes[1])
g2.set_title("not vegetarian")
plt.tight_layout()
count_plot(final_order_data, "order_dow")
<Figure size 1440x576 with 0 Axes>
count_plot(final_order_data, "order_hour_of_day")
<Figure size 1440x576 with 0 Axes>
def count_plot_dist(data,col):
f, axes = plt.subplots(2, 1, figsize=(14,14), sharex=True)
plt.figure(figsize=(20,8))
g1 = sns.countplot(x=col, data=data[data.veg==1], ax=axes[0])
g1.set_title("vegetarian")
g1.set_xlabel('')
g2 = sns.countplot(x=col, data=data[data.veg==0], ax=axes[1])
g2.set_title("not vegetarian")
plt.tight_layout()
count_plot_dist(final_order_data,"days_since_prior_order")
<Figure size 1440x576 with 0 Axes>
# Heatmap of day of week vs hour of day
def dow_vs_hour(data):
group="veg"
grouped_df0 = data[data[group]==0].groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
grouped_df0 = grouped_df0.pivot('order_dow', 'order_hour_of_day', 'order_number')
grouped_df1 = data[data[group]==1].groupby(["order_dow", "order_hour_of_day"])["order_number"].aggregate("count").reset_index()
grouped_df1 = grouped_df1.pivot('order_dow', 'order_hour_of_day', 'order_number')
f, axes = plt.subplots(1, 2, figsize=(12,4), sharex=True)
plt.figure(figsize=(16,8))
g0 = sns.heatmap(grouped_df0,ax=axes[0])
g0.set_title("Frequency of Day of week Vs Hour of day: Not {}".format(group))
g1 = sns.heatmap(grouped_df1,ax=axes[1])
g1.set_title("Frequency of Day of week Vs Hour of day: {}".format(group))
plt.tight_layout()
plt.show()
dow_vs_hour(final_order_data)
<Figure size 1152x576 with 0 Axes>
def proportion_plot_both_groups(data):
group="veg"
data = data[data.department!="meat seafood"]
x, y, hue = "department", "purchase_proportion", group
hue_order = [group, "not_{}".format(group)]
plt.figure(figsize=(10,6))
(data[x]
.groupby(data[hue])
.value_counts(normalize=True)
.rename(y)
.reset_index()
.pipe((sns.barplot, "data"), x=x, y=y, hue=hue))
plt.xticks(rotation='45')
l = plt.legend(loc="upper right")
l.set_title('Non-{group} (0) vs {group} (1)'.format(group=group))
merged_data = final_order_data.merge(departments, on = "department_id")
proportion_plot_both_groups(merged_data)
def reorders_plot_both_groups(data):
group="veg"
data = data[data.department!="meat seafood"]
plot_title="Department wise reorder ratio: {group} vs Not {group}".format(group=group)
grouped_df = data.groupby(["department",group])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(10,6))
sns.pointplot(grouped_df['department'].values, grouped_df['reordered'].values,
alpha=0.8, color=colour[2], hue=grouped_df[group])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Department', fontsize=12)
plt.title(plot_title, fontsize=15)
plt.xticks(rotation='45')
plt.show()
reorders_plot_both_groups(merged_data)
def reorders_dow_both_groups(data):
group="veg"
plot_title = "Reorder ratio across day of week for {group} and non-{group}".format(group=group)
grouped_df = data.groupby(["order_dow", group])["reordered"].aggregate("mean").reset_index()
plt.figure(figsize=(10,6))
sns.barplot(grouped_df['order_dow'].values, grouped_df['reordered'].values,
alpha=0.8, color=colour[3], hue=grouped_df[group])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Day of week', fontsize=12)
plt.title(plot_title, fontsize=15)
plt.xticks(rotation='vertical')
plt.ylim(0.5, 0.7)
plt.show()
reorders_dow_both_groups(final_order_data)
def plot_correlations(feature_df, col_prefix):
group = "veg"
to_drop = "{}_12".format(col_prefix)
feature_df = feature_df.drop(to_drop, axis=1)
plot_title = "Correlation matrix (Pearson) for the {group} group.".format(group=group)
labels = final_order_data[["user_id", group]].drop_duplicates()
labels=labels[labels[group]==1]
df = feature_df.merge(labels,on="user_id")
corr = df.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(10, 8))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.title(plot_title, fontsize=15)
def plot_group_proportions(feature_df, order_data, col, n, format_xticks):
group = "veg"
data = feature_df.merge(order_data[["user_id", group]].drop_duplicates(), on = "user_id")
x, y, hue = col, "purchase_proportion", group
hue_order = [group, "not_{}".format(group)]
plt.figure(figsize=(12,6))
(data[x]
.groupby(data[hue])
.value_counts(normalize=True)
.rename(y)
.reset_index()
.pipe((sns.barplot, "data"), x=x, y=y, hue=hue))
plt.xticks(rotation='45')
ax = plt.gca()
if format_xticks is True:
for index, label in enumerate(ax.xaxis.get_ticklabels()):
if index % n != 0:
label.set_visible(False)
l = plt.legend(loc="upper right")
l.set_title('Non-{group} (0) vs {group} (1)'.format(group=group))
def feature_correlation(f1, f2):
feature = f1.merge(f2, on = "user_id")
corr = feature.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.5, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
def get_dept_reorder_rate(data):
dept_reorder_rate = data.groupby(["user_id", "department_id"])["reordered"].aggregate("mean").reset_index(name="reorder_rate")
dept_reorder_rate = dept_reorder_rate.pivot_table(index='user_id', columns='department_id', values='reorder_rate')
dept_reorder_rate.fillna(0, inplace=True)
dept_reorder_rate.columns = ['reord_rt_dept_' + str(col) for col in dept_reorder_rate.columns]
return dept_reorder_rate
def get_num_dept_reorders(data):
dept_num_reorders = data.groupby(["user_id", "department_id"])["reordered"].aggregate("sum").reset_index(name="num_reorders")
dept_num_reorders = dept_num_reorders.pivot_table(index='user_id', columns='department_id', values='num_reorders')
dept_num_reorders.fillna(0, inplace=True)
dept_num_reorders.columns = ['num_reorder_dept_' + str(col) for col in dept_num_reorders.columns]
return dept_num_reorders
def get_time_bw_dept_purchases(data):
avg_time_bw_dept = data.groupby(["user_id","department_id"])["days_since_prior_order"].aggregate("mean").reset_index(name="avg_days_bw_order_dept")
avg_time_bw_dept["avg_days_bw_order_dept"] = round(avg_time_bw_dept["avg_days_bw_order_dept"],1)
avg_time_bw_dept = avg_time_bw_dept.pivot_table(index='user_id', columns='department_id', values='avg_days_bw_order_dept')
avg_time_bw_dept.fillna(0, inplace=True)
avg_time_bw_dept.columns = ['avg_days_bw_dept_' + str(col) for col in avg_time_bw_dept.columns]
return avg_time_bw_dept
def get_num_dept_purchases(data):
num_dept_purchases = data.groupby(["user_id", "department_id"]).size().reset_index(name="num_orders")
num_dept_purchases = num_dept_purchases.pivot_table(index='user_id', columns='department_id', values='num_orders')
num_dept_purchases.fillna(0, inplace=True)
num_dept_purchases.columns = ['dept_' + str(col) for col in num_dept_purchases.columns]
# Drop the columns that were used to define the segments
# We don't need to see them in the correlation plot
return num_dept_purchases
num_dept_purchases = get_num_dept_purchases(final_order_data) # Feature
plot_correlations(num_dept_purchases, "dept")
def get_num_unique_dept_purchases(data):
num_unique_dept = data.groupby(["user_id"])["department"].aggregate("nunique").reset_index()
num_unique_dept.columns = ["user_id", "num_departments_purchased_from"]
return num_unique_dept
num_unique_dept = get_num_unique_dept_purchases(merged_data) # Feature
plot_group_proportions(num_unique_dept, final_order_data, "num_departments_purchased_from", 0, False)
dept_reorder_rate = get_dept_reorder_rate(merged_data) # Feature
plot_correlations(dept_reorder_rate,"reord_rt_dept")
num_dept_reorders = get_num_dept_reorders(merged_data) # Feature
plot_correlations(num_dept_reorders,"num_reorder_dept")
feature_correlation(num_dept_purchases, dept_reorder_rate)
feature_correlation(num_dept_purchases, num_dept_reorders)
time_bw_dept_purchases = get_time_bw_dept_purchases(merged_data) # Feature #get time bw dept purcahses
plot_correlations(time_bw_dept_purchases, "avg_days_bw_dept")
feature_correlation(time_bw_dept_purchases, dept_reorder_rate)
def get_time_bw_orders(data):
avg_time_bw = data.groupby(["user_id"])["days_since_prior_order"].aggregate("mean").reset_index(name="avg_days_bw_order")
avg_time_bw["avg_days_bw_order"] = round(avg_time_bw["avg_days_bw_order"])
return avg_time_bw
def get_avg_tod(data):
avg_tod = data.groupby(["user_id"])["order_hour_of_day"].aggregate("mean").reset_index(name="avg_tod")
avg_tod["avg_tod"] = round(avg_tod["avg_tod"],1)
return avg_tod
def get_avg_dow(data):
avg_dow = final_order_data.groupby(["user_id"])["order_dow"].aggregate("mean").reset_index(name="avg_dow")
avg_dow["avg_dow"] = round(avg_dow["avg_dow"],1)
return avg_dow
# Add to cart order is a way to get the number of items in the basket
# Max "add_to_cart_order" is the total number of items
def get_num_items_basket(data):
add_to_cart_info = data.groupby(["user_id", "order_id"])["add_to_cart_order"].aggregate("max").reset_index(name="num_items")
avg_order_size = add_to_cart_info.groupby(["user_id"])["num_items"].aggregate("mean").reset_index(name="avg_num_items")
avg_order_size["avg_num_items"] = round(avg_order_size["avg_num_items"],1)
return avg_order_size
def get_order_size_std(data):
add_to_cart_info = data.groupby(["user_id", "order_id"])["add_to_cart_order"].aggregate("max").reset_index(name="num_items")
std_order_size = add_to_cart_info.groupby(["user_id"])["num_items"].aggregate("std").reset_index(name="std_num_items")
std_order_size["std_num_items"] = round(std_order_size["std_num_items"],1)
return std_order_size
def get_user_reorder_rate(data):
user_reorder_rate = data.groupby(["user_id"])["reordered"].aggregate("mean").reset_index(name="user_reorder_rate")
user_reorder_rate["user_reorder_rate"] = round(user_reorder_rate["user_reorder_rate"],2)
return user_reorder_rate
avg_time_bw = get_time_bw_orders(merged_data)
plot_group_proportions(avg_time_bw, final_order_data, "avg_days_bw_order", 0, False)
avg_tod = get_avg_tod(merged_data)
plot_group_proportions(avg_tod, final_order_data, "avg_tod", 12, True)
avg_dow = get_avg_dow(merged_data)
plot_group_proportions(avg_dow, final_order_data, "avg_dow", 2, True)
avg_order_size = get_num_items_basket(merged_data)
plot_group_proportions(avg_order_size, final_order_data, "avg_num_items", 12, True)
std_order_size = get_order_size_std(merged_data)
plot_group_proportions(std_order_size, final_order_data, "std_num_items", 12, True)
user_reorder_rate = get_user_reorder_rate(merged_data)
plot_group_proportions(user_reorder_rate, final_order_data, "user_reorder_rate", 12, True)
Let's use 1,2,5,6,7,8,9, 10, and 11. 1, 3, 4 are all showed some degree of correlation with each other. If I cared to put more effort into this, I would take more time into investigating the relationships between variables.
# Merge all the features into one dataframe
model_data = num_dept_purchases.merge(
num_unique_dept, on ="user_id").merge(
avg_time_bw, on = "user_id").merge(
avg_tod, on = "user_id").merge(
avg_dow, on = "user_id").merge(
avg_order_size, on = "user_id").merge(
std_order_size, on = "user_id").merge(
time_bw_dept_purchases, on = "user_id").merge(
user_reorder_rate, on = "user_id")
model_data = model_data.merge(
final_order_data[["user_id", "veg"]].drop_duplicates(), on = "user_id")
# Take a look at the data
model_data.head()
model_data.columns
user_id | dept_1 | dept_10 | dept_11 | dept_12 | dept_13 | dept_14 | dept_15 | dept_16 | dept_17 | ... | avg_days_bw_dept_21 | avg_days_bw_dept_3 | avg_days_bw_dept_4 | avg_days_bw_dept_5 | avg_days_bw_dept_6 | avg_days_bw_dept_7 | avg_days_bw_dept_8 | avg_days_bw_dept_9 | user_reorder_rate | veg | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 0.0 | 13.0 | 2.0 | ... | 0.0 | 0.0 | 25.4 | 0.0 | 0.0 | 18.3 | 0.0 | 0.0 | 0.69 | 1 |
1 | 10 | 1.0 | 0.0 | 0.0 | 13.0 | 22.0 | 0.0 | 2.0 | 16.0 | 0.0 | ... | 0.0 | 0.0 | 19.4 | 0.0 | 30.0 | 0.0 | 0.0 | 0.0 | 0.34 | 0 |
2 | 100 | 3.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | ... | 0.0 | 28.0 | 24.8 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.26 | 0 |
3 | 1000 | 5.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 4.0 | 32.0 | 0.0 | ... | 7.5 | 6.0 | 11.4 | 0.0 | 0.0 | 13.4 | 0.0 | 0.0 | 0.69 | 1 |
4 | 10000 | 56.0 | 1.0 | 6.0 | 56.0 | 108.0 | 13.0 | 23.0 | 231.0 | 18.0 | ... | 0.0 | 4.9 | 5.3 | 0.0 | 5.0 | 4.1 | 0.0 | 5.3 | 0.77 | 0 |
5 rows × 51 columns
Index(['user_id', 'dept_1', 'dept_10', 'dept_11', 'dept_12', 'dept_13', 'dept_14', 'dept_15', 'dept_16', 'dept_17', 'dept_18', 'dept_19', 'dept_2', 'dept_20', 'dept_21', 'dept_3', 'dept_4', 'dept_5', 'dept_6', 'dept_7', 'dept_8', 'dept_9', 'num_departments_purchased_from', 'avg_days_bw_order', 'avg_tod', 'avg_dow', 'avg_num_items', 'std_num_items', 'avg_days_bw_dept_1', 'avg_days_bw_dept_10', 'avg_days_bw_dept_11', 'avg_days_bw_dept_12', 'avg_days_bw_dept_13', 'avg_days_bw_dept_14', 'avg_days_bw_dept_15', 'avg_days_bw_dept_16', 'avg_days_bw_dept_17', 'avg_days_bw_dept_18', 'avg_days_bw_dept_19', 'avg_days_bw_dept_2', 'avg_days_bw_dept_20', 'avg_days_bw_dept_21', 'avg_days_bw_dept_3', 'avg_days_bw_dept_4', 'avg_days_bw_dept_5', 'avg_days_bw_dept_6', 'avg_days_bw_dept_7', 'avg_days_bw_dept_8', 'avg_days_bw_dept_9', 'user_reorder_rate', 'veg'], dtype='object')
model_data.drop("dept_12", inplace=True, axis=1)
model_data.drop("avg_days_bw_dept_12", inplace=True, axis=1)
model_data[model_data.veg==1].user_id.nunique()
model_data[model_data.veg==0].user_id.nunique()
model_data[model_data.veg==1].user_id.nunique()/model_data.user_id.nunique()
92318
113891
0.4476914198701318
# Fix this cell
num_veg = len(model_data[model_data['veg'] == 1])
veg_indices = model_data[model_data.veg == 0].index #indices of non veg
random_indices = np.random.choice(veg_indices, num_veg, replace=False) # get random indices of nonparents
veg_indices = model_data[model_data.veg == 1].index # get indices of veg
under_sample_indices = np.concatenate([veg_indices,random_indices])
under_sample = model_data.loc[under_sample_indices]
# New X and y
features = model_data.columns
X = under_sample[features.drop(["user_id","veg"])]
y = under_sample[["veg"]]
X.columns
print("We have %s features." %len(features))
Index(['dept_1', 'dept_10', 'dept_11', 'dept_13', 'dept_14', 'dept_15', 'dept_16', 'dept_17', 'dept_18', 'dept_19', 'dept_2', 'dept_20', 'dept_21', 'dept_3', 'dept_4', 'dept_5', 'dept_6', 'dept_7', 'dept_8', 'dept_9', 'num_departments_purchased_from', 'avg_days_bw_order', 'avg_tod', 'avg_dow', 'avg_num_items', 'std_num_items', 'avg_days_bw_dept_1', 'avg_days_bw_dept_10', 'avg_days_bw_dept_11', 'avg_days_bw_dept_13', 'avg_days_bw_dept_14', 'avg_days_bw_dept_15', 'avg_days_bw_dept_16', 'avg_days_bw_dept_17', 'avg_days_bw_dept_18', 'avg_days_bw_dept_19', 'avg_days_bw_dept_2', 'avg_days_bw_dept_20', 'avg_days_bw_dept_21', 'avg_days_bw_dept_3', 'avg_days_bw_dept_4', 'avg_days_bw_dept_5', 'avg_days_bw_dept_6', 'avg_days_bw_dept_7', 'avg_days_bw_dept_8', 'avg_days_bw_dept_9', 'user_reorder_rate'], dtype='object')
We have 49 features.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print("We have %s shoppers who are not veg in the train set."
%y_train[y_train.veg==0].index.nunique())
print("We have %s shoppers who are veg in the train set."
%y_train[y_train.veg==1].index.nunique())
print("We have %s shoppers who are not veg in the test set."
%y_test[y_test.veg==0].index.nunique())
print("We have %s shoppers who are veg in the test set."
%y_test[y_test.veg==1].index.nunique())
We have 61777 shoppers who are not veg in the train set. We have 61929 shoppers who are veg in the train set. We have 30541 shoppers who are not veg in the test set. We have 30389 shoppers who are veg in the test set.
# Helper functions
def inspect_df(predictions, y_test):
true_vs_pred = np.hstack((predictions, y_test))
true_df = pd.DataFrame(true_vs_pred)
true_df.columns = ["Predicted", "Actual"]
return true_df
def accuracy_stats(df):
fn = len(true_vs_pred[(true_vs_pred.Predicted==0)&(true_vs_pred.Actual==1)])
tn = len(true_vs_pred[(true_vs_pred.Predicted==0)&(true_vs_pred.Actual==0)])
fp = len(true_vs_pred[(true_vs_pred.Predicted==1)&(true_vs_pred.Actual==0)])
tp = len(true_vs_pred[(true_vs_pred.Predicted==1)&(true_vs_pred.Actual==1)])
print("There are %s false positives."%fp)
print("There are %s false negatives."%fn)
print("There are %s true positives."%tp)
print("There are %s true negatives."%tn)
return fp, fn, tp, tn
from sklearn.linear_model import LogisticRegression
# Fit the out of the box model- we won't be doing any parameter tuning since it's a POC
lr = LogisticRegression()
lr.fit(X_train, y_train)
score = lr.score(X_test, y_test)
# Get predictions on the test set from the trained model
preds = pd.DataFrame(lr.predict(X_test),index=y_test.index)
preds.columns = ["prediction"]
# Inspect results
true_vs_pred = inspect_df(preds,y_test)
true_vs_pred.head(10)
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True, intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=None, penalty='l2', random_state=None, solver='warn', tol=0.0001, verbose=0, warm_start=False)
Predicted | Actual | |
---|---|---|
0 | 0 | 1 |
1 | 0 | 1 |
2 | 0 | 0 |
3 | 1 | 1 |
4 | 1 | 1 |
5 | 0 | 0 |
6 | 0 | 0 |
7 | 0 | 0 |
8 | 1 | 0 |
9 | 0 | 1 |
fp, fn, tp, tn = accuracy_stats(true_vs_pred)
There are 6276 false positives. There are 7219 false negatives. There are 23170 true positives. There are 24265 true negatives.
print("Total missclassified: " + str(fp + fn))
print("Total correct: " + str(tp + tn))
print("So %s percent are correct."%round(score,2))
Total missclassified: 13495 Total correct: 47435 So 0.78 percent are correct.
# Scikit learn doesn't allow error estimate for inference.
# Let's use Statsmodel instead
import statsmodels.api as sm
lr_sm = sm.Logit(y_train, X_train).fit()
print(lr_sm.summary())
Optimization terminated successfully. Current function value: 0.519442 Iterations 6 Logit Regression Results ============================================================================== Dep. Variable: veg No. Observations: 123706 Model: Logit Df Residuals: 123659 Method: MLE Df Model: 46 Date: Thu, 29 Nov 2018 Pseudo R-squ.: 0.2506 Time: 15:03:44 Log-Likelihood: -64258. converged: True LL-Null: -85746. LLR p-value: 0.000 ================================================================================================== coef std err z P>|z| [0.025 0.975] -------------------------------------------------------------------------------------------------- dept_1 -0.0044 0.001 -7.875 0.000 -0.006 -0.003 dept_10 0.0663 0.007 9.936 0.000 0.053 0.079 dept_11 0.0196 0.002 11.385 0.000 0.016 0.023 dept_13 -0.0030 0.001 -3.193 0.001 -0.005 -0.001 dept_14 0.0144 0.001 11.372 0.000 0.012 0.017 dept_15 -0.0059 0.001 -4.870 0.000 -0.008 -0.004 dept_16 -0.0014 0.000 -4.287 0.000 -0.002 -0.001 dept_17 0.0034 0.001 3.243 0.001 0.001 0.006 dept_18 0.0005 0.001 0.525 0.600 -0.001 0.002 dept_19 0.0066 0.000 15.271 0.000 0.006 0.007 dept_2 0.0605 0.008 7.122 0.000 0.044 0.077 dept_20 0.0020 0.001 1.989 0.047 2.9e-05 0.004 dept_21 0.0633 0.006 10.492 0.000 0.052 0.075 dept_3 -0.0061 0.001 -5.891 0.000 -0.008 -0.004 dept_4 -0.0036 0.000 -18.099 0.000 -0.004 -0.003 dept_5 0.0069 0.001 5.131 0.000 0.004 0.010 dept_6 0.0392 0.003 13.485 0.000 0.033 0.045 dept_7 0.0040 0.000 9.814 0.000 0.003 0.005 dept_8 -0.0008 0.002 -0.424 0.671 -0.004 0.003 dept_9 -0.0211 0.002 -13.467 0.000 -0.024 -0.018 num_departments_purchased_from -0.4109 0.004 -105.153 0.000 -0.419 -0.403 avg_days_bw_order -0.0285 0.005 -6.032 0.000 -0.038 -0.019 avg_tod 0.1772 0.003 66.030 0.000 0.172 0.182 avg_dow 0.2088 0.006 32.349 0.000 0.196 0.221 avg_num_items 0.0323 0.004 7.557 0.000 0.024 0.041 std_num_items 0.0216 0.002 11.742 0.000 0.018 0.025 avg_days_bw_dept_1 -0.0028 0.001 -3.234 0.001 -0.005 -0.001 avg_days_bw_dept_10 0.0235 0.002 11.223 0.000 0.019 0.028 avg_days_bw_dept_11 0.0178 0.001 21.174 0.000 0.016 0.019 avg_days_bw_dept_13 0.0012 0.001 1.379 0.168 -0.001 0.003 avg_days_bw_dept_14 0.0136 0.001 17.142 0.000 0.012 0.015 avg_days_bw_dept_15 -0.0020 0.001 -2.436 0.015 -0.004 -0.000 avg_days_bw_dept_16 -0.0031 0.001 -2.416 0.016 -0.006 -0.001 avg_days_bw_dept_17 0.0151 0.001 18.740 0.000 0.014 0.017 avg_days_bw_dept_18 0.0163 0.001 13.047 0.000 0.014 0.019 avg_days_bw_dept_19 0.0122 0.001 13.471 0.000 0.010 0.014 avg_days_bw_dept_2 0.0179 0.002 9.986 0.000 0.014 0.021 avg_days_bw_dept_20 0.0009 0.001 1.104 0.270 -0.001 0.002 avg_days_bw_dept_21 0.0147 0.001 10.041 0.000 0.012 0.018 avg_days_bw_dept_3 0.0039 0.001 4.815 0.000 0.002 0.006 avg_days_bw_dept_4 -0.0114 0.001 -8.044 0.000 -0.014 -0.009 avg_days_bw_dept_5 0.0226 0.002 12.682 0.000 0.019 0.026 avg_days_bw_dept_6 0.0059 0.001 6.411 0.000 0.004 0.008 avg_days_bw_dept_7 0.0104 0.001 11.955 0.000 0.009 0.012 avg_days_bw_dept_8 0.0129 0.002 7.524 0.000 0.010 0.016 avg_days_bw_dept_9 -0.0003 0.001 -0.318 0.751 -0.002 0.001 user_reorder_rate 1.5783 0.041 38.654 0.000 1.498 1.658 ==================================================================================================
import lime # Local Interpretable Model-Agnostic Explanations
import lime.lime_tabular # Since we're using dataframes
# Input our data as arrays
predictors = list(X_train.columns)
train_array = X_train.as_matrix()
test_array = X_test.as_matrix()
# Create the explainer from training data
explainer = lime.lime_tabular.LimeTabularExplainer(train_array, feature_names=predictors, class_names=[0,1])
# Explain instances in the test data
def explain_prediction(index, num_features):
explain_lr = explainer.explain_instance(test_array[index], lr.predict_proba, num_features = num_features)
explain_lr.show_in_notebook()
explain_prediction(0, len(predictors))
explain_prediction(0, 10)
explain_prediction(4, 10)