import pandas as pd
import datetime
import math
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
%matplotlib inline
cs_df = pd.read_excel(io=r'Online Retail.xlsx')
cs_df.head()
InvoiceNo | StockCode | Description | Quantity | InvoiceDate | UnitPrice | CustomerID | Country | |
---|---|---|---|---|---|---|---|---|
0 | 536365 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 6 | 2010-12-01 08:26:00 | 2.55 | 17850.0 | United Kingdom |
1 | 536365 | 71053 | WHITE METAL LANTERN | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
2 | 536365 | 84406B | CREAM CUPID HEARTS COAT HANGER | 8 | 2010-12-01 08:26:00 | 2.75 | 17850.0 | United Kingdom |
3 | 536365 | 84029G | KNITTED UNION FLAG HOT WATER BOTTLE | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
4 | 536365 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 6 | 2010-12-01 08:26:00 | 3.39 | 17850.0 | United Kingdom |
Transactions size
cs_df.shape
(541909, 8)
cs_df.Country.value_counts().reset_index().head(n=10)
index | Country | |
---|---|---|
0 | United Kingdom | 495478 |
1 | Germany | 9495 |
2 | France | 8557 |
3 | EIRE | 8196 |
4 | Spain | 2533 |
5 | Netherlands | 2371 |
6 | Belgium | 2069 |
7 | Switzerland | 2002 |
8 | Portugal | 1519 |
9 | Australia | 1259 |
Number of customers
cs_df.CustomerID.unique().shape
(4373,)
(cs_df.CustomerID.value_counts()/sum(cs_df.CustomerID.value_counts())*100).head(n=13).cumsum()
17841.0 1.962249 14911.0 3.413228 14096.0 4.673708 12748.0 5.814728 14606.0 6.498553 15311.0 7.110850 14646.0 7.623350 13089.0 8.079807 13263.0 8.492020 14298.0 8.895138 15039.0 9.265809 14156.0 9.614850 18118.0 9.930462 Name: CustomerID, dtype: float64
Number of unique items
cs_df.StockCode.unique().shape
(4070,)
Description of items: We see that the descriptions are more then the stock code so there must be some stock code which have more than one decription
cs_df.Description.unique().shape
(4224,)
cs_df.dtypes
InvoiceNo object StockCode object Description object Quantity int64 InvoiceDate datetime64[ns] UnitPrice float64 CustomerID float64 Country object dtype: object
cat_des_df = cs_df.groupby(["StockCode","Description"]).count().reset_index()
Stockcode which have more than one description
cat_des_df.StockCode.value_counts()[cat_des_df.StockCode.value_counts()>1].reset_index().head()
index | StockCode | |
---|---|---|
0 | 20713 | 8 |
1 | 23084 | 7 |
2 | 21830 | 6 |
3 | 85175 | 6 |
4 | 85172 | 5 |
Example of one such stockcode
cs_df[cs_df['StockCode'] == cat_des_df.StockCode.value_counts()[cat_des_df.StockCode.value_counts()>1]
.reset_index()['index'][5]]['Description'].unique()
array(['JUMBO BAG VINTAGE CHRISTMAS ', 'came coded as 20713', 'wrongly coded 20713', '20713 wrongly marked', 20713], dtype=object)
cs_df['invdatetime'] = pd.to_datetime(cs_df.InvoiceDate)
cs_df.Quantity.describe()
count 541909.000000 mean 9.552250 std 218.081158 min -80995.000000 25% 1.000000 50% 3.000000 75% 10.000000 max 80995.000000 Name: Quantity, dtype: float64
cs_df.UnitPrice.describe()
count 541909.000000 mean 4.611114 std 96.759853 min -11062.060000 25% 1.250000 50% 2.080000 75% 4.130000 max 38970.000000 Name: UnitPrice, dtype: float64
# Seperate data for one geography
cs_df = cs_df[cs_df.Country == 'United Kingdom']
# Seperate attribute for total amount
cs_df['amount'] = cs_df.Quantity*cs_df.UnitPrice
# Remove negative or return transactions
cs_df = cs_df[~(cs_df.amount<0)]
cs_df.head()
cs_df = cs_df[~(cs_df.CustomerID.isnull())]
cs_df.shape
(354345, 10)
cs_df.InvoiceDate.max()
Timestamp('2011-12-09 12:49:00')
cs_df.InvoiceDate.min()
Timestamp('2010-12-01 08:26:00')
refrence_date = cs_df.InvoiceDate.max()
refrence_date = refrence_date + datetime.timedelta(days = 1)
cs_df['days_since_last_purchase'] = refrence_date - cs_df.InvoiceDate
cs_df['days_since_last_purchase_num'] = cs_df['days_since_last_purchase'].astype('timedelta64[D]')
Time period of transactions
customer_history_df = cs_df.groupby("CustomerID").min().reset_index()[['CustomerID', 'days_since_last_purchase_num']]
customer_history_df.rename(columns={'days_since_last_purchase_num':'recency'}, inplace=True)
customer_history_df.recency.describe()
count 3921.000000 mean 92.188472 std 99.528995 min 1.000000 25% 18.000000 50% 51.000000 75% 143.000000 max 374.000000 Name: recency, dtype: float64
customer_history_df.head()
CustomerID | recency | |
---|---|---|
0 | 12346.0 | 326.0 |
1 | 12747.0 | 2.0 |
2 | 12748.0 | 1.0 |
3 | 12749.0 | 4.0 |
4 | 12820.0 | 3.0 |
customer_history_df.recency.describe()
count 3921.000000 mean 92.188472 std 99.528995 min 1.000000 25% 18.000000 50% 51.000000 75% 143.000000 max 374.000000 Name: recency, dtype: float64
x = customer_history_df.recency
mu = np.mean(customer_history_df.recency)
sigma = math.sqrt(np.var(customer_history_df.recency))
n, bins, patches = plt.hist(x, 1000, facecolor='green', alpha=0.75)
# add a 'best fit' line
y = mlab.normpdf(bins, mu, sigma)
l = plt.plot(bins, y, 'r--', linewidth=2)
plt.xlabel('Recency in days')
plt.ylabel('Number of transactions')
plt.title(r'$\mathrm{Histogram\ of\ sales\ recency}\ $')
plt.grid(True)
customer_monetary_val = cs_df[['CustomerID', 'amount']].groupby("CustomerID").sum().reset_index()
customer_history_df = customer_history_df.merge(customer_monetary_val, how='outer')
customer_history_df.amount = customer_history_df.amount+0.001
customer_freq = cs_df[['CustomerID', 'amount']].groupby("CustomerID").count().reset_index()
customer_freq.rename(columns={'amount':'frequency'},inplace=True)
customer_history_df = customer_history_df.merge(customer_freq, how='outer')
Remove returns so that we only have purchases of a customer
customer_history_df.head()
CustomerID | recency | amount | frequency | |
---|---|---|---|---|
0 | 12346.0 | 326.0 | 77183.601 | 1 |
1 | 12747.0 | 2.0 | 4196.011 | 103 |
2 | 12748.0 | 1.0 | 33719.731 | 4596 |
3 | 12749.0 | 4.0 | 4090.881 | 199 |
4 | 12820.0 | 3.0 | 942.341 | 59 |
from sklearn import preprocessing
import math
customer_history_df['recency_log'] = customer_history_df['recency'].apply(math.log)
customer_history_df['frequency_log'] = customer_history_df['frequency'].apply(math.log)
customer_history_df['amount_log'] = customer_history_df['amount'].apply(math.log)
feature_vector = ['amount_log', 'recency_log','frequency_log']
X_subset = customer_history_df[feature_vector].as_matrix()
scaler = preprocessing.StandardScaler().fit(X_subset)
X_scaled = scaler.transform(X_subset)
plt.scatter(customer_history_df.recency_log, customer_history_df.amount_log, alpha=0.5)
<matplotlib.collections.PathCollection at 0x23fd2d19a90>
x = customer_history_df.amount_log
n, bins, patches = plt.hist(x, 1000, facecolor='green', alpha=0.75)
plt.xlabel('Log of Sales Amount')
plt.ylabel('Probability')
plt.title(r'$\mathrm{Histogram\ of\ Log\ transformed\ Customer\ Monetary\ value}\ $')
plt.grid(True)
#plt.show()
customer_history_df.head()
CustomerID | recency | amount | frequency | recency_log | frequency_log | amount_log | |
---|---|---|---|---|---|---|---|
0 | 12346.0 | 326.0 | 77183.601 | 1 | 5.786897 | 0.000000 | 11.253942 |
1 | 12747.0 | 2.0 | 4196.011 | 103 | 0.693147 | 4.634729 | 8.341890 |
2 | 12748.0 | 1.0 | 33719.731 | 4596 | 0.000000 | 8.432942 | 10.425838 |
3 | 12749.0 | 4.0 | 4090.881 | 199 | 1.386294 | 5.293305 | 8.316516 |
4 | 12820.0 | 3.0 | 942.341 | 59 | 1.098612 | 4.077537 | 6.848367 |
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
xs =customer_history_df.recency_log
ys = customer_history_df.frequency_log
zs = customer_history_df.amount_log
ax.scatter(xs, ys, zs, s=5)
ax.set_xlabel('Recency')
ax.set_ylabel('Frequency')
ax.set_zlabel('Monetary')
#plt.show()
<matplotlib.text.Text at 0x23fc2e56828>
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.cm as cm
X = X_scaled
cluster_centers = dict()
for n_clusters in range(3,6,2):
fig, (ax1, ax2) = plt.subplots(1, 2)
#ax2 = plt.subplot(111, projection='3d')
fig.set_size_inches(18, 7)
ax1.set_xlim([-0.1, 1])
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X)
silhouette_avg = silhouette_score(X, cluster_labels)
cluster_centers.update({n_clusters :{
'cluster_center':clusterer.cluster_centers_,
'silhouette_score':silhouette_avg,
'labels':cluster_labels}
})
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
ith_cluster_silhouette_values = \
sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([])
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
feature1 = 0
feature2 = 2
ax2.scatter(X[:, feature1], X[:, feature2], marker='.', s=30, lw=0, alpha=0.7,
c=colors, edgecolor='k')
centers = clusterer.cluster_centers_
ax2.scatter(centers[:, feature1], centers[:, feature2], marker='o',
c="white", alpha=1, s=200, edgecolor='k')
for i, c in enumerate(centers):
ax2.scatter(c[feature1], c[feature2], marker='$%d$' % i, alpha=1,
s=50, edgecolor='k')
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature i.e. monetary value")
ax2.set_ylabel("Feature space for the 2nd feature i.e. frequency")
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
"with n_clusters = %d" % n_clusters),
fontsize=14, fontweight='bold')
#plt.show()
for i in range(3,6,2):
print("for {} number of clusters".format(i))
cent_transformed = scaler.inverse_transform(cluster_centers[i]['cluster_center'])
print(pd.DataFrame(np.exp(cent_transformed),columns=feature_vector))
print("Silhouette score for cluster {} is {}". format(i, cluster_centers[i]['silhouette_score']))
print()
for 3 number of clusters amount_log recency_log frequency_log 0 843.937271 44.083222 53.920633 1 221.236034 121.766072 10.668661 2 3159.294272 7.196647 177.789098 Silhouette score for cluster 3 is 0.30437444714898737 for 5 number of clusters amount_log recency_log frequency_log 0 3905.544371 5.627973 214.465989 1 1502.519606 46.880212 92.306262 2 142.867249 126.546751 5.147370 3 408.235418 139.056216 25.530424 4 464.371885 13.386419 29.581298 Silhouette score for cluster 5 is 0.27958641427323727
labels = cluster_centers[5]['labels']
customer_history_df['num_cluster5_labels'] = labels
labels = cluster_centers[3]['labels']
customer_history_df['num_cluster3_labels'] = labels
customer_history_df.head()
CustomerID | recency | amount | frequency | recency_log | frequency_log | amount_log | num_cluster5_labels | num_cluster3_labels | |
---|---|---|---|---|---|---|---|---|---|
0 | 12346.0 | 326.0 | 77183.601 | 1 | 5.786897 | 0.000000 | 11.253942 | 1 | 0 |
1 | 12747.0 | 2.0 | 4196.011 | 103 | 0.693147 | 4.634729 | 8.341890 | 0 | 2 |
2 | 12748.0 | 1.0 | 33719.731 | 4596 | 0.000000 | 8.432942 | 10.425838 | 0 | 2 |
3 | 12749.0 | 4.0 | 4090.881 | 199 | 1.386294 | 5.293305 | 8.316516 | 0 | 2 |
4 | 12820.0 | 3.0 | 942.341 | 59 | 1.098612 | 4.077537 | 6.848367 | 4 | 2 |
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode()
x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4', 'Cluster 5']
cutoff_quantile = 100
field_to_plot = 'recency'
y0 = customer_history_df[customer_history_df['num_cluster5_labels']==0][field_to_plot].values
y0 = y0[y0<np.percentile(y0, cutoff_quantile)]
y1 = customer_history_df[customer_history_df['num_cluster5_labels']==1][field_to_plot].values
y1 = y1[y1<np.percentile(y1, cutoff_quantile)]
y2 = customer_history_df[customer_history_df['num_cluster5_labels']==2][field_to_plot].values
y2 = y2[y2<np.percentile(y2, cutoff_quantile)]
y3 = customer_history_df[customer_history_df['num_cluster5_labels']==3][field_to_plot].values
y3 = y3[y3<np.percentile(y3, cutoff_quantile)]
y4 = customer_history_df[customer_history_df['num_cluster5_labels']==4][field_to_plot].values
y4 = y4[y4<np.percentile(y4, cutoff_quantile)]
y_data = [y0,y1,y2,y3,y4]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
traces = []
for xd, yd, cls in zip(x_data, y_data, colors):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints=False,
jitter=0.5,
whiskerwidth=0.2,
fillcolor=cls,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
yaxis=dict(
autorange=True,
showgrid=True,
zeroline=True,
dtick=50,
gridcolor='black',
gridwidth=0.1,
zerolinecolor='rgb(255, 255, 255)',
zerolinewidth=2,
),
margin=dict(
l=40,
r=30,
b=80,
t=100,
),
paper_bgcolor='white',
plot_bgcolor='white',
showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)
x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4', 'Cluster 5']
cutoff_quantile = 80
field_to_plot = 'amount'
y0 = customer_history_df[customer_history_df['num_cluster5_labels']==0][field_to_plot].values
y0 = y0[y0<np.percentile(y0, cutoff_quantile)]
y1 = customer_history_df[customer_history_df['num_cluster5_labels']==1][field_to_plot].values
y1 = y1[y1<np.percentile(y1, cutoff_quantile)]
y2 = customer_history_df[customer_history_df['num_cluster5_labels']==2][field_to_plot].values
y2 = y2[y2<np.percentile(y2, cutoff_quantile)]
y3 = customer_history_df[customer_history_df['num_cluster5_labels']==3][field_to_plot].values
y3 = y3[y3<np.percentile(y3, cutoff_quantile)]
y4 = customer_history_df[customer_history_df['num_cluster5_labels']==4][field_to_plot].values
y4 = y4[y4<np.percentile(y4, cutoff_quantile)]
y_data = [y0,y1,y2,y3,y4]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
traces = []
for xd, yd, cls in zip(x_data, y_data, colors):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints=False,
jitter=0.5,
whiskerwidth=0.2,
fillcolor=cls,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
yaxis=dict(
autorange=True,
showgrid=True,
zeroline=True,
dtick=1000,
gridcolor='black',
gridwidth=0.1,
zerolinecolor='rgb(255, 255, 255)',
zerolinewidth=2,
),
margin=dict(
l=40,
r=30,
b=80,
t=100,
),
paper_bgcolor='white',
plot_bgcolor='white',
showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)
x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4', 'Cluster 5']
cutoff_quantile = 80
field_to_plot = 'frequency'
y0 = customer_history_df[customer_history_df['num_cluster5_labels']==0][field_to_plot].values
y0 = y0[y0<np.percentile(y0, cutoff_quantile)]
y1 = customer_history_df[customer_history_df['num_cluster5_labels']==1][field_to_plot].values
y1 = y1[y1<np.percentile(y1, cutoff_quantile)]
y2 = customer_history_df[customer_history_df['num_cluster5_labels']==2][field_to_plot].values
y2 = y2[y2<np.percentile(y2, cutoff_quantile)]
y3 = customer_history_df[customer_history_df['num_cluster5_labels']==3][field_to_plot].values
y3 = y3[y3<np.percentile(y3, cutoff_quantile)]
y4 = customer_history_df[customer_history_df['num_cluster5_labels']==4][field_to_plot].values
y4 = y4[y4<np.percentile(y4, cutoff_quantile)]
y_data = [y0,y1,y2,y3,y4]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
traces = []
for xd, yd, cls in zip(x_data, y_data, colors):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints=False,
jitter=0.5,
whiskerwidth=0.2,
fillcolor=cls,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
yaxis=dict(
autorange=True,
showgrid=True,
zeroline=True,
dtick=100,
gridcolor='black',
gridwidth=0.1,
zerolinecolor='rgb(255, 255, 255)',
zerolinewidth=2,
),
margin=dict(
l=40,
r=30,
b=80,
t=100,
),
paper_bgcolor='white',
plot_bgcolor='white',
showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)
x_data = ['Cluster 1','Cluster 2','Cluster 3']
cutoff_quantile = 100
field_to_plot = 'recency'
y0 = customer_history_df[customer_history_df['num_cluster3_labels']==0][field_to_plot].values
y0 = y0[y0<np.percentile(y0, cutoff_quantile)]
y1 = customer_history_df[customer_history_df['num_cluster3_labels']==1][field_to_plot].values
y1 = y1[y1<np.percentile(y1, cutoff_quantile)]
y2 = customer_history_df[customer_history_df['num_cluster3_labels']==2][field_to_plot].values
y2 = y2[y2<np.percentile(y2, cutoff_quantile)]
y_data = [y0,y1,y2]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
traces = []
for xd, yd, cls in zip(x_data, y_data, colors):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints=False,
jitter=0.5,
whiskerwidth=0.2,
fillcolor=cls,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
yaxis=dict(
autorange=True,
showgrid=True,
zeroline=True,
dtick=50,
gridcolor='black',
gridwidth=0.1,
zerolinecolor='rgb(255, 255, 255)',
zerolinewidth=2,
),
margin=dict(
l=40,
r=30,
b=80,
t=100,
),
plot_bgcolor='white',
showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)
x_data = ['Cluster 1','Cluster 2','Cluster 3']
cutoff_quantile = 80
field_to_plot = 'amount'
y0 = customer_history_df[customer_history_df['num_cluster3_labels']==0][field_to_plot].values
y0 = y0[y0<np.percentile(y0, cutoff_quantile)]
y1 = customer_history_df[customer_history_df['num_cluster3_labels']==1][field_to_plot].values
y1 = y1[y1<np.percentile(y1, cutoff_quantile)]
y2 = customer_history_df[customer_history_df['num_cluster3_labels']==2][field_to_plot].values
y2 = y2[y2<np.percentile(y2, cutoff_quantile)]
y_data = [y0,y1,y2]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
traces = []
for xd, yd, cls in zip(x_data, y_data, colors):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints=False,
jitter=0.5,
whiskerwidth=0.2,
fillcolor=cls,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
yaxis=dict(
dtick=1000,
)
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)
x_data = ['Cluster 1','Cluster 2','Cluster 3']
cutoff_quantile = 90
field_to_plot = 'frequency'
y0 = customer_history_df[customer_history_df['num_cluster3_labels']==0][field_to_plot].values
y0 = y0[y0<np.percentile(y0, cutoff_quantile)]
y1 = customer_history_df[customer_history_df['num_cluster3_labels']==1][field_to_plot].values
y1 = y1[y1<np.percentile(y1, cutoff_quantile)]
y2 = customer_history_df[customer_history_df['num_cluster3_labels']==2][field_to_plot].values
y2 = y2[y2<np.percentile(y2, cutoff_quantile)]
y_data = [y0,y1,y2]
colors = ['rgba(93, 164, 214, 0.5)', 'rgba(255, 144, 14, 0.5)', 'rgba(44, 160, 101, 0.5)', 'rgba(255, 65, 54, 0.5)', 'rgba(207, 114, 255, 0.5)', 'rgba(127, 96, 0, 0.5)']
traces = []
for xd, yd, cls in zip(x_data, y_data, colors):
traces.append(go.Box(
y=yd,
name=xd,
boxpoints=False,
jitter=0.5,
whiskerwidth=0.2,
fillcolor=cls,
marker=dict(
size=2,
),
line=dict(width=1),
))
layout = go.Layout(
title='Difference in sales {} from cluster to cluster'.format(field_to_plot),
yaxis=dict(
autorange=True,
showgrid=True,
zeroline=True,
dtick=100,
gridcolor='black',
gridwidth=0.1,
zerolinecolor='rgb(255, 255, 255)',
zerolinewidth=2,
),
margin=dict(
l=40,
r=30,
b=80,
t=100,
),
paper_bgcolor='white',
plot_bgcolor='white',
showlegend=False
)
fig = go.Figure(data=traces, layout=layout)
py.offline.iplot(fig)