#!/usr/bin/env python # coding: utf-8 # # Load Dependencies and Configuration Settings # In[1]: import pandas as pd import datetime import math import numpy as np import matplotlib.pyplot as plt import matplotlib.mlab as mlab get_ipython().run_line_magic('matplotlib', 'inline') # # Load and View the Dataset # In[2]: cs_df = pd.read_excel(io=r'Online Retail.xlsx') # In[3]: cs_df.head() # Transactions size # In[4]: cs_df.shape # # Top Sales by Country # In[5]: cs_df.Country.value_counts().reset_index().head(n=10) # # Top Customers contributing to 10% of total Sales # Number of customers # In[6]: cs_df.CustomerID.unique().shape # In[7]: (cs_df.CustomerID.value_counts()/sum(cs_df.CustomerID.value_counts())*100).head(n=13).cumsum() # # Analyzing Data Quality Issues # Number of unique items # In[8]: cs_df.StockCode.unique().shape # Description of items: We see that the descriptions are more then the stock code so there must be some stock code which have more than one decription # In[9]: cs_df.Description.unique().shape # In[10]: cs_df.dtypes # In[11]: cat_des_df = cs_df.groupby(["StockCode","Description"]).count().reset_index() # Stockcode which have more than one description # In[12]: cat_des_df.StockCode.value_counts()[cat_des_df.StockCode.value_counts()>1].reset_index().head() # Example of one such stockcode # In[14]: cs_df[cs_df['StockCode'] == cat_des_df.StockCode.value_counts()[cat_des_df.StockCode.value_counts()>1] .reset_index()['index'][5]]['Description'].unique() # In[15]: cs_df['invdatetime'] = pd.to_datetime(cs_df.InvoiceDate) # In[16]: cs_df.Quantity.describe() # In[17]: cs_df.UnitPrice.describe() # # Data Cleaning # In[18]: # Seperate data for one geography cs_df = cs_df[cs_df.Country == 'United Kingdom'] # Seperate attribute for total amount cs_df['amount'] = cs_df.Quantity*cs_df.UnitPrice # Remove negative or return transactions cs_df = cs_df[~(cs_df.amount<0)] cs_df.head() cs_df = cs_df[~(cs_df.CustomerID.isnull())] # In[19]: cs_df.shape # # Build Recency Feature # In[20]: cs_df.InvoiceDate.max() # In[21]: cs_df.InvoiceDate.min() # In[22]: refrence_date = cs_df.InvoiceDate.max() refrence_date = refrence_date + datetime.timedelta(days = 1) # In[23]: cs_df['days_since_last_purchase'] = refrence_date - cs_df.InvoiceDate cs_df['days_since_last_purchase_num'] = cs_df['days_since_last_purchase'].astype('timedelta64[D]') # Time period of transactions # In[24]: customer_history_df = cs_df.groupby("CustomerID").min().reset_index()[['CustomerID', 'days_since_last_purchase_num']] customer_history_df.rename(columns={'days_since_last_purchase_num':'recency'}, inplace=True) customer_history_df.recency.describe() # In[25]: customer_history_df.head() # In[26]: customer_history_df.recency.describe() # In[27]: x = customer_history_df.recency mu = np.mean(customer_history_df.recency) sigma = math.sqrt(np.var(customer_history_df.recency)) n, bins, patches = plt.hist(x, 1000, facecolor='green', alpha=0.75) # add a 'best fit' line y = mlab.normpdf(bins, mu, sigma) l = plt.plot(bins, y, 'r--', linewidth=2) plt.xlabel('Recency in days') plt.ylabel('Number of transactions') plt.title(r'$\mathrm{Histogram\ of\ sales\ recency}\ $') plt.grid(True) # # Build Frequency & Monetary value Features # In[28]: customer_monetary_val = cs_df[['CustomerID', 'amount']].groupby("CustomerID").sum().reset_index() customer_history_df = customer_history_df.merge(customer_monetary_val, how='outer') customer_history_df.amount = customer_history_df.amount+0.001 customer_freq = cs_df[['CustomerID', 'amount']].groupby("CustomerID").count().reset_index() customer_freq.rename(columns={'amount':'frequency'},inplace=True) customer_history_df = customer_history_df.merge(customer_freq, how='outer') # Remove returns so that we only have purchases of a customer # In[29]: customer_history_df.head() # In[30]: from sklearn import preprocessing import math customer_history_df['recency_log'] = customer_history_df['recency'].apply(math.log) customer_history_df['frequency_log'] = customer_history_df['frequency'].apply(math.log) customer_history_df['amount_log'] = customer_history_df['amount'].apply(math.log) feature_vector = ['amount_log', 'recency_log','frequency_log'] X_subset = customer_history_df[feature_vector].as_matrix() scaler = preprocessing.StandardScaler().fit(X_subset) X_scaled = scaler.transform(X_subset) # # Visualizing Recency vs Monetary Value (scaled) # In[31]: plt.scatter(customer_history_df.recency_log, customer_history_df.amount_log, alpha=0.5) # # Visualizing Monetary Value distribution (scaled) # In[32]: x = customer_history_df.amount_log n, bins, patches = plt.hist(x, 1000, facecolor='green', alpha=0.75) plt.xlabel('Log of Sales Amount') plt.ylabel('Probability') plt.title(r'$\mathrm{Histogram\ of\ Log\ transformed\ Customer\ Monetary\ value}\ $') plt.grid(True) #plt.show() # In[33]: customer_history_df.head() # In[34]: from mpl_toolkits.mplot3d import Axes3D fig = plt.figure(figsize=(8, 6)) ax = fig.add_subplot(111, projection='3d') xs =customer_history_df.recency_log ys = customer_history_df.frequency_log zs = customer_history_df.amount_log ax.scatter(xs, ys, zs, s=5) ax.set_xlabel('Recency') ax.set_ylabel('Frequency') ax.set_zlabel('Monetary') #plt.show() # # Analyze Customer Segments with Clustering # In[34]: from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples, silhouette_score import matplotlib.cm as cm X = X_scaled cluster_centers = dict() for n_clusters in range(3,6,2): fig, (ax1, ax2) = plt.subplots(1, 2) #ax2 = plt.subplot(111, projection='3d') fig.set_size_inches(18, 7) ax1.set_xlim([-0.1, 1]) ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) clusterer = KMeans(n_clusters=n_clusters, random_state=10) cluster_labels = clusterer.fit_predict(X) silhouette_avg = silhouette_score(X, cluster_labels) cluster_centers.update({n_clusters :{ 'cluster_center':clusterer.cluster_centers_, 'silhouette_score':silhouette_avg, 'labels':cluster_labels} }) sample_silhouette_values = silhouette_samples(X, cluster_labels) y_lower = 10 for i in range(n_clusters): ith_cluster_silhouette_values = \ sample_silhouette_values[cluster_labels == i] ith_cluster_silhouette_values.sort() size_cluster_i = ith_cluster_silhouette_values.shape[0] y_upper = y_lower + size_cluster_i color = cm.spectral(float(i) / n_clusters) ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7) ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) y_lower = y_upper + 10 # 10 for the 0 samples ax1.set_title("The silhouette plot for the various clusters.") ax1.set_xlabel("The silhouette coefficient values") ax1.set_ylabel("Cluster label") ax1.axvline(x=silhouette_avg, color="red", linestyle="--") ax1.set_yticks([]) ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) colors = cm.spectral(cluster_labels.astype(float) / n_clusters) feature1 = 0 feature2 = 2 ax2.scatter(X[:, feature1], X[:, feature2], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k') centers = clusterer.cluster_centers_ ax2.scatter(centers[:, feature1], centers[:, feature2], marker='o', c="white", alpha=1, s=200, edgecolor='k') for i, c in enumerate(centers): ax2.scatter(c[feature1], c[feature2], marker='$%d$' % i, alpha=1, s=50, edgecolor='k') ax2.set_title("The visualization of the clustered data.") ax2.set_xlabel("Feature space for the 1st feature i.e. monetary value") ax2.set_ylabel("Feature space for the 2nd feature i.e. frequency") plt.suptitle(("Silhouette analysis for KMeans clustering on sample data " "with n_clusters = %d" % n_clusters), fontsize=14, fontweight='bold') #plt.show() # In[35]: for i in range(3,6,2): print("for {} number of clusters".format(i)) cent_transformed = scaler.inverse_transform(cluster_centers[i]['cluster_center']) print(pd.DataFrame(np.exp(cent_transformed),columns=feature_vector)) print("Silhouette score for cluster {} is {}". format(i, cluster_centers[i]['silhouette_score'])) print() # # Assign Cluster Labels # In[36]: labels = cluster_centers[5]['labels'] customer_history_df['num_cluster5_labels'] = labels labels = cluster_centers[3]['labels'] customer_history_df['num_cluster3_labels'] = labels # In[37]: customer_history_df.head() # # Visualize Segments # In[38]: import plotly as py import plotly.graph_objs as go py.offline.init_notebook_mode() x_data = ['Cluster 1','Cluster 2','Cluster 3','Cluster 4', 'Cluster 5'] cutoff_quantile = 100 field_to_plot = 'recency' y0 = customer_history_df[customer_history_df['num_cluster5_labels']==0][field_to_plot].values y0 = y0[y0