In [1]:

import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from datetime import datetime, date
plt.style.use('ggplot')

In [2]:

# Loading the Transactions and Customer Demographics Datasets

trans = pd.read_csv('Transactions_Cleaned.csv')
cust = pd.read_csv('CustomerDemographic_Cleaned.csv')

In [3]:

# Fetching first 5 transaction records

trans.head(5)

Out[3]:

	transaction_id	product_id	customer_id	transaction_date	online_order	order_status	brand	product_line	product_class	product_size	list_price	standard_cost	product_first_sold_date	Profit
0	1	2	2950	2017-02-25	0.0	Approved	Solex	Standard	medium	medium	71.49	53.62	41245.0	17.87
1	2	3	3120	2017-05-21	1.0	Approved	Trek Bicycles	Standard	medium	large	2091.47	388.92	41701.0	1702.55
2	3	37	402	2017-10-16	0.0	Approved	OHM Cycles	Standard	low	medium	1793.43	248.82	36361.0	1544.61
3	4	88	3135	2017-08-31	0.0	Approved	Norco Bicycles	Standard	medium	medium	1198.46	381.10	36145.0	817.36
4	5	78	787	2017-10-01	1.0	Approved	Giant Bicycles	Standard	medium	large	1765.30	709.48	42226.0	1055.82

In [4]:

print("Total records (rows) in the Transaction Dataset : {}".format(trans.shape[0]))
print("Total features (columns) in the Transaction Dataset : {}".format(trans.shape[1]))

Total records (rows) in the Transaction Dataset : 19803
Total features (columns) in the Transaction Dataset : 14

In [5]:

# Fetching first 5 Customer Demographics records

cust.head(5)

Out[5]:

	customer_id	first_name	last_name	gender	past_3_years_bike_related_purchases	DOB	job_title	job_industry_category	wealth_segment	deceased_indicator	owns_car	tenure	Age
0	1	Laraine	Medendorp	Female	93	1953-10-12	Executive Secretary	Health	Mass Customer	N	Yes	11.0	67
1	2	Eli	Bockman	Male	81	1980-12-16	Administrative Officer	Financial Services	Mass Customer	N	Yes	16.0	40
2	3	Arlin	Dearle	Male	61	1954-01-20	Recruiting Manager	Property	Mass Customer	N	Yes	15.0	67
3	4	Talbot	None	Male	33	1961-10-03	Missing	IT	Mass Customer	N	No	7.0	59
4	5	Sheila-kathryn	Calton	Female	56	1977-05-13	Senior Editor	Missing	Affluent Customer	N	Yes	8.0	44

In [6]:

print("Total records (rows) in the Customer Demographics Dataset : {}".format(cust.shape[0]))
print("Total features (columns) in the Customer Demographics Dataset : {}".format(cust.shape[1]))

Total records (rows) in the Customer Demographics Dataset : 3912
Total features (columns) in the Customer Demographics Dataset : 13

In [7]:

# Merging both the Transaction Dataset and Customer Demographics Dataset based on customer_id.

merged_trans_cust = pd.merge(trans, cust, left_on='customer_id', right_on='customer_id', how='inner')

In [8]:

# Fetching the first 5 records of the merged dataset.

merged_trans_cust.head(5)

Out[8]:

	transaction_id	product_id	customer_id	transaction_date	online_order	order_status	brand	product_line	product_class	product_size	...	gender	past_3_years_bike_related_purchases	DOB	job_title	job_industry_category	wealth_segment	deceased_indicator	owns_car	tenure	Age
0	1	2	2950	2017-02-25	0.0	Approved	Solex	Standard	medium	medium	...	Male	19	1955-01-11	Software Engineer I	Financial Services	Mass Customer	N	Yes	10.0	66
1	11065	1	2950	2017-10-16	0.0	Approved	Giant Bicycles	Standard	medium	medium	...	Male	19	1955-01-11	Software Engineer I	Financial Services	Mass Customer	N	Yes	10.0	66
2	18923	62	2950	2017-04-26	0.0	Approved	Solex	Standard	medium	medium	...	Male	19	1955-01-11	Software Engineer I	Financial Services	Mass Customer	N	Yes	10.0	66
3	2	3	3120	2017-05-21	1.0	Approved	Trek Bicycles	Standard	medium	large	...	Female	89	1979-02-04	Clinical Specialist	Health	Mass Customer	N	Yes	10.0	42
4	6862	4	3120	2017-10-05	0.0	Approved	Giant Bicycles	Standard	high	medium	...	Female	89	1979-02-04	Clinical Specialist	Health	Mass Customer	N	Yes	10.0	42

5 rows × 26 columns

In [9]:

print("Total records (rows) in the Merged Dataset : {}".format(merged_trans_cust.shape[0]))
print("Total features (columns) in the Merged Dataset : {}".format(merged_trans_cust.shape[1]))

Total records (rows) in the Merged Dataset : 19354
Total features (columns) in the Merged Dataset : 26

In [10]:

merged_trans_cust.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19354 entries, 0 to 19353
Data columns (total 26 columns):
transaction_id                         19354 non-null int64
product_id                             19354 non-null int64
customer_id                            19354 non-null int64
transaction_date                       19354 non-null object
online_order                           19354 non-null float64
order_status                           19354 non-null object
brand                                  19354 non-null object
product_line                           19354 non-null object
product_class                          19354 non-null object
product_size                           19354 non-null object
list_price                             19354 non-null float64
standard_cost                          19354 non-null float64
product_first_sold_date                19354 non-null float64
Profit                                 19354 non-null float64
first_name                             19354 non-null object
last_name                              19354 non-null object
gender                                 19354 non-null object
past_3_years_bike_related_purchases    19354 non-null int64
DOB                                    19354 non-null object
job_title                              19354 non-null object
job_industry_category                  19354 non-null object
wealth_segment                         19354 non-null object
deceased_indicator                     19354 non-null object
owns_car                               19354 non-null object
tenure                                 19354 non-null float64
Age                                    19354 non-null int64
dtypes: float64(6), int64(5), object(15)
memory usage: 4.0+ MB

The data-type of transaction_date column is not in date-time format. Hence the data-type of the column should be changed from object to datetime type.

In [11]:

merged_trans_cust['transaction_date']= pd.to_datetime(merged_trans_cust['transaction_date'])

1. RFM Analysis¶

RFM (Recency, Frequency, Monetary) analysis is a behavior-based approach grouping customers into segments. It groups the customers on the basis of their previous purchase transactions. How recently, how often, and how much did a customer buy. RFM filters customers into various groups for the purpose of better service. There is a segment of customer who is the big spender but what if they purchased only once or how recently they purchased? Do they often purchase our product? Also, It helps managers to run an effective promotional campaign for personalized service.

Recency (R): Who have purchased recently? Number of days since last purchase (least recency)
Frequency (F): Who has purchased frequently? It means the total number of purchases. ( high frequency)
Monetary Value(M): Who have high purchase amount? It means the total money customer spent (high monetary value)

In [12]:

# Maximum Transaction Date or the latest transaction date.

max_trans_date = max(merged_trans_cust['transaction_date']).date()
max_trans_date

Out[12]:

datetime.date(2017, 12, 30)

In [13]:

# Taking the last transaction date as a reference date for comparision and 
# finding the number of days between a transaction date and last transaction date to compute the recency.

comparison_date = datetime.strptime(str(max_trans_date), "%Y-%m-%d")

In [14]:

# Creating a RFM table that will contain all the values for recency , frequency and Monetray data. 

rfm_table = merged_trans_cust.groupby(['customer_id']).agg({'transaction_date': lambda date : (comparison_date - date.max()).days,
                                                            'product_id' : lambda prod_id : len(prod_id), 
                                                            'Profit' : lambda p : sum(p)})

In [15]:

# The columns in rfm_table dataframe are not properly named. Renaming of the columns to appropiate name is needed

rfm_table.columns

Out[15]:

Index(['transaction_date', 'product_id', 'Profit'], dtype='object')

In [16]:

# Renaming column names to appropiate names

rfm_table.rename(columns={'transaction_date' : 'recency', 
                        'product_id' : 'frequency',
                        'Profit' : 'monetary'} , inplace=True)

In [17]:

# Dividing the recency, frequency and monetary into 4 quartiles (min, 25%, 50%, 75% and max).
# These values will help us to calculate RFM score for a customer and classify based on their RFM score.

rfm_table['r_quartile'] = pd.qcut(rfm_table['recency'], 4, ['4','3','2','1'])
rfm_table['f_quartile'] = pd.qcut(rfm_table['frequency'], 4, ['1','2','3','4'])
rfm_table['m_quartile'] = pd.qcut(rfm_table['monetary'], 4, ['1','2','3','4'])

In [18]:

# RFM_table dataset

rfm_table

Out[18]:

	recency	frequency	monetary	r_quartile	f_quartile	m_quartile
customer_id
1	7	11	3018.09	4	4	3
2	128	3	2226.26	1	1	2
3	102	8	3362.81	1	4	3
4	195	2	220.57	1	1	1
5	16	6	2394.94	4	2	2
6	64	5	3946.55	2	2	3
7	253	3	220.11	1	1	1
8	22	10	7066.94	3	4	4
9	78	6	2353.11	2	2	2
10	43	5	3358.28	3	2	3
11	46	6	3638.84	2	2	3
12	67	7	3540.03	2	3	3
13	27	7	4337.38	3	3	4
14	47	3	1713.90	2	1	1
15	35	6	1728.39	3	2	1
16	99	5	4521.84	1	2	4
17	0	5	2015.61	4	2	2
18	134	7	3543.38	1	3	3
19	102	3	2951.79	1	1	3
20	31	4	3608.28	3	1	3
21	6	5	4229.41	4	2	4
22	97	8	5159.84	1	4	4
23	10	6	4376.15	4	2	4
24	17	7	3689.35	4	3	3
25	5	12	5333.66	4	4	4
26	259	2	268.24	1	1	1
27	143	7	3274.25	1	3	3
28	83	6	3366.56	2	2	3
29	22	9	6175.30	3	4	4
30	11	3	2633.95	4	1	2
...	...	...	...	...	...	...
3470	2	8	7228.80	4	4	4
3471	148	4	1914.10	1	1	2
3472	5	6	2779.58	4	2	2
3474	71	5	2358.99	2	2	2
3475	3	4	2515.14	4	1	2
3476	50	5	1152.36	2	2	1
3477	64	8	4401.92	2	4	4
3478	29	6	4297.85	3	2	4
3479	31	6	2265.96	3	2	2
3480	16	4	1770.26	4	1	1
3481	54	7	2438.45	2	3	2
3482	27	8	5549.59	3	4	4
3483	117	6	2193.04	1	2	2
3484	79	7	5924.55	2	3	4
3485	25	3	2491.47	3	1	2
3486	9	4	2972.34	4	1	3
3487	10	3	1837.75	4	1	2
3488	17	2	815.95	4	1	1
3489	108	6	2644.44	1	2	2
3490	166	5	2379.57	1	2	2
3491	189	4	1430.28	1	1	1
3492	80	3	2193.81	2	1	2
3493	93	6	3728.88	1	2	3
3494	4	4	2755.11	4	1	2
3495	13	7	3847.65	4	3	3
3496	256	4	2045.84	1	1	2
3497	52	3	1648.32	2	1	1
3498	127	6	3147.33	1	2	3
3499	51	7	4955.25	2	3	4
3500	144	6	1785.86	1	2	1

3416 rows × 6 columns

In [19]:

# Caluclation of RFM Score.
# Max weightage is given to recency then frequency and then  monetary.

rfm_table['rfm_score'] = 100*rfm_table['r_quartile'].astype(int)+10*rfm_table['f_quartile'].astype(int)+rfm_table['m_quartile'].astype(int)

In [20]:

# Assigning a title to a cuustomer.
# Platinum corresponds to highest range of RFM score down to Bronze to lowest range of RFM score.

rfm_table['customer_title'] = pd.qcut(rfm_table['rfm_score'], 4, ['Bronze','Silver','Gold','Platinum'])

In [21]:

# RFM table dataset

rfm_table

Out[21]:

	recency	frequency	monetary	r_quartile	f_quartile	m_quartile	rfm_score	customer_title
customer_id
1	7	11	3018.09	4	4	3	443	Platinum
2	128	3	2226.26	1	1	2	112	Bronze
3	102	8	3362.81	1	4	3	143	Bronze
4	195	2	220.57	1	1	1	111	Bronze
5	16	6	2394.94	4	2	2	422	Platinum
6	64	5	3946.55	2	2	3	223	Silver
7	253	3	220.11	1	1	1	111	Bronze
8	22	10	7066.94	3	4	4	344	Gold
9	78	6	2353.11	2	2	2	222	Silver
10	43	5	3358.28	3	2	3	323	Gold
11	46	6	3638.84	2	2	3	223	Silver
12	67	7	3540.03	2	3	3	233	Silver
13	27	7	4337.38	3	3	4	334	Gold
14	47	3	1713.90	2	1	1	211	Bronze
15	35	6	1728.39	3	2	1	321	Gold
16	99	5	4521.84	1	2	4	124	Bronze
17	0	5	2015.61	4	2	2	422	Platinum
18	134	7	3543.38	1	3	3	133	Bronze
19	102	3	2951.79	1	1	3	113	Bronze
20	31	4	3608.28	3	1	3	313	Gold
21	6	5	4229.41	4	2	4	424	Platinum
22	97	8	5159.84	1	4	4	144	Bronze
23	10	6	4376.15	4	2	4	424	Platinum
24	17	7	3689.35	4	3	3	433	Platinum
25	5	12	5333.66	4	4	4	444	Platinum
26	259	2	268.24	1	1	1	111	Bronze
27	143	7	3274.25	1	3	3	133	Bronze
28	83	6	3366.56	2	2	3	223	Silver
29	22	9	6175.30	3	4	4	344	Gold
30	11	3	2633.95	4	1	2	412	Platinum
...	...	...	...	...	...	...	...	...
3470	2	8	7228.80	4	4	4	444	Platinum
3471	148	4	1914.10	1	1	2	112	Bronze
3472	5	6	2779.58	4	2	2	422	Platinum
3474	71	5	2358.99	2	2	2	222	Silver
3475	3	4	2515.14	4	1	2	412	Platinum
3476	50	5	1152.36	2	2	1	221	Silver
3477	64	8	4401.92	2	4	4	244	Silver
3478	29	6	4297.85	3	2	4	324	Gold
3479	31	6	2265.96	3	2	2	322	Gold
3480	16	4	1770.26	4	1	1	411	Gold
3481	54	7	2438.45	2	3	2	232	Silver
3482	27	8	5549.59	3	4	4	344	Gold
3483	117	6	2193.04	1	2	2	122	Bronze
3484	79	7	5924.55	2	3	4	234	Silver
3485	25	3	2491.47	3	1	2	312	Gold
3486	9	4	2972.34	4	1	3	413	Platinum
3487	10	3	1837.75	4	1	2	412	Platinum
3488	17	2	815.95	4	1	1	411	Gold
3489	108	6	2644.44	1	2	2	122	Bronze
3490	166	5	2379.57	1	2	2	122	Bronze
3491	189	4	1430.28	1	1	1	111	Bronze
3492	80	3	2193.81	2	1	2	212	Silver
3493	93	6	3728.88	1	2	3	123	Bronze
3494	4	4	2755.11	4	1	2	412	Platinum
3495	13	7	3847.65	4	3	3	433	Platinum
3496	256	4	2045.84	1	1	2	112	Bronze
3497	52	3	1648.32	2	1	1	211	Bronze
3498	127	6	3147.33	1	2	3	123	Bronze
3499	51	7	4955.25	2	3	4	234	Silver
3500	144	6	1785.86	1	2	1	121	Bronze

3416 rows × 8 columns

Merging both RFM Table with Transaction and Customer Tables¶

The RFM_Table dataframe is merged with the Transactions and Customer Demographics datasets, to gain depper insights of Customer Segemnts along with transactions. The dataframes are joined based on customer_ids from both the datasets

In [22]:

cust_trans_rfm = pd.merge(merged_trans_cust, rfm_table, left_on='customer_id', right_on='customer_id', how='inner')

In [23]:

cust_trans_rfm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19354 entries, 0 to 19353
Data columns (total 34 columns):
transaction_id                         19354 non-null int64
product_id                             19354 non-null int64
customer_id                            19354 non-null int64
transaction_date                       19354 non-null datetime64[ns]
online_order                           19354 non-null float64
order_status                           19354 non-null object
brand                                  19354 non-null object
product_line                           19354 non-null object
product_class                          19354 non-null object
product_size                           19354 non-null object
list_price                             19354 non-null float64
standard_cost                          19354 non-null float64
product_first_sold_date                19354 non-null float64
Profit                                 19354 non-null float64
first_name                             19354 non-null object
last_name                              19354 non-null object
gender                                 19354 non-null object
past_3_years_bike_related_purchases    19354 non-null int64
DOB                                    19354 non-null object
job_title                              19354 non-null object
job_industry_category                  19354 non-null object
wealth_segment                         19354 non-null object
deceased_indicator                     19354 non-null object
owns_car                               19354 non-null object
tenure                                 19354 non-null float64
Age                                    19354 non-null int64
recency                                19354 non-null int64
frequency                              19354 non-null int64
monetary                               19354 non-null float64
r_quartile                             19354 non-null category
f_quartile                             19354 non-null category
m_quartile                             19354 non-null category
rfm_score                              19354 non-null int32
customer_title                         19354 non-null category
dtypes: category(4), datetime64[ns](1), float64(7), int32(1), int64(7), object(14)
memory usage: 4.6+ MB

The datatypes of the columns looks fine.

Creating an Age Group Feature¶

In [24]:

cust_trans_rfm['Age_Group'] = cust_trans_rfm['Age'].apply(lambda x : (math.floor(x/10)+1)*10)

Creating a Detail Customer title / tag based on RFM Score¶

An extended version of customer title is made which divides the entire number of customers into 11 groups. The groups are mainly Platinum Customers, Very Loyal, Becoming Loyal, Recent Customers, Potential Customers, Late Bloomer, Loosing Customers, High Risk Customers, Almost Lost Customers, Evasive Customers and Lost Customers.
The demarkation of customers into the above mentioned groups is based on their RFM scores.

In [25]:

# Function as a lookup to appropiate customer titles based on RFM score.

def cust_score_title_lkup(cols):
    
    rfm_score = cols[0]
    
    if rfm_score >= 444:
        return 'Platinum Customer'
    elif rfm_score >=433 and rfm_score < 444:
        return 'Very Loyal'
    elif rfm_score >=421 and rfm_score < 433:
        return 'Becoming Loyal'
    elif rfm_score >=344 and rfm_score < 421:
        return 'Recent Customer'
    elif rfm_score >=323 and rfm_score < 344:
        return 'Potential Customer'
    elif rfm_score >=311 and rfm_score < 323:
        return 'Late Bloomer'
    elif rfm_score >=224 and rfm_score < 311:
        return 'Loosing Customer'
    elif rfm_score >=212 and rfm_score < 224:
        return 'High Risk Customer'
    elif rfm_score >=124 and rfm_score < 212:
        return 'Almost Lost Customer'
    elif rfm_score >=112 and rfm_score < 124:
        return 'Evasive Customer'
    else :
        return 'Lost Customer'

In [26]:

# Applying the above functions and creating a new feature detail_cust_title

cust_trans_rfm['detail_cust_title']=cust_trans_rfm[['rfm_score']].apply(cust_score_title_lkup, axis=1)

In [27]:

# Function to provide ranks to the customers based on their title.

def get_rank(cols):
    
    title = cols[0]
    
    if title=='Platinum Customer':
        return 1
    elif title=='Very Loyal':
        return 2
    elif title == 'Becoming Loyal':
        return 3
    elif title == 'Recent Customer':
        return 4
    elif title=='Potential Customer':
        return 5
    elif title == 'Late Bloomer':
        return 6
    elif title == 'Loosing Customer':
        return 7
    elif title=='High Risk Customer':
        return 8
    elif title == 'Almost Lost Customer':
        return 9
    elif title == 'Evasive Customer':
        return 10
    else :
        return 11
        

In [28]:

# Applying the above functions and creating a new feature rank

cust_trans_rfm['rank']=cust_trans_rfm[['detail_cust_title']].apply(get_rank, axis=1)

2. Exporting to CSV File¶

After performing data quality assessment(DQA), data cleaning and RFM Analysis on the dataset, it's time to export the dataset to a csv file for further exploratory data analysis (EDA) and this data will drive the Sales Customer Segmenation Dashboard developed in Tableau.

In [29]:

cust_trans_rfm.to_csv('Customer_Trans_RFM_Analysis.csv', index=False)

In [30]:

print("Total records in final dataset : {}".format(cust_trans_rfm.shape[0]))

Total records in final dataset : 19354

3. Data Analysis and Exploration¶

3.1. New Customer vs Old Customer Age Distributions¶

Most New / Old Customers are aged between 40-49. The lowest age groups are under 20 and 80+ for both Old and New Customers dataset.
Among the New Customers the most populated age bracket is 20-29 and 60-69, while the maximum Old Customers are from the age bracket 50-69.
There is a steep drop in number of customers in 30-39 age groupsd among the New Customers.

In [31]:

# Loading the New Customers Dataset

new_cust = pd.read_csv('NewCustomerList_Cleaned.csv')

In [42]:

plt.figure(figsize=(10,8))
sns.distplot(new_cust['Age Group'], kde=False, bins=15)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('New Customers - Age Distribution')

plt.show()

Here 20 = under 20, 30 = 20-29 age bracket

In [43]:

plt.figure(figsize=(10,8))
sns.distplot(cust_trans_rfm['Age_Group'], kde=False, bins=15)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('Old Customers - Age Distribution')

plt.show()

Here 20 = under 20, 30 = 20-29 age bracket

Over the last 3 years approximately 51% of the buyers are women and 49% were male buyers.
Female purchases are approximately 10,000 more than male (numerically). Gender wise majority of the bike sales comes from female customers.

In [45]:

cust_bike_purchase_by_gender = cust_trans_rfm.groupby('gender').agg({'past_3_years_bike_related_purchases' : sum}
                                                                 ).reset_index()

In [47]:

total_records = cust_trans_rfm['past_3_years_bike_related_purchases'].sum()

In [48]:

cust_bike_purchase_by_gender['Percent_of_total'] = (cust_bike_purchase_by_gender['past_3_years_bike_related_purchases']
                                                        /total_records)*100

In [49]:

cust_bike_purchase_by_gender

Out[49]:

	gender	past_3_years_bike_related_purchases	Percent_of_total
0	Female	478488	50.503731
1	Male	468943	49.496269

In [53]:

plt.figure(figsize=(8,5))
sns.barplot(x='gender',y='Percent_of_total',data=cust_bike_purchase_by_gender)
plt.xlabel('Gender')
plt.ylabel('Percent of Total Purchases')
plt.title('Female vs Male past 3 years Bike purchases')

plt.show()

3.3 Job Industry Customer Distribution¶

Among the New Customers the highest amount of sales comes from customers having a job in Manufacturing and Financial services sector. The samllest chunk of sales comes from customers in Agriculture sector and from Telecom sector with 3% sales only. Similar trend is observed among Old Customers.

In [59]:

plt.figure(figsize=(15,8))
sns.countplot(x='job_industry_category',data=new_cust[~(new_cust['job_industry_category']=='Missing')])
plt.xlabel('Job Industry')
plt.ylabel('Number of Customers')
plt.title('New Customers - Job Industry Customer Distribution')

plt.show()

In [60]:

plt.figure(figsize=(15,8))
sns.countplot(x='job_industry_category',data=cust_trans_rfm[~(cust_trans_rfm['job_industry_category']=='Missing')])
plt.xlabel('Job Industry')
plt.ylabel('Number of Customers')
plt.title('Old Customers - Job Industry Customer Distribution')

plt.show()

3.4. Wealth Segmentation by Age Group¶

Across all Age Groups the maximum number of customers are classified as 'Mass Customers'. The next being 'High Net Worth'. However among 40-49 aged customers 'Affluent Customers' outperforms the 'High Net Worth' customers.

New Customers

In [61]:

wealth_age_seg_new = new_cust.groupby(['wealth_segment', 'Age Group']).size().reset_index()

In [62]:

wealth_age_seg_new.rename(columns={0:'Number of Customers'}, inplace=True)
wealth_age_seg_new

Out[62]:

	wealth_segment	Age Group	Number of Customers
0	Affluent Customer	20	3
1	Affluent Customer	30	49
2	Affluent Customer	40	15
3	Affluent Customer	50	58
4	Affluent Customer	60	40
5	Affluent Customer	70	41
6	Affluent Customer	80	20
7	Affluent Customer	90	9
8	High Net Worth	30	42
9	High Net Worth	40	34
10	High Net Worth	50	53
11	High Net Worth	60	36
12	High Net Worth	70	50
13	High Net Worth	80	24
14	High Net Worth	90	10
15	Mass Customer	20	5
16	Mass Customer	30	74
17	Mass Customer	40	52
18	Mass Customer	50	121
19	Mass Customer	60	93
20	Mass Customer	70	81
21	Mass Customer	80	51
22	Mass Customer	90	22

In [66]:

plt.figure(figsize=(15,8))
sns.barplot(x='Age Group', y='Number of Customers' , hue='wealth_segment', data=wealth_age_seg_new)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('New Customers - Wealth Segmentation by Age Group')
plt.show()

In all the age groups the most number of customers are 'Mass Customers'. The 2nd largest customer base being the 'High Net Worth' group.
In the age group 40-49 the 'Affluent Customer' group outperforms 'High Net Worth' group

Old Customers

Similar treand (like that of New Customers) is observed among Old Customers

In [64]:

wealth_age_seg_old = cust_trans_rfm.groupby(['wealth_segment', 'Age_Group']).size().reset_index()

In [65]:

wealth_age_seg_old.rename(columns={0:'Number of Customers'}, inplace=True)

In [67]:

plt.figure(figsize=(15,8))
sns.barplot(x='Age_Group', y='Number of Customers' , hue='wealth_segment', data=wealth_age_seg_old)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('Old Customers - Wealth Segmentation by Age Group')
plt.show()

3.5. Car owner across each State¶

The customer base of the automobile bike company lies in New South Wales, Queensland and Victoria, Australia.
In New South Wales (NSW) it seems there is a greater amount of people who donot own a car. In Victoria (VIC) the proportion is evenly split. However in Queensland (QLD) there are relatively more people who own the car.

In [68]:

# Loading the Customer Address Dataset.

cust_addr_info = pd.read_csv('CustomerAddress_Cleaned.csv')

In [69]:

# Merging the RFM data with Customer Address dataset.

cust_trans_addr = pd.merge(cust_trans_rfm , cust_addr_info, left_on = 'customer_id' , 
                           right_on = 'customer_id', how='inner')

In [70]:

print("RFM table Records count : {}\nAddress Table Records count :{}".format(cust_trans_rfm.shape[0] ,cust_addr_info.shape[0]))

RFM table Records count : 19354
Address Table Records count :3999

In [72]:

state_car_owners = cust_trans_addr[['state' , 'owns_car' , 'customer_id']].drop_duplicates().groupby(['state', 'owns_car']).size().reset_index()

In [73]:

state_car_owners.rename(columns={0:'Number of Customers'}, inplace=True)
state_car_owners

Out[73]:

	state	owns_car	Number of Customers
0	NSW	No	889
1	NSW	Yes	935
2	QLD	No	365
3	QLD	Yes	363
4	VIC	No	435
5	VIC	Yes	425

In [74]:

plt.figure(figsize=(8,7))
sns.barplot(x='state', y='Number of Customers' , hue='owns_car', data=state_car_owners)
plt.xlabel('States')
plt.ylabel('Number of Customers')
plt.title('Number of Customers who own a car')
plt.show()

NSW has the largest number of people that donot own a car. It seems that a higher amount of data is collected from NSW compared to other states. In QLD the distribution between customers having a car or not having is even. In Victoria the number is split evenly. Both the numbers are significantly lower than that of NSW

4. RFM Analysis Scatter Plots¶

4.1. Recency vs Monetary¶

The chart shows that customers who purchased recently generated more revenue than customers who visited long time ago. Customers from recent past (50-100) days generated a moderate revenue. Customers who visited 200 days ago generated a low revenue.

In [76]:

plt.figure(figsize=(8,7))
cust_trans_rfm.plot.scatter(x='recency' , y='monetary')
plt.xlabel('Recency')
plt.ylabel('Monetary ($)')
plt.title('Recency vs Monetary')
plt.show()

<Figure size 576x504 with 0 Axes>

4.2 Frequency vs Monetary¶

Customers classified as "Platinum Custoers" , "Very Loyal" and "Becoming Loyal" visit frequently, which correlated with increased revenue for the business.

In [78]:

plt.figure(figsize=(8,7))
cust_trans_rfm.plot.scatter(x='frequency' , y='monetary')
plt.xlabel('Frequency')
plt.ylabel('Monetary ($)')
plt.title('Frequency vs Monetary')
plt.show()

<Figure size 576x504 with 0 Axes>

5. Customer Segment Distribution¶

Finally we can plot the Number of Customers present under a Customer Segment.

In [79]:

# Calculating the number of unique customers under a customer title.

cust_per_title = cust_trans_rfm[['detail_cust_title', 'customer_id','rank']].drop_duplicates().groupby(
    ['detail_cust_title','rank']).size().reset_index().sort_values('rank')

In [81]:

cust_per_title.rename(columns={0:'Number of Customers'}, inplace=True)
cust_per_title

Out[81]:

	detail_cust_title	rank	Number of Customers
7	Platinum Customer	1	164
10	Very Loyal	2	181
1	Becoming Loyal	3	344
9	Recent Customer	4	357
8	Potential Customer	5	340
4	Late Bloomer	6	332
5	Loosing Customer	7	333
3	High Risk Customer	8	371
0	Almost Lost Customer	9	315
2	Evasive Customer	10	388
6	Lost Customer	11	291

In [84]:

# Plotting the Number of Customers

plt.figure(figsize=(15,8))
sns.barplot(y='detail_cust_title' , x='Number of Customers', data=cust_per_title)
plt.xlabel('Number of Customers')
plt.ylabel('Customer Segment')
plt.title('Number of Customers by Customer Segment')
plt.show()