import pandas as pd
import math
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime, date
plt.style.use('ggplot')
# Loading the Transactions and Customer Demographics Datasets
trans = pd.read_csv('Transactions_Cleaned.csv')
cust = pd.read_csv('CustomerDemographic_Cleaned.csv')
# Fetching first 5 transaction records
trans.head(5)
transaction_id | product_id | customer_id | transaction_date | online_order | order_status | brand | product_line | product_class | product_size | list_price | standard_cost | product_first_sold_date | Profit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 2950 | 2017-02-25 | 0.0 | Approved | Solex | Standard | medium | medium | 71.49 | 53.62 | 41245.0 | 17.87 |
1 | 2 | 3 | 3120 | 2017-05-21 | 1.0 | Approved | Trek Bicycles | Standard | medium | large | 2091.47 | 388.92 | 41701.0 | 1702.55 |
2 | 3 | 37 | 402 | 2017-10-16 | 0.0 | Approved | OHM Cycles | Standard | low | medium | 1793.43 | 248.82 | 36361.0 | 1544.61 |
3 | 4 | 88 | 3135 | 2017-08-31 | 0.0 | Approved | Norco Bicycles | Standard | medium | medium | 1198.46 | 381.10 | 36145.0 | 817.36 |
4 | 5 | 78 | 787 | 2017-10-01 | 1.0 | Approved | Giant Bicycles | Standard | medium | large | 1765.30 | 709.48 | 42226.0 | 1055.82 |
print("Total records (rows) in the Transaction Dataset : {}".format(trans.shape[0]))
print("Total features (columns) in the Transaction Dataset : {}".format(trans.shape[1]))
Total records (rows) in the Transaction Dataset : 19803 Total features (columns) in the Transaction Dataset : 14
# Fetching first 5 Customer Demographics records
cust.head(5)
customer_id | first_name | last_name | gender | past_3_years_bike_related_purchases | DOB | job_title | job_industry_category | wealth_segment | deceased_indicator | owns_car | tenure | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Laraine | Medendorp | Female | 93 | 1953-10-12 | Executive Secretary | Health | Mass Customer | N | Yes | 11.0 | 67 |
1 | 2 | Eli | Bockman | Male | 81 | 1980-12-16 | Administrative Officer | Financial Services | Mass Customer | N | Yes | 16.0 | 40 |
2 | 3 | Arlin | Dearle | Male | 61 | 1954-01-20 | Recruiting Manager | Property | Mass Customer | N | Yes | 15.0 | 67 |
3 | 4 | Talbot | None | Male | 33 | 1961-10-03 | Missing | IT | Mass Customer | N | No | 7.0 | 59 |
4 | 5 | Sheila-kathryn | Calton | Female | 56 | 1977-05-13 | Senior Editor | Missing | Affluent Customer | N | Yes | 8.0 | 44 |
print("Total records (rows) in the Customer Demographics Dataset : {}".format(cust.shape[0]))
print("Total features (columns) in the Customer Demographics Dataset : {}".format(cust.shape[1]))
Total records (rows) in the Customer Demographics Dataset : 3912 Total features (columns) in the Customer Demographics Dataset : 13
# Merging both the Transaction Dataset and Customer Demographics Dataset based on customer_id.
merged_trans_cust = pd.merge(trans, cust, left_on='customer_id', right_on='customer_id', how='inner')
# Fetching the first 5 records of the merged dataset.
merged_trans_cust.head(5)
transaction_id | product_id | customer_id | transaction_date | online_order | order_status | brand | product_line | product_class | product_size | ... | gender | past_3_years_bike_related_purchases | DOB | job_title | job_industry_category | wealth_segment | deceased_indicator | owns_car | tenure | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | 2950 | 2017-02-25 | 0.0 | Approved | Solex | Standard | medium | medium | ... | Male | 19 | 1955-01-11 | Software Engineer I | Financial Services | Mass Customer | N | Yes | 10.0 | 66 |
1 | 11065 | 1 | 2950 | 2017-10-16 | 0.0 | Approved | Giant Bicycles | Standard | medium | medium | ... | Male | 19 | 1955-01-11 | Software Engineer I | Financial Services | Mass Customer | N | Yes | 10.0 | 66 |
2 | 18923 | 62 | 2950 | 2017-04-26 | 0.0 | Approved | Solex | Standard | medium | medium | ... | Male | 19 | 1955-01-11 | Software Engineer I | Financial Services | Mass Customer | N | Yes | 10.0 | 66 |
3 | 2 | 3 | 3120 | 2017-05-21 | 1.0 | Approved | Trek Bicycles | Standard | medium | large | ... | Female | 89 | 1979-02-04 | Clinical Specialist | Health | Mass Customer | N | Yes | 10.0 | 42 |
4 | 6862 | 4 | 3120 | 2017-10-05 | 0.0 | Approved | Giant Bicycles | Standard | high | medium | ... | Female | 89 | 1979-02-04 | Clinical Specialist | Health | Mass Customer | N | Yes | 10.0 | 42 |
5 rows × 26 columns
print("Total records (rows) in the Merged Dataset : {}".format(merged_trans_cust.shape[0]))
print("Total features (columns) in the Merged Dataset : {}".format(merged_trans_cust.shape[1]))
Total records (rows) in the Merged Dataset : 19354 Total features (columns) in the Merged Dataset : 26
merged_trans_cust.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 19354 entries, 0 to 19353 Data columns (total 26 columns): transaction_id 19354 non-null int64 product_id 19354 non-null int64 customer_id 19354 non-null int64 transaction_date 19354 non-null object online_order 19354 non-null float64 order_status 19354 non-null object brand 19354 non-null object product_line 19354 non-null object product_class 19354 non-null object product_size 19354 non-null object list_price 19354 non-null float64 standard_cost 19354 non-null float64 product_first_sold_date 19354 non-null float64 Profit 19354 non-null float64 first_name 19354 non-null object last_name 19354 non-null object gender 19354 non-null object past_3_years_bike_related_purchases 19354 non-null int64 DOB 19354 non-null object job_title 19354 non-null object job_industry_category 19354 non-null object wealth_segment 19354 non-null object deceased_indicator 19354 non-null object owns_car 19354 non-null object tenure 19354 non-null float64 Age 19354 non-null int64 dtypes: float64(6), int64(5), object(15) memory usage: 4.0+ MB
The data-type of transaction_date column is not in date-time format. Hence the data-type of the column should be changed from object to datetime type.
merged_trans_cust['transaction_date']= pd.to_datetime(merged_trans_cust['transaction_date'])
RFM (Recency, Frequency, Monetary) analysis is a behavior-based approach grouping customers into segments. It groups the customers on the basis of their previous purchase transactions. How recently, how often, and how much did a customer buy. RFM filters customers into various groups for the purpose of better service. There is a segment of customer who is the big spender but what if they purchased only once or how recently they purchased? Do they often purchase our product? Also, It helps managers to run an effective promotional campaign for personalized service.
# Maximum Transaction Date or the latest transaction date.
max_trans_date = max(merged_trans_cust['transaction_date']).date()
max_trans_date
datetime.date(2017, 12, 30)
# Taking the last transaction date as a reference date for comparision and
# finding the number of days between a transaction date and last transaction date to compute the recency.
comparison_date = datetime.strptime(str(max_trans_date), "%Y-%m-%d")
# Creating a RFM table that will contain all the values for recency , frequency and Monetray data.
rfm_table = merged_trans_cust.groupby(['customer_id']).agg({'transaction_date': lambda date : (comparison_date - date.max()).days,
'product_id' : lambda prod_id : len(prod_id),
'Profit' : lambda p : sum(p)})
# The columns in rfm_table dataframe are not properly named. Renaming of the columns to appropiate name is needed
rfm_table.columns
Index(['transaction_date', 'product_id', 'Profit'], dtype='object')
# Renaming column names to appropiate names
rfm_table.rename(columns={'transaction_date' : 'recency',
'product_id' : 'frequency',
'Profit' : 'monetary'} , inplace=True)
# Dividing the recency, frequency and monetary into 4 quartiles (min, 25%, 50%, 75% and max).
# These values will help us to calculate RFM score for a customer and classify based on their RFM score.
rfm_table['r_quartile'] = pd.qcut(rfm_table['recency'], 4, ['4','3','2','1'])
rfm_table['f_quartile'] = pd.qcut(rfm_table['frequency'], 4, ['1','2','3','4'])
rfm_table['m_quartile'] = pd.qcut(rfm_table['monetary'], 4, ['1','2','3','4'])
# RFM_table dataset
rfm_table
recency | frequency | monetary | r_quartile | f_quartile | m_quartile | |
---|---|---|---|---|---|---|
customer_id | ||||||
1 | 7 | 11 | 3018.09 | 4 | 4 | 3 |
2 | 128 | 3 | 2226.26 | 1 | 1 | 2 |
3 | 102 | 8 | 3362.81 | 1 | 4 | 3 |
4 | 195 | 2 | 220.57 | 1 | 1 | 1 |
5 | 16 | 6 | 2394.94 | 4 | 2 | 2 |
6 | 64 | 5 | 3946.55 | 2 | 2 | 3 |
7 | 253 | 3 | 220.11 | 1 | 1 | 1 |
8 | 22 | 10 | 7066.94 | 3 | 4 | 4 |
9 | 78 | 6 | 2353.11 | 2 | 2 | 2 |
10 | 43 | 5 | 3358.28 | 3 | 2 | 3 |
11 | 46 | 6 | 3638.84 | 2 | 2 | 3 |
12 | 67 | 7 | 3540.03 | 2 | 3 | 3 |
13 | 27 | 7 | 4337.38 | 3 | 3 | 4 |
14 | 47 | 3 | 1713.90 | 2 | 1 | 1 |
15 | 35 | 6 | 1728.39 | 3 | 2 | 1 |
16 | 99 | 5 | 4521.84 | 1 | 2 | 4 |
17 | 0 | 5 | 2015.61 | 4 | 2 | 2 |
18 | 134 | 7 | 3543.38 | 1 | 3 | 3 |
19 | 102 | 3 | 2951.79 | 1 | 1 | 3 |
20 | 31 | 4 | 3608.28 | 3 | 1 | 3 |
21 | 6 | 5 | 4229.41 | 4 | 2 | 4 |
22 | 97 | 8 | 5159.84 | 1 | 4 | 4 |
23 | 10 | 6 | 4376.15 | 4 | 2 | 4 |
24 | 17 | 7 | 3689.35 | 4 | 3 | 3 |
25 | 5 | 12 | 5333.66 | 4 | 4 | 4 |
26 | 259 | 2 | 268.24 | 1 | 1 | 1 |
27 | 143 | 7 | 3274.25 | 1 | 3 | 3 |
28 | 83 | 6 | 3366.56 | 2 | 2 | 3 |
29 | 22 | 9 | 6175.30 | 3 | 4 | 4 |
30 | 11 | 3 | 2633.95 | 4 | 1 | 2 |
... | ... | ... | ... | ... | ... | ... |
3470 | 2 | 8 | 7228.80 | 4 | 4 | 4 |
3471 | 148 | 4 | 1914.10 | 1 | 1 | 2 |
3472 | 5 | 6 | 2779.58 | 4 | 2 | 2 |
3474 | 71 | 5 | 2358.99 | 2 | 2 | 2 |
3475 | 3 | 4 | 2515.14 | 4 | 1 | 2 |
3476 | 50 | 5 | 1152.36 | 2 | 2 | 1 |
3477 | 64 | 8 | 4401.92 | 2 | 4 | 4 |
3478 | 29 | 6 | 4297.85 | 3 | 2 | 4 |
3479 | 31 | 6 | 2265.96 | 3 | 2 | 2 |
3480 | 16 | 4 | 1770.26 | 4 | 1 | 1 |
3481 | 54 | 7 | 2438.45 | 2 | 3 | 2 |
3482 | 27 | 8 | 5549.59 | 3 | 4 | 4 |
3483 | 117 | 6 | 2193.04 | 1 | 2 | 2 |
3484 | 79 | 7 | 5924.55 | 2 | 3 | 4 |
3485 | 25 | 3 | 2491.47 | 3 | 1 | 2 |
3486 | 9 | 4 | 2972.34 | 4 | 1 | 3 |
3487 | 10 | 3 | 1837.75 | 4 | 1 | 2 |
3488 | 17 | 2 | 815.95 | 4 | 1 | 1 |
3489 | 108 | 6 | 2644.44 | 1 | 2 | 2 |
3490 | 166 | 5 | 2379.57 | 1 | 2 | 2 |
3491 | 189 | 4 | 1430.28 | 1 | 1 | 1 |
3492 | 80 | 3 | 2193.81 | 2 | 1 | 2 |
3493 | 93 | 6 | 3728.88 | 1 | 2 | 3 |
3494 | 4 | 4 | 2755.11 | 4 | 1 | 2 |
3495 | 13 | 7 | 3847.65 | 4 | 3 | 3 |
3496 | 256 | 4 | 2045.84 | 1 | 1 | 2 |
3497 | 52 | 3 | 1648.32 | 2 | 1 | 1 |
3498 | 127 | 6 | 3147.33 | 1 | 2 | 3 |
3499 | 51 | 7 | 4955.25 | 2 | 3 | 4 |
3500 | 144 | 6 | 1785.86 | 1 | 2 | 1 |
3416 rows × 6 columns
# Caluclation of RFM Score.
# Max weightage is given to recency then frequency and then monetary.
rfm_table['rfm_score'] = 100*rfm_table['r_quartile'].astype(int)+10*rfm_table['f_quartile'].astype(int)+rfm_table['m_quartile'].astype(int)
# Assigning a title to a cuustomer.
# Platinum corresponds to highest range of RFM score down to Bronze to lowest range of RFM score.
rfm_table['customer_title'] = pd.qcut(rfm_table['rfm_score'], 4, ['Bronze','Silver','Gold','Platinum'])
# RFM table dataset
rfm_table
recency | frequency | monetary | r_quartile | f_quartile | m_quartile | rfm_score | customer_title | |
---|---|---|---|---|---|---|---|---|
customer_id | ||||||||
1 | 7 | 11 | 3018.09 | 4 | 4 | 3 | 443 | Platinum |
2 | 128 | 3 | 2226.26 | 1 | 1 | 2 | 112 | Bronze |
3 | 102 | 8 | 3362.81 | 1 | 4 | 3 | 143 | Bronze |
4 | 195 | 2 | 220.57 | 1 | 1 | 1 | 111 | Bronze |
5 | 16 | 6 | 2394.94 | 4 | 2 | 2 | 422 | Platinum |
6 | 64 | 5 | 3946.55 | 2 | 2 | 3 | 223 | Silver |
7 | 253 | 3 | 220.11 | 1 | 1 | 1 | 111 | Bronze |
8 | 22 | 10 | 7066.94 | 3 | 4 | 4 | 344 | Gold |
9 | 78 | 6 | 2353.11 | 2 | 2 | 2 | 222 | Silver |
10 | 43 | 5 | 3358.28 | 3 | 2 | 3 | 323 | Gold |
11 | 46 | 6 | 3638.84 | 2 | 2 | 3 | 223 | Silver |
12 | 67 | 7 | 3540.03 | 2 | 3 | 3 | 233 | Silver |
13 | 27 | 7 | 4337.38 | 3 | 3 | 4 | 334 | Gold |
14 | 47 | 3 | 1713.90 | 2 | 1 | 1 | 211 | Bronze |
15 | 35 | 6 | 1728.39 | 3 | 2 | 1 | 321 | Gold |
16 | 99 | 5 | 4521.84 | 1 | 2 | 4 | 124 | Bronze |
17 | 0 | 5 | 2015.61 | 4 | 2 | 2 | 422 | Platinum |
18 | 134 | 7 | 3543.38 | 1 | 3 | 3 | 133 | Bronze |
19 | 102 | 3 | 2951.79 | 1 | 1 | 3 | 113 | Bronze |
20 | 31 | 4 | 3608.28 | 3 | 1 | 3 | 313 | Gold |
21 | 6 | 5 | 4229.41 | 4 | 2 | 4 | 424 | Platinum |
22 | 97 | 8 | 5159.84 | 1 | 4 | 4 | 144 | Bronze |
23 | 10 | 6 | 4376.15 | 4 | 2 | 4 | 424 | Platinum |
24 | 17 | 7 | 3689.35 | 4 | 3 | 3 | 433 | Platinum |
25 | 5 | 12 | 5333.66 | 4 | 4 | 4 | 444 | Platinum |
26 | 259 | 2 | 268.24 | 1 | 1 | 1 | 111 | Bronze |
27 | 143 | 7 | 3274.25 | 1 | 3 | 3 | 133 | Bronze |
28 | 83 | 6 | 3366.56 | 2 | 2 | 3 | 223 | Silver |
29 | 22 | 9 | 6175.30 | 3 | 4 | 4 | 344 | Gold |
30 | 11 | 3 | 2633.95 | 4 | 1 | 2 | 412 | Platinum |
... | ... | ... | ... | ... | ... | ... | ... | ... |
3470 | 2 | 8 | 7228.80 | 4 | 4 | 4 | 444 | Platinum |
3471 | 148 | 4 | 1914.10 | 1 | 1 | 2 | 112 | Bronze |
3472 | 5 | 6 | 2779.58 | 4 | 2 | 2 | 422 | Platinum |
3474 | 71 | 5 | 2358.99 | 2 | 2 | 2 | 222 | Silver |
3475 | 3 | 4 | 2515.14 | 4 | 1 | 2 | 412 | Platinum |
3476 | 50 | 5 | 1152.36 | 2 | 2 | 1 | 221 | Silver |
3477 | 64 | 8 | 4401.92 | 2 | 4 | 4 | 244 | Silver |
3478 | 29 | 6 | 4297.85 | 3 | 2 | 4 | 324 | Gold |
3479 | 31 | 6 | 2265.96 | 3 | 2 | 2 | 322 | Gold |
3480 | 16 | 4 | 1770.26 | 4 | 1 | 1 | 411 | Gold |
3481 | 54 | 7 | 2438.45 | 2 | 3 | 2 | 232 | Silver |
3482 | 27 | 8 | 5549.59 | 3 | 4 | 4 | 344 | Gold |
3483 | 117 | 6 | 2193.04 | 1 | 2 | 2 | 122 | Bronze |
3484 | 79 | 7 | 5924.55 | 2 | 3 | 4 | 234 | Silver |
3485 | 25 | 3 | 2491.47 | 3 | 1 | 2 | 312 | Gold |
3486 | 9 | 4 | 2972.34 | 4 | 1 | 3 | 413 | Platinum |
3487 | 10 | 3 | 1837.75 | 4 | 1 | 2 | 412 | Platinum |
3488 | 17 | 2 | 815.95 | 4 | 1 | 1 | 411 | Gold |
3489 | 108 | 6 | 2644.44 | 1 | 2 | 2 | 122 | Bronze |
3490 | 166 | 5 | 2379.57 | 1 | 2 | 2 | 122 | Bronze |
3491 | 189 | 4 | 1430.28 | 1 | 1 | 1 | 111 | Bronze |
3492 | 80 | 3 | 2193.81 | 2 | 1 | 2 | 212 | Silver |
3493 | 93 | 6 | 3728.88 | 1 | 2 | 3 | 123 | Bronze |
3494 | 4 | 4 | 2755.11 | 4 | 1 | 2 | 412 | Platinum |
3495 | 13 | 7 | 3847.65 | 4 | 3 | 3 | 433 | Platinum |
3496 | 256 | 4 | 2045.84 | 1 | 1 | 2 | 112 | Bronze |
3497 | 52 | 3 | 1648.32 | 2 | 1 | 1 | 211 | Bronze |
3498 | 127 | 6 | 3147.33 | 1 | 2 | 3 | 123 | Bronze |
3499 | 51 | 7 | 4955.25 | 2 | 3 | 4 | 234 | Silver |
3500 | 144 | 6 | 1785.86 | 1 | 2 | 1 | 121 | Bronze |
3416 rows × 8 columns
The RFM_Table dataframe is merged with the Transactions and Customer Demographics datasets, to gain depper insights of Customer Segemnts along with transactions. The dataframes are joined based on customer_ids from both the datasets
cust_trans_rfm = pd.merge(merged_trans_cust, rfm_table, left_on='customer_id', right_on='customer_id', how='inner')
cust_trans_rfm.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 19354 entries, 0 to 19353 Data columns (total 34 columns): transaction_id 19354 non-null int64 product_id 19354 non-null int64 customer_id 19354 non-null int64 transaction_date 19354 non-null datetime64[ns] online_order 19354 non-null float64 order_status 19354 non-null object brand 19354 non-null object product_line 19354 non-null object product_class 19354 non-null object product_size 19354 non-null object list_price 19354 non-null float64 standard_cost 19354 non-null float64 product_first_sold_date 19354 non-null float64 Profit 19354 non-null float64 first_name 19354 non-null object last_name 19354 non-null object gender 19354 non-null object past_3_years_bike_related_purchases 19354 non-null int64 DOB 19354 non-null object job_title 19354 non-null object job_industry_category 19354 non-null object wealth_segment 19354 non-null object deceased_indicator 19354 non-null object owns_car 19354 non-null object tenure 19354 non-null float64 Age 19354 non-null int64 recency 19354 non-null int64 frequency 19354 non-null int64 monetary 19354 non-null float64 r_quartile 19354 non-null category f_quartile 19354 non-null category m_quartile 19354 non-null category rfm_score 19354 non-null int32 customer_title 19354 non-null category dtypes: category(4), datetime64[ns](1), float64(7), int32(1), int64(7), object(14) memory usage: 4.6+ MB
The datatypes of the columns looks fine.
cust_trans_rfm['Age_Group'] = cust_trans_rfm['Age'].apply(lambda x : (math.floor(x/10)+1)*10)
An extended version of customer title is made which divides the entire number of customers into 11 groups. The groups are mainly Platinum Customers, Very Loyal, Becoming Loyal, Recent Customers, Potential Customers, Late Bloomer, Loosing Customers, High Risk Customers, Almost Lost Customers, Evasive Customers and Lost Customers.
The demarkation of customers into the above mentioned groups is based on their RFM scores.
# Function as a lookup to appropiate customer titles based on RFM score.
def cust_score_title_lkup(cols):
rfm_score = cols[0]
if rfm_score >= 444:
return 'Platinum Customer'
elif rfm_score >=433 and rfm_score < 444:
return 'Very Loyal'
elif rfm_score >=421 and rfm_score < 433:
return 'Becoming Loyal'
elif rfm_score >=344 and rfm_score < 421:
return 'Recent Customer'
elif rfm_score >=323 and rfm_score < 344:
return 'Potential Customer'
elif rfm_score >=311 and rfm_score < 323:
return 'Late Bloomer'
elif rfm_score >=224 and rfm_score < 311:
return 'Loosing Customer'
elif rfm_score >=212 and rfm_score < 224:
return 'High Risk Customer'
elif rfm_score >=124 and rfm_score < 212:
return 'Almost Lost Customer'
elif rfm_score >=112 and rfm_score < 124:
return 'Evasive Customer'
else :
return 'Lost Customer'
# Applying the above functions and creating a new feature detail_cust_title
cust_trans_rfm['detail_cust_title']=cust_trans_rfm[['rfm_score']].apply(cust_score_title_lkup, axis=1)
# Function to provide ranks to the customers based on their title.
def get_rank(cols):
title = cols[0]
if title=='Platinum Customer':
return 1
elif title=='Very Loyal':
return 2
elif title == 'Becoming Loyal':
return 3
elif title == 'Recent Customer':
return 4
elif title=='Potential Customer':
return 5
elif title == 'Late Bloomer':
return 6
elif title == 'Loosing Customer':
return 7
elif title=='High Risk Customer':
return 8
elif title == 'Almost Lost Customer':
return 9
elif title == 'Evasive Customer':
return 10
else :
return 11
# Applying the above functions and creating a new feature rank
cust_trans_rfm['rank']=cust_trans_rfm[['detail_cust_title']].apply(get_rank, axis=1)
After performing data quality assessment(DQA), data cleaning and RFM Analysis on the dataset, it's time to export the dataset to a csv file for further exploratory data analysis (EDA) and this data will drive the Sales Customer Segmenation Dashboard developed in Tableau.
cust_trans_rfm.to_csv('Customer_Trans_RFM_Analysis.csv', index=False)
print("Total records in final dataset : {}".format(cust_trans_rfm.shape[0]))
Total records in final dataset : 19354
Most New / Old Customers are aged between 40-49. The lowest age groups are under 20 and 80+ for both Old and New Customers dataset.
Among the New Customers the most populated age bracket is 20-29 and 60-69, while the maximum Old Customers are from the age bracket 50-69.
There is a steep drop in number of customers in 30-39 age groupsd among the New Customers.
# Loading the New Customers Dataset
new_cust = pd.read_csv('NewCustomerList_Cleaned.csv')
plt.figure(figsize=(10,8))
sns.distplot(new_cust['Age Group'], kde=False, bins=15)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('New Customers - Age Distribution')
plt.show()
Here 20 = under 20, 30 = 20-29 age bracket
plt.figure(figsize=(10,8))
sns.distplot(cust_trans_rfm['Age_Group'], kde=False, bins=15)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('Old Customers - Age Distribution')
plt.show()
Here 20 = under 20, 30 = 20-29 age bracket
Over the last 3 years approximately 51% of the buyers are women and 49% were male buyers.
Female purchases are approximately 10,000 more than male (numerically). Gender wise majority of the bike sales comes from female customers.
cust_bike_purchase_by_gender = cust_trans_rfm.groupby('gender').agg({'past_3_years_bike_related_purchases' : sum}
).reset_index()
total_records = cust_trans_rfm['past_3_years_bike_related_purchases'].sum()
cust_bike_purchase_by_gender['Percent_of_total'] = (cust_bike_purchase_by_gender['past_3_years_bike_related_purchases']
/total_records)*100
cust_bike_purchase_by_gender
gender | past_3_years_bike_related_purchases | Percent_of_total | |
---|---|---|---|
0 | Female | 478488 | 50.503731 |
1 | Male | 468943 | 49.496269 |
plt.figure(figsize=(8,5))
sns.barplot(x='gender',y='Percent_of_total',data=cust_bike_purchase_by_gender)
plt.xlabel('Gender')
plt.ylabel('Percent of Total Purchases')
plt.title('Female vs Male past 3 years Bike purchases')
plt.show()
Among the New Customers the highest amount of sales comes from customers having a job in Manufacturing and Financial services sector. The samllest chunk of sales comes from customers in Agriculture sector and from Telecom sector with 3% sales only. Similar trend is observed among Old Customers.
plt.figure(figsize=(15,8))
sns.countplot(x='job_industry_category',data=new_cust[~(new_cust['job_industry_category']=='Missing')])
plt.xlabel('Job Industry')
plt.ylabel('Number of Customers')
plt.title('New Customers - Job Industry Customer Distribution')
plt.show()
plt.figure(figsize=(15,8))
sns.countplot(x='job_industry_category',data=cust_trans_rfm[~(cust_trans_rfm['job_industry_category']=='Missing')])
plt.xlabel('Job Industry')
plt.ylabel('Number of Customers')
plt.title('Old Customers - Job Industry Customer Distribution')
plt.show()
Across all Age Groups the maximum number of customers are classified as 'Mass Customers'. The next being 'High Net Worth'. However among 40-49 aged customers 'Affluent Customers' outperforms the 'High Net Worth' customers.
New Customers
wealth_age_seg_new = new_cust.groupby(['wealth_segment', 'Age Group']).size().reset_index()
wealth_age_seg_new.rename(columns={0:'Number of Customers'}, inplace=True)
wealth_age_seg_new
wealth_segment | Age Group | Number of Customers | |
---|---|---|---|
0 | Affluent Customer | 20 | 3 |
1 | Affluent Customer | 30 | 49 |
2 | Affluent Customer | 40 | 15 |
3 | Affluent Customer | 50 | 58 |
4 | Affluent Customer | 60 | 40 |
5 | Affluent Customer | 70 | 41 |
6 | Affluent Customer | 80 | 20 |
7 | Affluent Customer | 90 | 9 |
8 | High Net Worth | 30 | 42 |
9 | High Net Worth | 40 | 34 |
10 | High Net Worth | 50 | 53 |
11 | High Net Worth | 60 | 36 |
12 | High Net Worth | 70 | 50 |
13 | High Net Worth | 80 | 24 |
14 | High Net Worth | 90 | 10 |
15 | Mass Customer | 20 | 5 |
16 | Mass Customer | 30 | 74 |
17 | Mass Customer | 40 | 52 |
18 | Mass Customer | 50 | 121 |
19 | Mass Customer | 60 | 93 |
20 | Mass Customer | 70 | 81 |
21 | Mass Customer | 80 | 51 |
22 | Mass Customer | 90 | 22 |
plt.figure(figsize=(15,8))
sns.barplot(x='Age Group', y='Number of Customers' , hue='wealth_segment', data=wealth_age_seg_new)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('New Customers - Wealth Segmentation by Age Group')
plt.show()
In all the age groups the most number of customers are 'Mass Customers'. The 2nd largest customer base being the 'High Net Worth' group.
In the age group 40-49 the 'Affluent Customer' group outperforms 'High Net Worth' group
Old Customers
Similar treand (like that of New Customers) is observed among Old Customers
wealth_age_seg_old = cust_trans_rfm.groupby(['wealth_segment', 'Age_Group']).size().reset_index()
wealth_age_seg_old.rename(columns={0:'Number of Customers'}, inplace=True)
plt.figure(figsize=(15,8))
sns.barplot(x='Age_Group', y='Number of Customers' , hue='wealth_segment', data=wealth_age_seg_old)
plt.xlabel('Age Group')
plt.ylabel('Number of Customers')
plt.title('Old Customers - Wealth Segmentation by Age Group')
plt.show()
The customer base of the automobile bike company lies in New South Wales, Queensland and Victoria, Australia.
In New South Wales (NSW) it seems there is a greater amount of people who donot own a car. In Victoria (VIC) the proportion is evenly split. However in Queensland (QLD) there are relatively more people who own the car.
# Loading the Customer Address Dataset.
cust_addr_info = pd.read_csv('CustomerAddress_Cleaned.csv')
# Merging the RFM data with Customer Address dataset.
cust_trans_addr = pd.merge(cust_trans_rfm , cust_addr_info, left_on = 'customer_id' ,
right_on = 'customer_id', how='inner')
print("RFM table Records count : {}\nAddress Table Records count :{}".format(cust_trans_rfm.shape[0] ,cust_addr_info.shape[0]))
RFM table Records count : 19354 Address Table Records count :3999
state_car_owners = cust_trans_addr[['state' , 'owns_car' , 'customer_id']].drop_duplicates().groupby(['state', 'owns_car']).size().reset_index()
state_car_owners.rename(columns={0:'Number of Customers'}, inplace=True)
state_car_owners
state | owns_car | Number of Customers | |
---|---|---|---|
0 | NSW | No | 889 |
1 | NSW | Yes | 935 |
2 | QLD | No | 365 |
3 | QLD | Yes | 363 |
4 | VIC | No | 435 |
5 | VIC | Yes | 425 |
plt.figure(figsize=(8,7))
sns.barplot(x='state', y='Number of Customers' , hue='owns_car', data=state_car_owners)
plt.xlabel('States')
plt.ylabel('Number of Customers')
plt.title('Number of Customers who own a car')
plt.show()
NSW has the largest number of people that donot own a car. It seems that a higher amount of data is collected from NSW compared to other states. In QLD the distribution between customers having a car or not having is even. In Victoria the number is split evenly. Both the numbers are significantly lower than that of NSW
The chart shows that customers who purchased recently generated more revenue than customers who visited long time ago. Customers from recent past (50-100) days generated a moderate revenue. Customers who visited 200 days ago generated a low revenue.
plt.figure(figsize=(8,7))
cust_trans_rfm.plot.scatter(x='recency' , y='monetary')
plt.xlabel('Recency')
plt.ylabel('Monetary ($)')
plt.title('Recency vs Monetary')
plt.show()
<Figure size 576x504 with 0 Axes>
Customers classified as "Platinum Custoers" , "Very Loyal" and "Becoming Loyal" visit frequently, which correlated with increased revenue for the business.
plt.figure(figsize=(8,7))
cust_trans_rfm.plot.scatter(x='frequency' , y='monetary')
plt.xlabel('Frequency')
plt.ylabel('Monetary ($)')
plt.title('Frequency vs Monetary')
plt.show()
<Figure size 576x504 with 0 Axes>
Finally we can plot the Number of Customers present under a Customer Segment.
# Calculating the number of unique customers under a customer title.
cust_per_title = cust_trans_rfm[['detail_cust_title', 'customer_id','rank']].drop_duplicates().groupby(
['detail_cust_title','rank']).size().reset_index().sort_values('rank')
cust_per_title.rename(columns={0:'Number of Customers'}, inplace=True)
cust_per_title
detail_cust_title | rank | Number of Customers | |
---|---|---|---|
7 | Platinum Customer | 1 | 164 |
10 | Very Loyal | 2 | 181 |
1 | Becoming Loyal | 3 | 344 |
9 | Recent Customer | 4 | 357 |
8 | Potential Customer | 5 | 340 |
4 | Late Bloomer | 6 | 332 |
5 | Loosing Customer | 7 | 333 |
3 | High Risk Customer | 8 | 371 |
0 | Almost Lost Customer | 9 | 315 |
2 | Evasive Customer | 10 | 388 |
6 | Lost Customer | 11 | 291 |
# Plotting the Number of Customers
plt.figure(figsize=(15,8))
sns.barplot(y='detail_cust_title' , x='Number of Customers', data=cust_per_title)
plt.xlabel('Number of Customers')
plt.ylabel('Customer Segment')
plt.title('Number of Customers by Customer Segment')
plt.show()