import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime, date
# Read the CustomerAddress sheet of the raw workbook into a DataFrame.
cust_address = pd.read_excel('Raw_data.xlsx', sheet_name='CustomerAddress')

# Preview the first five rows to sanity-check the load.
cust_address.head(5)
customer_id | address | postcode | state | country | property_valuation | |
---|---|---|---|---|---|---|
0 | 1 | 060 Morning Avenue | 2016 | New South Wales | Australia | 10 |
1 | 2 | 6 Meadow Vale Court | 2153 | New South Wales | Australia | 10 |
2 | 4 | 0 Holy Cross Court | 4211 | QLD | Australia | 9 |
3 | 5 | 17979 Del Mar Point | 2448 | New South Wales | Australia | 4 |
4 | 6 | 9 Oakridge Court | 3216 | VIC | Australia | 9 |
# Information of columns and data-types of Customer Address Data.
# (prints each column's dtype, non-null count and the frame's memory usage)
cust_address.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3999 entries, 0 to 3998 Data columns (total 6 columns): customer_id 3999 non-null int64 address 3999 non-null object postcode 3999 non-null int64 state 3999 non-null object country 3999 non-null object property_valuation 3999 non-null int64 dtypes: int64(3), object(3) memory usage: 187.5+ KB
The data-types of the columns look fine. Let's check the data quality and apply the data-cleaning process wherever applicable to clean our dataset before performing any analysis.
# Report the dataset's dimensions (rows x columns).
n_rows, n_cols = cust_address.shape
print("Total records (rows) in the dataset : {}".format(n_rows))
print("Total columns (features) in the dataset : {}".format(n_cols))
Total records (rows) in the dataset : 3999 Total columns (features) in the dataset : 6
# Partition the columns by dtype so numeric and categorical features
# can be inspected separately.
df_numeric = cust_address.select_dtypes(include=[np.number])
df_non_numeric = cust_address.select_dtypes(exclude=[np.number])

numeric_cols = df_numeric.columns.values
print("The numeric columns are : {}".format(numeric_cols))

non_numeric_cols = df_non_numeric.columns.values
print("The non-numeric columns are : {}".format(non_numeric_cols))
The numeric columns are : ['customer_id' 'postcode' 'property_valuation'] The non-numeric columns are : ['address' 'state' 'country']
Checking for the presence of any missing values in the dataset. If missing values are present for a particular feature then, depending upon the situation, the feature may either be dropped (when a major amount of data is missing) or an appropriate value will be imputed into the feature column with missing values.
# Count missing values per column; an all-zero result means no imputation is needed.
cust_address.isna().sum()
customer_id 0 address 0 postcode 0 state 0 country 0 property_valuation 0 dtype: int64
In the dataset there are no missing values.
We will check whether any inconsistent data or typo errors are present in the categorical columns.
The columns to be checked are 'address', 'postcode' ,'state', 'country'.
# Frequency of each state label — reveals full-name/short-name inconsistencies.
cust_address.state.value_counts()
NSW 2054 VIC 939 QLD 838 New South Wales 86 Victoria 82 Name: state, dtype: int64
There is inconsistent data in the State column. New South Wales and Victoria each appear under two values: the full name and the short name. The state names should be standardised — rows with state 'New South Wales' will be replaced by 'NSW' and rows with state 'Victoria' will be replaced by 'VIC'.
def replace_state_names(state_name):
    """Standardise an Australian state name to its short form.

    'New South Wales' -> 'NSW' and 'Victoria' -> 'VIC'; any other value
    (including the already-short codes) is returned unchanged.
    """
    short_forms = {'New South Wales': 'NSW', 'Victoria': 'VIC'}
    return short_forms.get(state_name, state_name)
# Standardise every value of the state column element-wise.
cust_address['state'] = cust_address['state'].map(replace_state_names)
After applying the above function the state name is standardised and there is no inconsistency in the state column.
# Verify only the three short-form state codes remain after standardisation.
cust_address.state.value_counts()
NSW 2140 VIC 1021 QLD 838 Name: state, dtype: int64
# Frequency of country values — expect a single value (Australia).
cust_address.country.value_counts()
Australia 3999 Name: country, dtype: int64
There is no inconsistency of data in the Country column.
The Postcode column looks perfect. There is no inconsistency / typo in the data.
# Inspect the unique address/postcode/state/country combinations.
address_cols = ['address', 'postcode', 'state', 'country']
cust_address[address_cols].drop_duplicates()
address | postcode | state | country | |
---|---|---|---|---|
0 | 060 Morning Avenue | 2016 | NSW | Australia |
1 | 6 Meadow Vale Court | 2153 | NSW | Australia |
2 | 0 Holy Cross Court | 4211 | QLD | Australia |
3 | 17979 Del Mar Point | 2448 | NSW | Australia |
4 | 9 Oakridge Court | 3216 | VIC | Australia |
5 | 4 Delaware Trail | 2210 | NSW | Australia |
6 | 49 Londonderry Lane | 2650 | NSW | Australia |
7 | 97736 7th Trail | 2023 | NSW | Australia |
8 | 93405 Ludington Park | 3044 | VIC | Australia |
9 | 44339 Golden Leaf Alley | 4557 | QLD | Australia |
10 | 2 Sutherland Street | 3799 | VIC | Australia |
11 | 9 Mcbride Trail | 2760 | NSW | Australia |
12 | 9861 New Castle Avenue | 2428 | NSW | Australia |
13 | 52 Moland Street | 3331 | VIC | Australia |
14 | 82391 Kensington Lane | 3058 | VIC | Australia |
15 | 092 2nd Alley | 2135 | NSW | Australia |
16 | 59 Spaight Circle | 2233 | NSW | Australia |
17 | 032 Bartelt Crossing | 2444 | NSW | Australia |
18 | 18 Jenna Center | 2650 | NSW | Australia |
19 | 3 Cordelia Plaza | 2153 | NSW | Australia |
20 | 28 5th Center | 4413 | QLD | Australia |
21 | 52 Carey Alley | 4740 | QLD | Australia |
22 | 96 Texas Plaza | 3218 | VIC | Australia |
23 | 48 Eagan Avenue | 4868 | QLD | Australia |
24 | 9 Buell Park | 4116 | QLD | Australia |
25 | 02663 Buell Parkway | 2519 | NSW | Australia |
26 | 2294 Pleasure Place | 2135 | NSW | Australia |
27 | 2951 Petterle Place | 2756 | NSW | Australia |
28 | 63 Lukken Drive | 2170 | NSW | Australia |
29 | 833 Luster Way | 4005 | QLD | Australia |
... | ... | ... | ... | ... |
3969 | 81609 Vernon Terrace | 3934 | VIC | Australia |
3970 | 37 Hintze Court | 2168 | NSW | Australia |
3971 | 07 Morning Court | 3805 | VIC | Australia |
3972 | 327 Loeprich Street | 3187 | VIC | Australia |
3973 | 5106 Northridge Drive | 2560 | NSW | Australia |
3974 | 9754 High Crossing Terrace | 2148 | NSW | Australia |
3975 | 7 Meadow Vale Court | 2114 | NSW | Australia |
3976 | 80260 Morning Road | 3178 | VIC | Australia |
3977 | 38017 Briar Crest Drive | 2165 | NSW | Australia |
3978 | 60 Morningstar Center | 2126 | NSW | Australia |
3979 | 218 Stuart Junction | 2223 | NSW | Australia |
3980 | 9 Butterfield Lane | 4077 | QLD | Australia |
3981 | 614 Burning Wood Way | 2148 | NSW | Australia |
3982 | 9 Grover Point | 4218 | QLD | Australia |
3983 | 565 Bunting Park | 2076 | NSW | Australia |
3984 | 9461 Saint Paul Trail | 2428 | NSW | Australia |
3985 | 5204 Delaware Pass | 2560 | NSW | Australia |
3986 | 24 Scott Pass | 4300 | QLD | Australia |
3987 | 8 Randy Parkway | 2209 | NSW | Australia |
3988 | 681 Elmside Place | 3750 | VIC | Australia |
3989 | 2918 Summer Ridge Hill | 3030 | VIC | Australia |
3990 | 613 Erie Lane | 2088 | NSW | Australia |
3991 | 0 Transport Center | 3977 | VIC | Australia |
3992 | 4 Dovetail Crossing | 2350 | NSW | Australia |
3993 | 736 Roxbury Junction | 2540 | NSW | Australia |
3994 | 1482 Hauk Trail | 3064 | VIC | Australia |
3995 | 57042 Village Green Point | 4511 | QLD | Australia |
3996 | 87 Crescent Oaks Alley | 2756 | NSW | Australia |
3997 | 8194 Lien Street | 4032 | QLD | Australia |
3998 | 320 Acker Drive | 2251 | NSW | Australia |
3999 rows × 4 columns
We need to ensure that there is no duplication of records in the dataset. This may lead to error in data analysis due to poor data quality. If there are duplicate rows of data then we need to drop such records.
To check for duplicate records we first need to remove the primary-key column of the dataset and then apply the drop_duplicates() function provided by pandas.
# Remove the primary key (customer_id) so duplicates are judged on the
# remaining attribute columns only, then drop duplicate rows.
cust_address_dedupped = cust_address.drop(columns='customer_id').drop_duplicates()

# Equal counts below imply the dataset contains no duplicate records.
print("Number of records after removing customer_id (pk), duplicates : {}".format(cust_address_dedupped.shape[0]))
print("Number of records in original dataset : {}".format(cust_address.shape[0]))
Number of records after removing customer_id (pk), duplicates : 3999 Number of records in original dataset : 3999
Since both numbers are the same, there are no duplicate records in the dataset.
Currently the Customer Address dataset is clean. Hence we can export the data to a csv to continue our data analysis of Customer Segments by joining it to other tables.
# Persist the cleaned address data (without the index) for downstream joins.
cust_address.to_csv('CustomerAddress_Cleaned.csv', index=False)
Checking with the Master Table (CustomerDemographic_Cleaned.csv) containing the entire Customer Data for the Customer IDs which are getting dropped from the Customer Address Dataset.
Basically these are the Customers who have an address but are not a part of the Demographics dataset yet.
# Load the cleaned demographics master table produced earlier in the analysis.
cust_demo_detail = pd.read_csv('CustomerDemographic_Cleaned.csv')

# Preview the first few rows.
cust_demo_detail.head()
customer_id | first_name | last_name | gender | past_3_years_bike_related_purchases | DOB | job_title | job_industry_category | wealth_segment | deceased_indicator | owns_car | tenure | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Laraine | Medendorp | Female | 93 | 1953-10-12 | Executive Secretary | Health | Mass Customer | N | Yes | 11.0 | 67 |
1 | 2 | Eli | Bockman | Male | 81 | 1980-12-16 | Administrative Officer | Financial Services | Mass Customer | N | Yes | 16.0 | 40 |
2 | 3 | Arlin | Dearle | Male | 61 | 1954-01-20 | Recruiting Manager | Property | Mass Customer | N | Yes | 15.0 | 67 |
3 | 4 | Talbot | None | Male | 33 | 1961-10-03 | Missing | IT | Mass Customer | N | No | 7.0 | 59 |
4 | 5 | Sheila-kathryn | Calton | Female | 56 | 1977-05-13 | Senior Editor | Missing | Affluent Customer | N | Yes | 8.0 | 43 |
# Compare row counts of the two tables; the difference is the number of
# customers lost to the demographic-table cleaning.
demo_count = cust_demo_detail.shape[0]
addr_count = cust_address.shape[0]
print("Total Records in Customer_Demographic_Table : {}".format(demo_count))
print("Total Records in Customer_Address_Table : {}".format(addr_count))
print('In Demographic Table {} records are getting dropped due to data cleaning process in Demographic Table'
      .format(addr_count - demo_count))
Total Records in Customer_Demographic_Table : 3912 Total Records in Customer_Address_Table : 3999 In Demographic Table 87 records are getting dropped due to data cleaning process in Demographic Table
# Outer-join addresses with demographics on the shared customer_id key;
# rows with missing demographic fields correspond to customers dropped
# during the demographic cleaning step.
cust_drop = pd.merge(cust_address, cust_demo_detail, on='customer_id', how='outer')
cust_drop.head()
customer_id | address | postcode | state | country | property_valuation | first_name | last_name | gender | past_3_years_bike_related_purchases | DOB | job_title | job_industry_category | wealth_segment | deceased_indicator | owns_car | tenure | Age | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 060 Morning Avenue | 2016.0 | NSW | Australia | 10.0 | Laraine | Medendorp | Female | 93.0 | 1953-10-12 | Executive Secretary | Health | Mass Customer | N | Yes | 11.0 | 67.0 |
1 | 2 | 6 Meadow Vale Court | 2153.0 | NSW | Australia | 10.0 | Eli | Bockman | Male | 81.0 | 1980-12-16 | Administrative Officer | Financial Services | Mass Customer | N | Yes | 16.0 | 40.0 |
2 | 4 | 0 Holy Cross Court | 4211.0 | QLD | Australia | 9.0 | Talbot | None | Male | 33.0 | 1961-10-03 | Missing | IT | Mass Customer | N | No | 7.0 | 59.0 |
3 | 5 | 17979 Del Mar Point | 2448.0 | NSW | Australia | 4.0 | Sheila-kathryn | Calton | Female | 56.0 | 1977-05-13 | Senior Editor | Missing | Affluent Customer | N | Yes | 8.0 | 43.0 |
4 | 6 | 9 Oakridge Court | 3216.0 | VIC | Australia | 9.0 | Curr | Duckhouse | Male | 35.0 | 1966-09-16 | Missing | Retail | High Net Worth | N | Yes | 13.0 | 54.0 |