#!/usr/bin/env python
# coding: utf-8
#
#
# Machine Learning Using Python (MEAFA Workshop)
# Lesson 5: Trees and Random Forests
#
#
# In this lesson we consider a case study from customer relationship management to discuss decision trees and random forests.
#
# Customer Retention Data
# Exploratory Data Analysis
# Decision Tree
# Bagging
# Random Forest
# Extremely Randomised Trees
# Model Selection
# Model Evaluation
#
# This notebook relies on the following libraries and settings.
# In[1]:
# Packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# In[2]:
# Plot settings
# Global seaborn look: 'notebook' context with the 'ticks' axes style.
sns.set_context('notebook')
sns.set_style('ticks')

# Default colour cycle, installed as the seaborn palette below.
# NOTE(review): the values track matplotlib's tab10 palette except for
# '#DB2728' (tab10 uses '#D62728') - confirm whether that is intentional.
colours = [
    '#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD',
    '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF',
]

# Alternative palette; only used where a cell opts in via sns.color_palette.
crayon = [
    '#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
    '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB',
]

sns.set_palette(colours)

# Render figures inline, then set the default figure size (width, height).
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams.update({'figure.figsize': (9, 6)})
# In[3]:
# Methods
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
# Model selection and evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, average_precision_score, log_loss
# ## Customer Retention Data
#
# This dataset is taken from [Statistical Methods in Customer Relationship Management](http://onlinelibrary.wiley.com/book/10.1002/9781118349212), authored by V. Kumar and J. Andrew Petersen.
#
# **Business objective**: to predict which customers will end the relationship with the business.
# In[4]:
# Read the spreadsheet, using its first column as the row index.
data = pd.read_excel('Datasets/CustomerChurn.xls', index_col=[0])

# Binary response: 1 where 'Censor' equals 0, and 0 otherwise.
data['Churn'] = data['Censor'].eq(0).astype(int)

# Preview the first rows (displayed when run as a notebook cell).
data.head()
# In[5]:
# Name of the response column and the list of predictor columns.
response = 'Churn'
predictors = [
    'Avg_Ret_Exp',
    'Revenue',
    'Employees',
    'Total_Crossbuy',
    'Total_Freq',
    'Industry',
]

# Keep only the response and the predictors; every other column is dropped.
data = data[[response, *predictors]]

# Split the row labels 80/20, stratified on the response so both parts keep
# the same class balance; the seed makes the partition reproducible.
index_train, index_test = train_test_split(
    np.array(data.index),
    stratify=data[response],
    train_size=0.8,
    random_state=5,
)

# Independent copies, so later edits cannot write back into `data`.
train = data.loc[index_train, :].copy()
test = data.loc[index_test, :].copy()

# Design matrices and response vectors for each part.
X_train, y_train = train[predictors], train[response]
X_test, y_test = test[predictors], test[response]
# ## Exploratory Data Analysis
# In[6]:
# Descriptive statistics of the training sample, rounded to two decimals.
train.describe().round(2)
# In[7]:
from statlearning import plot_histograms

# Histograms for every predictor except the last one, which is left out
# because it is binary.
plot_histograms(train.loc[:, predictors[:-1]])
plt.show()
# In[8]:
from statlearning import plot_logistic_regressions

# Switch to the 'crayon' palette for this figure only; the context manager
# restores the previous palette on exit. The body must be indented for the
# `with` statement to be valid Python.
with sns.color_palette(crayon):
    # Plot the response against the predictors using the project helper.
    plot_logistic_regressions(train[predictors], train[response])
    plt.show()
# In[9]:
from statlearning import plot_conditional_distributions

# Distribution of each predictor (all but the last one) split by the
# response class; the labels are given in the order class 0, class 1.
plot_conditional_distributions(
    train[predictors[:-1]],
    y_train,
    labels=['Retention', 'Churn'],
)
plt.show()
# In[10]:
# Counts of each response class (rows) within each 'Industry' value (columns).
counts = pd.crosstab(train[response], train['Industry'])

# Turn counts into within-column proportions, so each 'Industry' column sums
# to one, and round to three decimals.
table = counts.div(counts.sum(axis=0), axis='columns').round(3)
table
# In[11]:
# Bar chart of the proportions in `table`: one group of bars per 'Industry'
# value, one bar per response class.
fig, ax = plt.subplots(figsize=(6, 4))
table.T.plot.bar(alpha=0.8, ax=ax)

ax.set(xlabel='Industry', ylabel='Proportions')
ax.get_legend().set_title('Churn')

plt.tight_layout()
sns.despine()  # drop the top and right spines
plt.show()
# ## Decision Tree
# In[12]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# A deliberately small tree: entropy splits, at most two levels deep and at
# least five training observations in every leaf.
tree = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=2,
    min_samples_leaf=5,
)
tree.fit(X_train, y_train)
# In[13]:
import graphviz
from sklearn.tree import export_graphviz

# Export the fitted tree to DOT source (out_file=None returns it as a string).
# class_names must be listed in ascending order of the class values, so
# Churn == 0 is 'retention' and Churn == 1 is 'churn'. The earlier labels
# ('not acquired', 'acquired') did not describe this response.
dot_data = export_graphviz(
    tree,
    out_file=None,
    feature_names=predictors,
    impurity=False,
    class_names=['retention', 'churn'],
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph.render('tree01')  # saves tree to a file
graph
# In[14]:
# Tune the decision tree. The cell body below is executed through the
# notebook's %%time cell magic and does the following:
#   * builds an entropy-based DecisionTreeClassifier;
#   * grid-searches 'min_samples_leaf' over [1, 5, 10, 20, 30, 40, 50]
#     with 5-fold cross-validation (GridSearchCV);
#   * rebinds `tree` to the best estimator found, replacing the shallow tree
#     fitted earlier;
#   * prints the selected parameters.
get_ipython().run_cell_magic('time', '', "\nmodel = DecisionTreeClassifier(criterion='entropy')\n\ntuning_parameters = {\n 'min_samples_leaf': [1,5,10,20,30,40,50],\n}\n\ntree_search = GridSearchCV(model, tuning_parameters, cv= 5 , return_train_score=False)\ntree_search.fit(X_train, y_train)\n\ntree = tree_search.best_estimator_\n\nprint('Best parameters found by grid search:', tree_search.best_params_, '\\n')\n")
# In[15]:
# Export whichever estimator `tree` currently refers to as DOT source
# (out_file=None returns a string), labelling class 0 as 'retention' and
# class 1 as 'churn'.
dot_data = export_graphviz(
    tree,
    out_file=None,
    feature_names=predictors,
    impurity=False,
    class_names=['retention', 'churn'],
    rounded=True,
)

# Wrap the DOT source for rendering and write it to disk as 'tree02'.
graph = graphviz.Source(dot_data)
graph.render('tree02')  # saves tree to a file
graph
# ## Bagging
# In[16]:
from sklearn.ensemble import BaggingClassifier

# Bagged ensemble of 1000 entropy-based decision trees; the seed fixes the
# bootstrap resampling. The base estimator is passed positionally.
bag = BaggingClassifier(
    DecisionTreeClassifier(criterion='entropy'),
    n_estimators=1000,
    random_state=1,
)
bag.fit(X_train, y_train)
# ## Random Forest
#
# The syntax to fit a [random forest classifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) is the following.
# In[17]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 1000 entropy-based trees: two candidate predictors are
# sampled at each split and every leaf keeps at least five observations.
rf = RandomForestClassifier(
    criterion='entropy',
    max_features=2,
    min_samples_leaf=5,
    n_estimators=1000,
    random_state=1,
)
rf.fit(X_train, y_train)
# To tune the random forest, we should select a parameter that controls the size of the trees (such as the minimum number of observations in a terminal node) and the number of predictors that are sampled as candidate split variables at each node of a tree.
# In[18]:
# Tune the random forest. The cell body below is executed through the
# notebook's %%time cell magic and does the following:
#   * builds a 1000-tree, entropy-based RandomForestClassifier;
#   * defines a search space of 'min_samples_leaf' in [1, 5, 10, 20, 50] and
#     'max_features' from 1 to len(predictors);
#   * evaluates 16 randomly drawn combinations with 5-fold cross-validation
#     on 4 parallel jobs (RandomizedSearchCV);
#   * rebinds `rf` to the best estimator found and prints its parameters.
# NOTE(review): neither the model nor the search sets random_state, so the
# selected parameters may differ between runs - confirm this is acceptable.
get_ipython().run_cell_magic('time', '', "\nmodel = RandomForestClassifier(criterion = 'entropy', n_estimators=1000)\n\ntuning_parameters = {\n 'min_samples_leaf': [1, 5, 10, 20, 50],\n 'max_features': np.arange(1, len(predictors)+1),\n}\n\nrf_search = RandomizedSearchCV(model, tuning_parameters, cv = 5, n_iter= 16, return_train_score=False, n_jobs=4)\nrf_search.fit(X_train, y_train)\n\nrf = rf_search.best_estimator_\n\nprint('Best parameters found by randomised search:', rf_search.best_params_, '\\n')\n")
# After tuning the random forest, we may want to increase the number of trees to improve accuracy.
# In[19]:
# Keep the current hyper-parameters of `rf` but grow a much larger forest,
# then refit it on the full training sample.
rf.set_params(n_estimators=10000)
rf.fit(X_train, y_train)
# In[20]:
from statlearning import plot_feature_importance

# Feature-importance chart for the refitted forest, labelled with the
# predictor names.
plot_feature_importance(rf, predictors)
plt.show()
# ## Extremely Randomised Trees
# In[21]:
from sklearn.ensemble import ExtraTreesClassifier
# In[22]:
# Tune the extremely randomised trees. The cell body below is executed
# through the notebook's %%time cell magic and does the following:
#   * builds a 1000-tree, entropy-based ExtraTreesClassifier;
#   * defines a search space of 'min_samples_leaf' in [1, 5, 10, 20, 50] and
#     'max_features' from 1 to len(predictors);
#   * evaluates 16 randomly drawn combinations with 5-fold cross-validation
#     on 4 parallel jobs (RandomizedSearchCV);
#   * binds `xtrees` to the best estimator found and prints its parameters.
# NOTE(review): neither the model nor the search sets random_state, so the
# selected parameters may differ between runs - confirm this is acceptable.
get_ipython().run_cell_magic('time', '', "\nmodel = ExtraTreesClassifier(criterion = 'entropy', n_estimators=1000)\n\ntuning_parameters = {\n 'min_samples_leaf': [1, 5, 10, 20, 50],\n 'max_features': np.arange(1, len(predictors)+1),\n}\n\nxtrees_search = RandomizedSearchCV(model, tuning_parameters, cv = 5, n_iter= 16, \n return_train_score=False, n_jobs=4)\nxtrees_search.fit(X_train, y_train)\n\nxtrees = xtrees_search.best_estimator_\n\nprint('Best parameters found by randomised search:', xtrees_search.best_params_, '\\n')\n")
# In[23]:
# Keep the current hyper-parameters of `xtrees` but grow a much larger
# ensemble, then refit it on the full training sample.
xtrees.set_params(n_estimators=10000)
xtrees.fit(X_train, y_train)
# ## Model Selection
# In[24]:
# Logistic regression benchmark. In scikit-learn C is the inverse of the
# regularisation strength, so a value this large leaves the penalty weak.
logit = LogisticRegression(C=1000.0)
logit.fit(X_train, y_train)
# In[25]:
# Compare the fitted models on the training sample with 10-fold
# cross-validated predictions. `rows` and `methods` are parallel lists:
# rows[i] is the display name of methods[i].
columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Logistic', 'Decision Tree', 'Bagged trees', 'Random forest', 'Extra Trees']
results = pd.DataFrame(0.0, columns=columns, index=rows)
methods = [logit, tree, bag, rf, xtrees]

# The loop body must be indented for the `for` statement to be valid Python.
for i, method in enumerate(methods):
    # Out-of-fold class probabilities; column 1 is the probability of class 1.
    y_prob = cross_val_predict(method, X_train, y_train, cv=10, method='predict_proba')
    # Classify as churn when the predicted probability exceeds 0.5.
    y_pred = (y_prob[:, 1] > 0.5).astype(int)

    # Rows of the confusion matrix are true classes, columns are predictions.
    confusion = confusion_matrix(y_train, y_pred)

    results.iloc[i, 0] = 1 - accuracy_score(y_train, y_pred)
    # Sensitivity: share of true class-1 cases predicted as class 1.
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])
    # Specificity: share of true class-0 cases predicted as class 0.
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])
    results.iloc[i, 3] = roc_auc_score(y_train, y_prob[:, 1])
    results.iloc[i, 4] = precision_score(y_train, y_pred)

results.round(3)
# ## Model Evaluation
# In[26]:
# Evaluate the fitted models on the held-out test sample. `rows` and
# `methods` are parallel lists: rows[i] is the display name of methods[i].
columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Logistic', 'Decision Tree', 'Bagged trees', 'Random forest', 'Extra Trees']
results = pd.DataFrame(0.0, columns=columns, index=rows)
methods = [logit, tree, bag, rf, xtrees]

# One column of predicted class-1 probabilities per model, kept after the
# loop so the probabilities can be reused (e.g. for ROC curves).
y_prob = np.zeros((len(test), len(rows)))

# The loop body must be indented for the `for` statement to be valid Python.
for i, method in enumerate(methods):
    y_pred = method.predict(X_test)
    y_prob[:, i] = method.predict_proba(X_test)[:, 1]

    # Rows of the confusion matrix are true classes, columns are predictions.
    confusion = confusion_matrix(y_test, y_pred)

    results.iloc[i, 0] = 1 - accuracy_score(y_test, y_pred)
    # Sensitivity: share of true class-1 cases predicted as class 1.
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])
    # Specificity: share of true class-0 cases predicted as class 0.
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])
    results.iloc[i, 3] = roc_auc_score(y_test, y_prob[:, i])
    results.iloc[i, 4] = precision_score(y_test, y_pred)

results.round(3)
# In[27]:
from statlearning import plot_roc_curves

# Switch to the 'crayon' palette for this figure only; the context manager
# restores the previous palette on exit. The body must be indented for the
# `with` statement to be valid Python.
with sns.color_palette(crayon):
    # Plot test-set ROC curves for columns 0, 1, 3 and 4 of `y_prob`;
    # column 2 (bagged trees, per the ordering of `rows`) is left out.
    fig, ax = plot_roc_curves(y_test, y_prob[:,[0,1,3,4]], labels=pd.Series(rows).iloc[[0,1,3,4]])
    plt.show()