#!/usr/bin/env python
# coding: utf-8

# # Machine Learning Using Python (MEAFA Workshop)
# ## Lesson 5: Trees and Random Forests
# 
# In this lesson we consider a case study from customer relationship management to discuss decision trees and random forests.
# 
# - Customer Retention Data
# - Exploratory Data Analysis
# - Decision Tree
# - Bagging
# - Random Forest
# - Extremely Randomised Trees
# - Model Selection
# - Model Evaluation
# 
# This notebook relies on the following libraries and settings.

# In[1]:


# Packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


# In[2]:


# Plot settings
sns.set_context('notebook')
sns.set_style('ticks')
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B',
           '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F',
          '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']
sns.set_palette(colours)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (9, 6)


# In[3]:


# Methods
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Model selection and evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, average_precision_score, log_loss


# ## Customer Retention Data
# 
# This dataset is taken from [Statistical Methods in Customer Relationship Management](http://onlinelibrary.wiley.com/book/10.1002/9781118349212), authored by V. Kumar and J. Andrew Petersen.
# 
# **Business objective**: to predict which customers will end their relationship with the business.

# In[4]:


data = pd.read_excel('Datasets/CustomerChurn.xls', index_col=[0])
data['Churn'] = (data['Censor'] == 0).astype(int)
data.head()


# In[5]:


response = 'Churn'
predictors = ['Avg_Ret_Exp', 'Revenue', 'Employees', 'Total_Crossbuy', 'Total_Freq', 'Industry']

data = data[[response] + predictors]  # discarding variables that we will not use

index_train, index_test = train_test_split(np.array(data.index), stratify=data[response],
                                           train_size=0.8, random_state=5)

train = data.loc[index_train, :].copy()
test = data.loc[index_test, :].copy()

y_train = train[response]
y_test = test[response]

X_train = train[predictors]
X_test = test[predictors]


# ## Exploratory Data Analysis

# In[6]:


train.describe().round(2)


# In[7]:


from statlearning import plot_histograms

plot_histograms(train[predictors[:-1]])  # excludes the last variable, since it is binary
plt.show()


# In[8]:


from statlearning import plot_logistic_regressions

with sns.color_palette(crayon):
    plot_logistic_regressions(train[predictors], train[response])
    plt.show()


# In[9]:


from statlearning import plot_conditional_distributions

plot_conditional_distributions(train[predictors[:-1]], y_train, labels=['Retention', 'Churn'])
plt.show()


# In[10]:


table = pd.crosstab(train[response], train['Industry'])
table = (table / table.sum()).round(3)
table


# In[11]:


fig, ax = plt.subplots(figsize=(6, 4))
(table.T).plot(kind='bar', alpha=0.8, ax=ax)
ax.set_xlabel('Industry')
ax.set_ylabel('Proportions')
ax.legend_.set_title('Churn')
plt.tight_layout()
sns.despine()
plt.show()


# ## Decision Tree

# In[12]:


from sklearn.tree import DecisionTreeClassifier, export_graphviz

tree = DecisionTreeClassifier(criterion='entropy', max_depth=2, min_samples_leaf=5)
tree.fit(X_train, y_train)


# In[13]:


import graphviz
from sklearn.tree import export_graphviz

dot_data = export_graphviz(tree, out_file=None, feature_names=predictors, impurity=False,
                           class_names=['retention', 'churn'], rounded=True)
graph = graphviz.Source(dot_data)
graph.render('tree01')  # saves tree to a file
graph


# In[14]:


get_ipython().run_cell_magic('time', '', "\nmodel = DecisionTreeClassifier(criterion='entropy')\n\ntuning_parameters = {\n    'min_samples_leaf': [1, 5, 10, 20, 30, 40, 50],\n}\n\ntree_search = GridSearchCV(model, tuning_parameters, cv=5, return_train_score=False)\ntree_search.fit(X_train, y_train)\n\ntree = tree_search.best_estimator_\n\nprint('Best parameters found by grid search:', tree_search.best_params_, '\\n')\n")
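# As a quick diagnostic, we can look at the scores behind the grid search. The cell below is a sketch based on the `cv_results_` attribute that `GridSearchCV` exposes after fitting; it tabulates the mean and standard deviation of the cross-validated accuracy for each candidate value of `min_samples_leaf`.

# In[ ]:


# Cross-validation results for each candidate value of min_samples_leaf
cv_results = pd.DataFrame(tree_search.cv_results_)
cv_results[['param_min_samples_leaf', 'mean_test_score', 'std_test_score']].round(3)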
# In[15]:


dot_data = export_graphviz(tree, out_file=None, feature_names=predictors, impurity=False,
                           class_names=['retention', 'churn'], rounded=True)
graph = graphviz.Source(dot_data)
graph.render('tree02')  # saves tree to a file
graph


# ## Bagging

# In[16]:


from sklearn.ensemble import BaggingClassifier

bag = BaggingClassifier(DecisionTreeClassifier(criterion='entropy'), n_estimators=1000, random_state=1)
bag.fit(X_train, y_train)
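# Because each tree in a bagged ensemble is fit on a bootstrap sample, the observations left out of each sample give a built-in estimate of generalisation accuracy. The cell below is an optional sketch that refits the ensemble with the standard `oob_score=True` option of `BaggingClassifier` (`bag_oob` is just an illustrative name); the `oob_score_` attribute then reports the out-of-bag accuracy.

# In[ ]:


# Refit with out-of-bag scoring to estimate accuracy without a validation set
bag_oob = BaggingClassifier(DecisionTreeClassifier(criterion='entropy'),
                            n_estimators=1000, oob_score=True, random_state=1)
bag_oob.fit(X_train, y_train)
print('Out-of-bag accuracy: {:.3f}'.format(bag_oob.oob_score_))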
# ## Random Forest
# 
# The syntax to fit a [random forest classifier](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) is the following.

# In[17]:


from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(criterion='entropy', max_features=2, min_samples_leaf=5,
                            n_estimators=1000, random_state=1)
rf.fit(X_train, y_train)


# To tune the random forest, we should select a parameter that controls the size of the trees (such as the minimum number of observations in a terminal node) and the number of predictors that are sampled as candidate split variables at each node of a tree.

# In[18]:


get_ipython().run_cell_magic('time', '', "\nmodel = RandomForestClassifier(criterion='entropy', n_estimators=1000)\n\ntuning_parameters = {\n    'min_samples_leaf': [1, 5, 10, 20, 50],\n    'max_features': np.arange(1, len(predictors)+1),\n}\n\nrf_search = RandomizedSearchCV(model, tuning_parameters, cv=5, n_iter=16, return_train_score=False, n_jobs=4)\nrf_search.fit(X_train, y_train)\n\nrf = rf_search.best_estimator_\n\nprint('Best parameters found by randomised search:', rf_search.best_params_, '\\n')\n")


# After tuning the random forest, we may want to increase the number of trees to improve accuracy.

# In[19]:


rf.n_estimators = 10000
rf.fit(X_train, y_train)


# In[20]:


from statlearning import plot_feature_importance

plot_feature_importance(rf, predictors)
plt.show()


# ## Extremely Randomised Trees

# In[21]:


from sklearn.ensemble import ExtraTreesClassifier


# In[22]:


get_ipython().run_cell_magic('time', '', "\nmodel = ExtraTreesClassifier(criterion='entropy', n_estimators=1000)\n\ntuning_parameters = {\n    'min_samples_leaf': [1, 5, 10, 20, 50],\n    'max_features': np.arange(1, len(predictors)+1),\n}\n\nxtrees_search = RandomizedSearchCV(model, tuning_parameters, cv=5, n_iter=16, return_train_score=False, n_jobs=4)\nxtrees_search.fit(X_train, y_train)\n\nxtrees = xtrees_search.best_estimator_\n\nprint('Best parameters found by randomised search:', xtrees_search.best_params_, '\\n')\n")


# In[23]:


xtrees.n_estimators = 10000
xtrees.fit(X_train, y_train)


# ## Model Selection

# In[24]:


logit = LogisticRegression(C=1e3)
logit.fit(X_train, y_train)


# In[25]:


columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Logistic', 'Decision Tree', 'Bagged trees', 'Random forest', 'Extra Trees']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [logit, tree, bag, rf, xtrees]

for i, method in enumerate(methods):
    y_prob = cross_val_predict(method, X_train, y_train, cv=10, method='predict_proba')
    y_pred = (y_prob[:, 1] > 0.5).astype(int)

    confusion = confusion_matrix(y_train, y_pred)

    results.iloc[i, 0] = 1 - accuracy_score(y_train, y_pred)
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])
    results.iloc[i, 3] = roc_auc_score(y_train, y_prob[:, 1])
    results.iloc[i, 4] = precision_score(y_train, y_pred)

results.round(3)


# ## Model Evaluation

# In[26]:


columns = ['Error rate', 'Sensitivity', 'Specificity', 'AUC', 'Precision']
rows = ['Logistic', 'Decision Tree', 'Bagged trees', 'Random forest', 'Extra Trees']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [logit, tree, bag, rf, xtrees]

y_prob = np.zeros((len(test), len(rows)))

for i, method in enumerate(methods):
    y_pred = method.predict(X_test)
    y_prob[:, i] = method.predict_proba(X_test)[:, 1]

    confusion = confusion_matrix(y_test, y_pred)

    results.iloc[i, 0] = 1 - accuracy_score(y_test, y_pred)
    results.iloc[i, 1] = confusion[1, 1] / np.sum(confusion[1, :])
    results.iloc[i, 2] = confusion[0, 0] / np.sum(confusion[0, :])
    results.iloc[i, 3] = roc_auc_score(y_test, y_prob[:, i])
    results.iloc[i, 4] = precision_score(y_test, y_pred)

results.round(3)


# In[27]:


from statlearning import plot_roc_curves

with sns.color_palette(crayon):
    fig, ax = plot_roc_curves(y_test, y_prob[:, [0, 1, 3, 4]], labels=pd.Series(rows).iloc[[0, 1, 3, 4]])
    plt.show()
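# Two of the metrics imported at the start of the notebook, `average_precision_score` and `log_loss`, were not used above. As a final sketch (`extra_metrics` is just an illustrative name), we can compare the methods on these criteria using the test-set probabilities computed in the model evaluation loop.

# In[ ]:


# Additional test-set metrics based on the predicted churn probabilities
extra_metrics = pd.DataFrame(0.0, columns=['Average precision', 'Log loss'], index=rows)

for i in range(len(rows)):
    extra_metrics.iloc[i, 0] = average_precision_score(y_test, y_prob[:, i])
    extra_metrics.iloc[i, 1] = log_loss(y_test, y_prob[:, i])

extra_metrics.round(3)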