import numpy as np
import pandas as pd
import patsy as pt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from IPython.display import HTML
from ipywidgets import interact
import ipywidgets as widgets
import copy
import warnings
warnings.filterwarnings('ignore')
from sklearn import tree
import graphviz
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import Lasso
import statsmodels.api as sm
from sklearn.neighbors import KNeighborsClassifier
boston_df = datasets.load_boston()
boston_df = pd.DataFrame(data=np.c_[boston_df['data'], boston_df['target']], columns= [c for c in boston_df['feature_names']] + ['Price'])
np.random.seed(1)
train = np.random.rand(len(boston_df)) < 0.5
display(boston_df.head())
# Create design and response matrix
f = 'Price ~ ' + ' + '.join(boston_df.columns.drop(['Price']))
y, X = pt.dmatrices(f, boston_df)
CRIM | ZN | INDUS | CHAS | NOX | RM | AGE | DIS | RAD | TAX | PTRATIO | B | LSTAT | Price | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00632 | 18.0 | 2.31 | 0.0 | 0.538 | 6.575 | 65.2 | 4.0900 | 1.0 | 296.0 | 15.3 | 396.90 | 4.98 | 24.0 |
1 | 0.02731 | 0.0 | 7.07 | 0.0 | 0.469 | 6.421 | 78.9 | 4.9671 | 2.0 | 242.0 | 17.8 | 396.90 | 9.14 | 21.6 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.469 | 7.185 | 61.1 | 4.9671 | 2.0 | 242.0 | 17.8 | 392.83 | 4.03 | 34.7 |
3 | 0.03237 | 0.0 | 2.18 | 0.0 | 0.458 | 6.998 | 45.8 | 6.0622 | 3.0 | 222.0 | 18.7 | 394.63 | 2.94 | 33.4 |
4 | 0.06905 | 0.0 | 2.18 | 0.0 | 0.458 | 7.147 | 54.2 | 6.0622 | 3.0 | 222.0 | 18.7 | 396.90 | 5.33 | 36.2 |
# Compare test RMSE of Random Forest for various numbers of features considered at each split (mtry)
# and increasing number of trees (ntree)
max_features = {'p': X.shape[1],
'p/2': int(np.around(X.shape[1]/2)),
'$\sqrt{p}$': int(np.around(np.sqrt(X.shape[1]))),
'1': 1}
results = []
for mtry in max_features:
for tree_count in np.arange(1, 100):
regr = RandomForestRegressor(max_features=max_features[mtry], random_state=0, n_estimators=tree_count)
regr.fit(X[train], y[train])
y_hat = regr.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
rmse = np.sqrt(mse)
results+= [[tree_count, mtry, rmse]]
plt.figure(figsize=(10,10))
sns.lineplot(x='Number of Trees', y='RMSE', hue='mtry',
data=pd.DataFrame(results, columns=['Number of Trees', 'mtry', 'RMSE']));
The above shows the test RMSE for a 50% holdout set where training set contains 240 observations each with 14 predictors.
We find that the test RMSE decreases with increasing number of trees for all values of mtry (number of random features considered at each split)
The optimal value of m is less than p and greater than 1. In this case $m=\sqrt{p}$ yields the best results for $ntree > ?$ although $p/2$ performs better for some lower tree counts.
Incredibly the model does not seem prone to overfitting with test RMSE continuing to decrees up to ntree = 100.
carseats_df = pd.read_csv('./data/Carseats.csv')
# Check for missing values
assert carseats_df.isnull().sum().sum() == 0
# Drop unused index
carseats_df = carseats_df.drop('Unnamed: 0', axis=1)
# Create index for training set
np.random.seed(1)
train = np.random.random(len(carseats_df)) > 0.5
display(carseats_df.head())
Sales | CompPrice | Income | Advertising | Population | Price | ShelveLoc | Age | Education | Urban | US | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 9.50 | 138 | 73 | 11 | 276 | 120 | Bad | 42 | 17 | Yes | Yes |
1 | 11.22 | 111 | 48 | 16 | 260 | 83 | Good | 65 | 10 | Yes | Yes |
2 | 10.06 | 113 | 35 | 10 | 269 | 80 | Medium | 59 | 12 | Yes | Yes |
3 | 7.40 | 117 | 100 | 4 | 466 | 97 | Medium | 55 | 14 | Yes | Yes |
4 | 4.15 | 141 | 64 | 3 | 340 | 128 | Bad | 38 | 13 | Yes | No |
# Use all features excpet response features
# No intercept
preds = carseats_df.columns.drop(['Sales'])
#preds_scaled = ['standardize({})'.format(p) for p in preds]
f = 'Sales ~ 0 +' + ' + '.join(preds)
y, X = pt.dmatrices(f, carseats_df)
y = y.flatten()
# Fit Sklearn's tree regressor
clf = tree.DecisionTreeRegressor(max_depth=5).fit(X[train], y[train])
# Measure test set MSE
y_hat = clf.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
# Get proportion of correct classifications on test set
print('Test MSE: {}'.format(np.around(mse, 3)))
print('Test RMSE: {}'.format(np.around(np.sqrt(mse), 3)))
Test MSE: 5.09 Test RMSE: 2.256
# How about using CV to compare trees with different number of leaf nodes
# as defined by the max_leaf_nodes parameter?
cv_folds = 10
tuning_param = 'max_leaf_nodes'
columns=[tuning_param, 'RMSE', 'upper', 'lower']
results = []
for m in np.arange(2, 100):
regr = tree.DecisionTreeRegressor(max_leaf_nodes=m)
scores = cross_val_score(regr, X[train], y[train], cv=cv_folds, scoring='neg_mean_squared_error')
rmses = np.sqrt(np.absolute(scores))
rmse = np.mean(rmses)
conf_int = np.std(rmses) *2
results += [[m, rmse, rmse+conf_int, rmse-conf_int]]
# Plot classification accuracy for each max_depth cv result
plot_df = pd.DataFrame(np.asarray(results), columns=columns).set_index(tuning_param)
plt.figure(figsize=(10,10))
sns.lineplot(data=plot_df)
plt.ylabel('RMSE')
plt.show();
# Show chosen model
chosen = plot_df[plot_df['RMSE'] == plot_df['RMSE'].min()]
display(chosen)
# Use chosen model for test prediction
regr = tree.DecisionTreeRegressor(max_leaf_nodes=int(chosen.index[0])).fit(X[train], y[train])
y_hat = regr.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
# Get proportion of correct classifications on test set
print('Test MSE : {}'.format(np.around(mse, 3)))
print('Test RMSE: {}'.format(np.around(np.sqrt(mse), 3)))
RMSE | upper | lower | |
---|---|---|---|
max_leaf_nodes | |||
31.0 | 2.188901 | 2.4201 | 1.957702 |
Test MSE : 4.661 Test RMSE: 2.159
10-fold cross validation selects a pruned tree model that achieves test MSE of 4.524, an improvement on the unpruned model (5.066). Interestingly 100-fold, 5-fold and 2-fold CV were all unable to select an improvement on the unpruned model.
# Bagging with 100 trees
# although I'm using RandomForestRegressor algo here this is Bagging because max_features = n_predictors
max_features = X.shape[1]
tree_count = 100
regr = RandomForestRegressor(max_features=max_features, random_state=0, n_estimators=tree_count)
regr.fit(X[train], y[train])
y_hat = regr.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
rmse = np.sqrt(mse)
print('Test MSE : {}'.format(np.around(mse, 3)))
print('Test RMSE: {}'.format(np.around(rmse, 3)))
Test MSE : 2.615 Test RMSE: 1.617
# Plot feature by importance in this model
plot_df = pd.DataFrame({'feature': X.design_info.column_names, 'importance': regr.feature_importances_})
plt.figure(figsize=(10,10))
sns.barplot(x='importance', y='feature', data=plot_df.sort_values('importance', ascending=False),
color='b')
plt.xticks(rotation=90);
Bagging yields a significantly improved test MSE of 2.615 compared with 4.524 for the optimal pruned tree.
The bagging model indicates that instore Shelve Location (Good) and Price of the carseat are the most significant features affecting Sales revenue. This aligns with our observation when performing classification in hte lab.
# Random Forest with 100 trees and 4 features considered at each split
max_features = 7
tree_count = 100
regr = RandomForestRegressor(max_features=max_features, random_state=0, n_estimators=tree_count)
regr.fit(X[train], y[train])
y_hat = regr.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
rmse = np.sqrt(mse)
print('MSE test: {}'.format(np.around(mse, 3)))
print('RMSE test: {}'.format(np.around(rmse, 3)))
MSE test: 2.544 RMSE test: 1.595
# Plot feature by importance in this model
plot_df = pd.DataFrame({'feature': X.design_info.column_names, 'importance': regr.feature_importances_})
plt.figure(figsize=(10,10))
sns.barplot(x='importance', y='feature', data=plot_df.sort_values('importance', ascending=False),
color='b')
plt.xticks(rotation=90);
Random forest with 7 predictors at each split yields a test MSE 2.544 similar to bagging (2.615). A similar feature importance is ascribed.
# Describe the effect of m, the number of variables considered at each split, on the error rate obtained.
results = []
for max_features in np.arange(1, X.shape[1]):
tree_count = 100
regr = RandomForestRegressor(max_features=max_features, random_state=0, n_estimators=tree_count)
regr.fit(X[train], y[train])
y_hat = regr.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
rmse = np.sqrt(mse)
results += [[max_features, mse]]
plt.figure(figsize=(10,5))
sns.lineplot(x='Split Variables', y='MSE', data=pd.DataFrame(results, columns=['Split Variables', 'MSE']));
oj_df = pd.read_csv('./data/oj.csv')
oj_df = oj_df.drop(oj_df.columns[0], axis=1)
display(oj_df.head())
# Index for Training set of 800
np.random.seed(1)
train_sample = np.random.choice(np.arange(len(oj_df)), size=800, replace=False)
train = np.asarray([(i in train_sample) for i in oj_df.index])
#oj_df.Purchase = oj_df.Purchase.map({'CH' : 1, 'MM': 0})
#oj_df.Store7 = oj_df.Store7.map({'Yes' : 1, 'No': 0})
#oj_df.head()
Purchase | WeekofPurchase | StoreID | PriceCH | PriceMM | DiscCH | DiscMM | SpecialCH | SpecialMM | LoyalCH | SalePriceMM | SalePriceCH | PriceDiff | Store7 | PctDiscMM | PctDiscCH | ListPriceDiff | STORE | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | CH | 237 | 1 | 1.75 | 1.99 | 0.00 | 0.0 | 0 | 0 | 0.500000 | 1.99 | 1.75 | 0.24 | No | 0.000000 | 0.000000 | 0.24 | 1 |
1 | CH | 239 | 1 | 1.75 | 1.99 | 0.00 | 0.3 | 0 | 1 | 0.600000 | 1.69 | 1.75 | -0.06 | No | 0.150754 | 0.000000 | 0.24 | 1 |
2 | CH | 245 | 1 | 1.86 | 2.09 | 0.17 | 0.0 | 0 | 0 | 0.680000 | 2.09 | 1.69 | 0.40 | No | 0.000000 | 0.091398 | 0.23 | 1 |
3 | MM | 227 | 1 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.400000 | 1.69 | 1.69 | 0.00 | No | 0.000000 | 0.000000 | 0.00 | 1 |
4 | CH | 228 | 7 | 1.69 | 1.69 | 0.00 | 0.0 | 0 | 0 | 0.956535 | 1.69 | 1.69 | 0.00 | Yes | 0.000000 | 0.000000 | 0.00 | 0 |
f = 'C(Purchase) ~ ' + ' + '.join(oj_df.columns.drop(['Purchase']))
y, X = pt.dmatrices(f, oj_df)
y = y[:, 0]
# Fit Sklearns tree classifier
clf = tree.DecisionTreeClassifier().fit(X[train], y[train])
print('training accuracy: {}'.format(np.around(clf.score(X[train], y[train]), 3)))
print('leaf nodes: 6')
training accuracy: 0.99 leaf nodes: 6
# Visualise the tree with GraphViz
dot_data = tree.export_graphviz(clf, out_file=None,
feature_names=X.design_info.column_names,
filled=True, rounded=True)
graph = graphviz.Source(dot_data)
display(HTML(graph._repr_svg_()))
# Here's the confusion matrix
print(confusion_matrix(y[~train], clf.predict(X[~train])))
test_err = 1 - clf.score(X[~train], y[~train])
print('\ntest error rate: {}'.format(np.around(test_err, 3)))
[[ 74 24] [ 40 132]] test error rate: 0.237
# Estimate optimal tree with cross validation on training set
cv_folds = 10
results = []
for mln in np.arange(2, 50):
clf = tree.DecisionTreeClassifier(max_leaf_nodes=mln)
score = cross_val_score(clf, X[train], y[train], cv=cv_folds)
results += [[mln, np.mean(score)]]
plt.figure(figsize=(10,5))
plot_df = pd.DataFrame(np.asarray(results), columns=['Leaves', 'Test']).set_index('Leaves')
sns.lineplot(data=plot_df);
plt.ylabel('accuracy')
plt.show();
display(HTML('<h4>Optimal tree size:</h4>'))
display(plot_df[plot_df['Test'] == plot_df['Test'].max()])
Test | |
---|---|
Leaves | |
8.0 | 0.811304 |
# Determine actual optimal tree using test set.
results = []
for mln in np.arange(2, 50):
clf = tree.DecisionTreeClassifier(max_leaf_nodes=mln).fit(X[train], y[train])
accuracy_train = clf.score(X[train], y[train])
accuracy_test = clf.score(X[~train], y[~train])
results += [[mln, accuracy_train, accuracy_test]]
plt.figure(figsize=(10,5))
plot_df = pd.DataFrame(np.asarray(results), columns=['Leaves', 'Train', 'Test']).set_index('Leaves')
sns.lineplot(data=plot_df);
plt.ylabel('accuracy')
plt.show();
display(HTML('<h4>Optimal tree size:</h4>'))
display(plot_df[plot_df['Test'] == plot_df['Test'].max()])
Train | Test | |
---|---|---|
Leaves | ||
2.0 | 0.8125 | 0.8 |
3.0 | 0.8125 | 0.8 |
clf_unpruned = tree.DecisionTreeClassifier().fit(X[train], y[train])
clf_pruned = tree.DecisionTreeClassifier(max_leaf_nodes=8).fit(X[train], y[train])
scores = [['unpruned_train', 1 - clf_unpruned.score(X[train], y[train])],
['pruned_train', 1 - clf_pruned.score(X[train], y[train])],
['unpruned_test', 1 - clf_unpruned.score(X[~train], y[~train])],
['pruned_test', 1 - clf_pruned.score(X[~train], y[~train])]]
plot_df = pd.DataFrame(scores, columns=['test', 'error rate'])
plt.figure(figsize=(10, 6))
sns.barplot(x='test', y='error rate', data=plot_df)
plt.show();
display(plot_df)
test | error rate | |
---|---|---|
0 | unpruned_train | 0.010000000000000009 |
1 | pruned_train | 0.15000000000000002 |
2 | unpruned_test | 0.24444444444444446 |
3 | pruned_test | 0.21851851851851856 |
The unpruned tree performs best in the training setting where as the pruned tree performs best in the test setting. This suggests that the unpruned tree is overfitting the training data leading to poor test score.
hitters_df = pd.read_csv('./data/Hitters.csv')
# Drop null observations
hitters_df = hitters_df.dropna()
assert hitters_df.isnull().sum().sum() == 0
f = 'np.log(Salary) ~ ' + ' + '.join(hitters_df.columns.drop(['Salary']))
y, X = pt.dmatrices(f, hitters_df)
pd.DataFrame(X, columns=X.design_info.column_names).head()
Intercept | League[T.N] | Division[T.W] | NewLeague[T.N] | AtBat | Hits | HmRun | Runs | RBI | Walks | Years | CAtBat | CHits | CHmRun | CRuns | CRBI | CWalks | PutOuts | Assists | Errors | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | 1.0 | 315.0 | 81.0 | 7.0 | 24.0 | 38.0 | 39.0 | 14.0 | 3449.0 | 835.0 | 69.0 | 321.0 | 414.0 | 375.0 | 632.0 | 43.0 | 10.0 |
1 | 1.0 | 0.0 | 1.0 | 0.0 | 479.0 | 130.0 | 18.0 | 66.0 | 72.0 | 76.0 | 3.0 | 1624.0 | 457.0 | 63.0 | 224.0 | 266.0 | 263.0 | 880.0 | 82.0 | 14.0 |
2 | 1.0 | 1.0 | 0.0 | 1.0 | 496.0 | 141.0 | 20.0 | 65.0 | 78.0 | 37.0 | 11.0 | 5628.0 | 1575.0 | 225.0 | 828.0 | 838.0 | 354.0 | 200.0 | 11.0 | 3.0 |
3 | 1.0 | 1.0 | 0.0 | 1.0 | 321.0 | 87.0 | 10.0 | 39.0 | 42.0 | 30.0 | 2.0 | 396.0 | 101.0 | 12.0 | 48.0 | 46.0 | 33.0 | 805.0 | 40.0 | 4.0 |
4 | 1.0 | 0.0 | 1.0 | 0.0 | 594.0 | 169.0 | 4.0 | 74.0 | 51.0 | 35.0 | 11.0 | 4408.0 | 1133.0 | 19.0 | 501.0 | 336.0 | 194.0 | 282.0 | 421.0 | 25.0 |
# Index for Training set of 200
np.random.seed(1)
train_sample = np.random.choice(np.arange(len(hitters_df)), size=200, replace=False)
train = np.asarray([(i in train_sample) for i in hitters_df.index])
# Gradient boosting
max_features = 'auto'
tree_count = 1000
# np.arange(0.0001, 1.2, 0.01)
results = []
for learning_rate in np.logspace(-10, np.log(1.3), 100):
regr = GradientBoostingRegressor(max_features=max_features,
random_state=1,
n_estimators=tree_count,
learning_rate=learning_rate)
regr = regr.fit(X[train], y[train])
y_hat_train = regr.predict(X[train])
y_hat_test = regr.predict(X[~train])
mse_train = metrics.mean_squared_error(y[train], y_hat_train)
mse_test = metrics.mean_squared_error(y[~train], y_hat_test)
results += [[learning_rate, mse_train, mse_test]]
# Plot
df = pd.DataFrame(np.asarray(results),
columns=['learning_rate', 'mse_train', 'mse_test']).set_index('learning_rate')
plt.figure(figsize=(10,10))
ax = sns.lineplot(data=df)
ax.set_xscale('log')
plt.ylabel('MSE')
plt.show();
# Show best learning rate
display(df[df['mse_test'] == df['mse_test'].min()])
mse_train | mse_test | |
---|---|---|
learning_rate | ||
0.00229 | 0.071057 | 0.206531 |
# Naive ols
model = sm.OLS(y[train], X[train]).fit()
y_hat = model.predict(X[~train])
mse_test = metrics.mean_squared_error(y[~train], y_hat)
print('MSE test: {}'.format(np.around(mse_test, 3)))
MSE test: 0.455
def lasso_cv(X, y, λ, k):
"""Perform the lasso with
k-fold cross validation to return mean MSE scores for each fold"""
# Split dataset into k-folds
# Note: np.array_split doesn't raise excpetion is folds are unequal in size
X_folds = np.array_split(X, k)
y_folds = np.array_split(y, k)
MSEs = []
for f in np.arange(len(X_folds)):
# Create training and test sets
X_test = X_folds[f]
y_test = y_folds[f]
X_train = X.drop(X_folds[f].index)
y_train = y.drop(y_folds[f].index)
# Fit model
model = Lasso(alpha=λ, copy_X=True, fit_intercept=True, max_iter=10000,
normalize=False, positive=False, precompute=False, random_state=0,
selection='cyclic', tol=0.0001, warm_start=False).fit(X_train, y_train)
# Measure MSE
y_hat = model.predict(X_test)
#print(y_test)
MSEs += [metrics.mean_squared_error(y_test, y_hat)]
return MSEs
X_train = pd.DataFrame(X[train], columns=X.design_info.column_names)
y_train = pd.DataFrame(y[train], columns=['Price'])
#lambdas = np.arange(.000001, 0.01, .0001)
lambdas = np.arange(0.2, 20, .1)
MSEs = []
for l in lambdas:
MSEs += [np.mean(lasso_cv(X_train, y_train, λ=l, k=10))]
sns.scatterplot(x='λ', y='MSE', data=pd.DataFrame({'λ': lambdas, 'MSE': MSEs}))
plt.show();
# Choose model
lamb = min(zip(MSEs, lambdas))
print('RMSE Train CV: {}\n@Lambda: {}'.format(np.sqrt(lamb[0]), lamb[1]))
# Use chosen model on test set prediction
model = Lasso(alpha=lamb[1], copy_X=True, fit_intercept=True, max_iter=10000,
normalize=False, positive=False, precompute=False, random_state=0,
selection='cyclic', tol=0.0001, warm_start=False).fit(X[train], y[train])
y_hat = model.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
rmse = np.sqrt(mse)
print('MSE test: {}'.format(np.around(mse, 3)))
print('RMSE test: {}'.format(np.around(rmse, 3)))
RMSE Train CV: 0.6482094156231718 @Lambda: 9.300000000000002 MSE test: 0.469 RMSE test: 0.685
Comment:
Boosting yields a test MSE of 0.207, which is significantly better than naive OLS (0.455) and the Lasso (0.469).
It is interesting that naive OLS with all features outperforms the lasso. This suggests that the lasso is unable to identify redundant features in the model.
max_features = 'auto'
tree_count = 1000
learning_rate = 0.00229
regr = GradientBoostingRegressor(max_features=max_features,
random_state=1,
n_estimators=tree_count,
learning_rate=learning_rate)
regr = regr.fit(X[train], y[train])
y_hat_test = regr.predict(X[~train])
mse_test = metrics.mean_squared_error(y[~train], y_hat_test)
print(mse_test)
# Plot feature by importance in this model
plot_df = pd.DataFrame({'feature': X.design_info.column_names, 'importance': regr.feature_importances_})
plt.figure(figsize=(10,10))
sns.barplot(x='importance', y='feature', data=plot_df.sort_values('importance', ascending=False),
color='b')
plt.xticks(rotation=90);
0.20666656200483657
The boosting model suggests that CAtBat – Number of times at bat during his career – is by far the most important predictor of Salary.
Number of walks during his career (CWalks) is also predictive of salary, but it seems likely that this feature would be covariant with CAtBat.
# Bagging with 100 trees
# although I'm using RandomForestRegressor algo here this is Bagging because max_features = n_predictors
max_features = X.shape[1]
tree_count = 1000
regr = RandomForestRegressor(max_features=max_features, random_state=0, n_estimators=tree_count)
regr.fit(X[train], y[train])
y_hat = regr.predict(X[~train])
mse = metrics.mean_squared_error(y[~train], y_hat)
rmse = np.sqrt(mse)
print('MSE test: {}'.format(np.around(mse, 3)))
print('RMSE test: {}'.format(np.around(rmse, 3)))
MSE test: 0.208 RMSE test: 0.456
Bagging achieves a test MSE of 0.208 which is equivalent to that achieved by boosting. Bagging has the advantage here that the result was achived without need to tune hyper-parameters.
caravan_df = pd.read_csv('./data/Caravan.csv').drop('Unnamed: 0', axis=1)
# Patsy feature processing
f = 'C(Purchase) ~ ' + ' + '.join(caravan_df.columns.drop(['Purchase']))
y, X = pt.dmatrices(f, caravan_df)
y = y[:, 1]
# Display processed features
display(pd.DataFrame(X, columns=X.design_info.column_names).head())
# Index for Training set of 1000
np.random.seed(1)
train_sample = np.random.choice(np.arange(len(caravan_df)), size=1000, replace=False)
train = np.asarray([(i in train_sample) for i in caravan_df.index])
Intercept | MOSTYPE | MAANTHUI | MGEMOMV | MGEMLEEF | MOSHOOFD | MGODRK | MGODPR | MGODOV | MGODGE | ... | ALEVEN | APERSONG | AGEZONG | AWAOREG | ABRAND | AZEILPL | APLEZIER | AFIETS | AINBOED | ABYSTAND | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 33.0 | 1.0 | 3.0 | 2.0 | 8.0 | 0.0 | 5.0 | 1.0 | 3.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 1.0 | 37.0 | 1.0 | 2.0 | 2.0 | 8.0 | 1.0 | 4.0 | 1.0 | 4.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 1.0 | 37.0 | 1.0 | 2.0 | 2.0 | 8.0 | 0.0 | 4.0 | 2.0 | 4.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 1.0 | 9.0 | 1.0 | 3.0 | 3.0 | 3.0 | 2.0 | 3.0 | 2.0 | 4.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 1.0 | 40.0 | 1.0 | 4.0 | 2.0 | 10.0 | 1.0 | 4.0 | 1.0 | 4.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 86 columns
max_features = 'auto'
tree_count = 1000
learning_rate = 0.01
model = GradientBoostingClassifier(max_features=max_features,
random_state=1,
n_estimators=tree_count,
learning_rate=learning_rate)
model = model.fit(X[train], y[train])
#y_hat_test = regr.predict(X[~train])
accuracy = model.score(X[~train], y[~train])
print('accuracy: {}%'.format(np.around(accuracy*100, 2)))
# Plot feature by importance in this model
plot_df = pd.DataFrame({'feature': X.design_info.column_names, 'importance': model.feature_importances_})
plt.figure(figsize=(10,20))
sns.barplot(x='importance', y='feature', data=plot_df.sort_values('importance', ascending=False),
color='b')
plt.xticks(rotation=90);
accuracy: 93.26%
Here we use boosting to predict if someone purchases caravan insurance, a classification problem. The boosting model yields a test set prediction accuracy of 93.2%.
The model suggests that the ten most important predictors are:
Broadly these predictors indicate whether customer has other insurance policies, and their level of family commitment e.g. married, with children.
We can't tell the direction in which these predictors are related with reposnse Revision note: is this possible in the boosting setting?
From tnis might hypothesise that customers who have already purchased insurance of other kinds are more inclined to purchase caravan insurance. Also, those with more family commitment and responsibility might also be more inclined to purchase insurance. It seems plausible that these two groups are more risk averse.
max_features = 'auto'
tree_count = 1000
learning_rate = 0.01
model = GradientBoostingClassifier(max_features=max_features,
random_state=1,
n_estimators=tree_count,
learning_rate=learning_rate)
model = model.fit(X[train], y[train])
#y_hat_test = regr.predict(X[~train])
# Boosting stats
threshold = 0.2
y_hat_proba = model.predict_proba(X[~train])
y_hat = (y_hat_proba[:, 1] > threshold).astype(np.float64)
confusion_mat = confusion_matrix(y[~train], y_hat)
# What fraction of the people predicted to make a purchase do in fact make one?
pos_pred_val = np.around(confusion_mat[:, 1][1] / np.sum(confusion_mat[:, 1]), 5)
display(HTML('<h4>BOOSTING: Confusion matrix</h4>'))
print(confusion_mat)
print('\nPositive Predictive Value: {}'.format(pos_pred_val))
[[4405 120] [ 266 31]] Positive Predictive Value: 0.2053
# KNN
# PREDICT
for K in range(1, 10):
# model
model = KNeighborsClassifier(n_neighbors=K).fit(X[train], y[train])
# Predict
y_pred = model.predict(X[~train])
# Confusion table
display(HTML('<h3>K={}</h3>'.format(K)))
confusion_mtx = confusion_matrix(y[~train], y_pred)
print(confusion_mtx)
## Classifier stats
pos_pred_val = np.around(confusion_mtx[:, 1][1] / np.sum(confusion_mtx[:, 1]), 5)
print('\nPositive Predictive Value: {}'.format(pos_pred_val))
[[4280 245] [ 274 23]] Positive Predictive Value: 0.08582
[[4507 18] [ 293 4]] Positive Predictive Value: 0.18182
[[4476 49] [ 288 9]] Positive Predictive Value: 0.15517
[[4524 1] [ 297 0]] Positive Predictive Value: 0.0
[[4522 3] [ 297 0]] Positive Predictive Value: 0.0
[[4525 0] [ 297 0]] Positive Predictive Value: nan
[[4525 0] [ 297 0]] Positive Predictive Value: nan
[[4525 0] [ 297 0]] Positive Predictive Value: nan
[[4525 0] [ 297 0]] Positive Predictive Value: nan
Comment: KNN performs best when k=2 achieving a Positive Predictive Value of 0.182, which is slightly worse but not disimilar to the boosting result (0.205). The boosting model also excels in achieving roughly 8 times more True Positive predictions, so we would certainly choose the boosting model in this case.
It is worth noting that the dataset in this setting is highly dimensional with over 80 predictors, so the effectiveness of KNN is likely hindered by the curse of dimensionality.