For this exercise you will implement AdaBoost from scratch and applied it to a spam dataset. You will be classifying data into spam and not spam. You can call DecisionTreeClassifier from sklearn to learn your base classifiers.
Here is how you train a decision tree classifier with weights.
h = DecisionTreeClassifier(max_depth=1, random_state=0) h.fit(X, Y, sample_weight=w)
%matplotlib inline
import numpy as np
from sklearn.tree import DecisionTreeClassifier
# accuracy computation
def accuracy(y, pred):
return np.sum(y == pred) / float(len(y))
def parse_spambase_data(filename):
""" Given a filename return X and Y numpy arrays
X is of size number of rows x num_features
Y is an array of size the number of rows
Y is the last element of each row. (Convert 0 to -1)
"""
# YOUR CODE HERE
f = open(filename,'r')
data = []
for line in f.readlines():
data.append(line.split(','))
X = np.array(data)[:,:-1].astype(np.float16)
Y = np.array(data)[:,-1]
Y = np.array([int(i.replace('\n','')) for i in Y])
Y[Y==0] = -1
# raise NotImplementedError()
return X, Y
y_test = np.array([1., -1., 1., 1., -1., -1., 1., 1., 1., -1.])
X, Y = parse_spambase_data("tiny.spam.train")
for i in range(len(y_test)): assert(y_test[i] == Y[i])
n, m = X.shape
assert(n == 10)
assert(m == 57)
def adaboost(X, y, num_iter):
"""Given an numpy matrix X, a array y and num_iter return trees and weights
Input: X, y, num_iter
Outputs: array of trees from DecisionTreeClassifier
trees_weights array of floats
Assumes y is {-1, 1}
"""
trees = []
trees_weights = []
N, _ = X.shape
d = np.ones(N) / N
# YOUR CODE HERE
for i in range(num_iter):
h = DecisionTreeClassifier(max_depth=1, random_state=0)
h.fit(X,y,sample_weight=d)
trees.append(h)
pred = h.predict(X)
err = np.sum((y!=pred)*d)/np.sum(d)
if err == 0:
err+=.000001
trees_weights.append(np.log((1-err)/err))
d[y!=pred] = d[y!=pred]*((1-err)/err)
# raise NotImplementedError()
return trees, trees_weights
X, Y = parse_spambase_data("tiny.spam.train")
trees, weights = adaboost(X, Y, 2)
assert(len(trees) == 2)
assert(len(weights) == 2)
assert(isinstance(trees[0], DecisionTreeClassifier))
x = np.array([[0, -1], [1, 0], [-1, 0]])
y = np.array([-1, 1, 1])
trees, weights = adaboost(x, y, 1)
h = trees[0]
pred = h.predict(x)
for i in range(len(y)): assert(pred[i] == y[i])
def adaboost_predict(X, trees, trees_weights):
"""Given X, trees and weights predict Y
"""
# X input, y output
N, _ = X.shape
y = np.zeros(N)
# YOUR CODE HERE
for i,t in enumerate(trees):
y = y + t.predict(X)*trees_weights[i]
y = np.sign(y)
# raise NotImplementedError()
return y
x = np.array([[0, -1], [1, 0], [-1, 0]])
y = np.array([-1, 1, 1])
trees, weights = adaboost(x, y, 1)
pred = adaboost_predict(x, trees, weights)
for i in range(len(y)):
assert(pred[i] == y[i])
X, Y = parse_spambase_data("spambase.train")
X_test, Y_test = parse_spambase_data("spambase.test")
trees, trees_weights = adaboost(X, Y, 10)
Yhat = adaboost_predict(X, trees, trees_weights)
Yhat_test = adaboost_predict(X_test, trees, trees_weights)
acc_test = accuracy(Y_test, Yhat_test)
acc_train = accuracy(Y, Yhat)
print("Train Accuracy %.4f" % acc_train)
print("Test Accuracy %.4f" % acc_test)
assert(np.around(acc_train, decimals=4)==0.9111)
assert(np.around(acc_test, decimals=4)==0.9190)
Train Accuracy 0.9111 Test Accuracy 0.9190
num_trees = [100,200,300,400,500,600,700,800,900,1000,1200,1400,1600,1800,2000]
err_train = []
err_test = []
for n in num_trees:
trees, trees_weights = adaboost(X, Y, n)
Yhat = adaboost_predict(X, trees, trees_weights)
Yhat_test = adaboost_predict(X_test, trees, trees_weights)
err_test.append(1-accuracy(Y_test, Yhat_test))
err_train.append(1-accuracy(Y, Yhat))
import matplotlib.pyplot as plt
plt.plot(num_trees,err_train)
plt.plot(num_trees,err_test)
plt.legend(['error-train', 'error-test'], loc='upper left')
plt.show()
We can see that validation (test) error is minimum for 1600 trees and after that the error starts to go up. From this experiment we can conclude that the best value of number of trees will be 1600.
print(f'Train accuracy for the best value of num_trees:{100*(1-err_train[12])}')
print(f'Test accuracy for the best value of num_trees:{100*(1-err_test[12])}')
Train accuracy for the best value of num_trees:95.77777777777777 Test accuracy for the best value of num_trees:95.6
import xgboost as xgb
import pandas as pd
import numpy as np
Data needs to be stored in DMatrix
object which is designed to handle sparse datasets.
X, Y = parse_spambase_data("spambase.train")
X_test, Y_test = parse_spambase_data("spambase.test")
Y[Y==-1] = 0
Y_test[Y_test==-1] = 0
dtrain = xgb.DMatrix(X,Y)
dtest = xgb.DMatrix(X_test,Y_test)
print("Train dataset contains {0} rows and {1} columns".format(dtrain.num_row(), dtrain.num_col()))
print("Test dataset contains {0} rows and {1} columns".format(dtest.num_row(), dtest.num_col()))
Train dataset contains 3600 rows and 57 columns Test dataset contains 1000 rows and 57 columns
print("Train possible labels: ")
print(np.unique(dtrain.get_label()))
print("\nTest possible labels: ")
print(np.unique(dtest.get_label()))
Train possible labels: [ 0. 1.] Test possible labels: [ 0. 1.]
Let's make the following assuptions and adjust algorithm parameters to it:
'objective':'binary:logistic'
),'max_depth':2
),'silent':1
),'eta':1
),params = {
'objective':'binary:logistic',
'max_depth':3,
'silent':1,
'eta':1
}
num_rounds = 1000
We can also observe performance on test dataset using watchlist
watchlist = [(dtest,'test'), (dtrain,'train')] # native interface only
bst = xgb.train(params, dtrain, num_rounds, evals=watchlist,early_stopping_rounds=10,verbose_eval=False)
bst.best_iteration
83
watchlist = [(dtest,'test'), (dtrain,'train')] # native interface only
bst = xgb.train(params, dtrain, num_boost_round=83, evals=watchlist,verbose_eval=False)
# #print dettails for each treee
# bst.dump_model(fout='featmap.txt',with_stats=True)
# trees_dump = bst.get_dump(fmap='featmap.txt', with_stats=True)
# for tree in trees_dump:
# print(tree)
preds_prob = bst.predict(dtest)
preds_prob.shape
(1000,)
labels = dtest.get_label()
preds = preds_prob > 0.5 # threshold
correct = 0
for i in range(len(preds)):
if (labels[i] == preds[i]):
correct += 1
print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-correct/len(preds)))
Predicted correctly: 964/1000 Error: 0.0360
xgb.plot_importance(bst, importance_type='gain', xlabel='Gain',max_num_features=20,show_values=False) #based on gain
<matplotlib.axes._subplots.AxesSubplot at 0x1120e0c88>
xgb.plot_importance(bst,max_num_features=20,height=.5) #based on F-score
<matplotlib.axes._subplots.AxesSubplot at 0x111e7be48>
print("Train dataset contains {0} rows and {1} columns".format(X.shape[0], X.shape[1]))
print("Test dataset contains {0} rows and {1} columns".format(X_test.shape[0], X_test.shape[1]))
Train dataset contains 3600 rows and 57 columns Test dataset contains 1000 rows and 57 columns
print("Train possible labels: ")
print(np.unique(Y))
print("\nTest possible labels: ")
print(np.unique(Y_test))
Train possible labels: [0 1] Test possible labels: [0 1]
params = {
'objective':'binary:logistic',
'max_depth':3,
'silent':1,
'eta':1,
'num_rounds':83
}
bst = xgb.XGBClassifier(**params).fit(X, Y)
preds = bst.predict(X_test)
preds[:5]
array([0, 1, 0, 0, 1])
from sklearn.metrics import accuracy_score
correct = 0
for i in range(len(preds)):
if (Y_test[i] == preds[i]):
correct += 1
acc = accuracy_score(Y_test, preds)
print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-acc))
Predicted correctly: 955/1000 Error: 0.0450
import matplotlib.pyplot as plt
from sklearn.learning_curve import validation_curve
from sklearn.model_selection import StratifiedKFold
from xgboost.sklearn import XGBClassifier
from scipy.sparse import vstack
seed = 123
np.random.seed(seed)
default_params = {
'objective': 'binary:logistic',
'max_depth': 3,
'learning_rate': 0.3,
'silent': 1.0
}
n_estimators_range = np.linspace(1, 300, 10).astype('int')
train_scores, test_scores = validation_curve(
xgb.XGBClassifier(**default_params),
X, Y,
param_name = 'n_estimators',
param_range = n_estimators_range,
cv=3,
scoring='accuracy'
)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
fig = plt.figure(figsize=(10, 6), dpi=100)
plt.title("Validation Curve with XGBoost (eta = 0.3)")
plt.xlabel("number of trees")
plt.ylabel("Accuracy")
plt.ylim(0.7, 1.1)
plt.plot(n_estimators_range,
train_scores_mean,
label="Training score",
color="r")
plt.plot(n_estimators_range,
test_scores_mean,
label="Cross-validation score",
color="g")
plt.fill_between(n_estimators_range,
train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std,
alpha=0.2, color="r")
plt.fill_between(n_estimators_range,
test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std,
alpha=0.2, color="g")
plt.axhline(y=1, color='k', ls='dashed')
plt.legend(loc="best")
plt.show()
i = np.argmax(test_scores_mean)
print("Best cross-validation result ({0:.2f}) obtained for {1} trees".format(test_scores_mean[i], n_estimators_range[i]))
Best cross-validation result (0.95) obtained for 100 trees
Looking at the plot we can draw the following conclusions:
We can assume that the trade-off for our model will be met at n_estimators = 100
.
If model is too complex try:
In XGBoost you can try to:
max_depth
),min_child_weight
parameter,gamma
parameter,subsample
, colsample_bytree
parameters,lambda
and alpha
regularization parametersIf model is too simple:
In XGBoost you can do it by:
max_depth
),min_child_weight
parameter,gamma
parameter,lambda
and alpha
regularization parametersLet's try to tweak a parameters a little bit. We are going to add some randomness - each tree we will use 70% randomly chosen samples and 60% randomly chosen features. This should help to reduce a variance.
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
params_fixed = {
'objective': 'binary:logistic',
'silent': 1,
'n_estimators':100
}
params_dist_grid = {
'max_depth': [1, 2, 3, 4],
'gamma': [0, 0.5, 1],
'learning_rate': uniform(), # gaussian distribution
'subsample': uniform(), # gaussian distribution
'colsample_bytree': uniform() # gaussian distribution
}
rs_grid = RandomizedSearchCV(
estimator=xgb.XGBClassifier(**params_fixed, seed=100),
param_distributions=params_dist_grid,
n_iter=50,
cv=3,
scoring='accuracy',
random_state=100
)
rs_grid.fit(X, Y)
RandomizedSearchCV(cv=3, error_score='raise', estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=100, silent=1, subsample=1), fit_params={}, iid=True, n_iter=50, n_jobs=1, param_distributions={'max_depth': [1, 2, 3, 4], 'gamma': [0, 0.5, 1], 'learning_rate': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1125980f0>, 'subsample': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1125983c8>, 'colsample_bytree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x111875978>}, pre_dispatch='2*n_jobs', random_state=100, refit=True, return_train_score=True, scoring='accuracy', verbose=0)
rs_grid.best_estimator_
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bytree=0.73299501983799231, gamma=0, learning_rate=0.25155065329556847, max_delta_step=0, max_depth=2, min_child_weight=1, missing=None, n_estimators=100, n_jobs=1, nthread=None, objective='binary:logistic', random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=100, silent=1, subsample=0.40588132236756125)
rs_grid.best_params_
{'colsample_bytree': 0.73299501983799231, 'gamma': 0, 'learning_rate': 0.25155065329556847, 'max_depth': 2, 'subsample': 0.40588132236756125}
rs_grid.best_score_
0.94694444444444448
params = {
'objective':'binary:logistic',
'silent':1,
'n_estimators': 100,
'colsample_bytree': 0.73299501983799231,
'gamma': 0,
'learning_rate': 0.25155065329556847,
'max_depth': 2,
'subsample': 0.40588132236756125}
bst = xgb.XGBClassifier(**params).fit(X, Y)
preds = bst.predict(X_test)
preds[:5]
array([0, 1, 0, 0, 1])
from sklearn.metrics import accuracy_score
correct = 0
for i in range(len(preds)):
if (Y_test[i] == preds[i]):
correct += 1
acc = accuracy_score(Y_test, preds)
print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-acc))
Predicted correctly: 956/1000 Error: 0.0440
So we're getting a better result using learning API and early stopping rounds
params = {
'objective':'binary:logistic',
'max_depth':3,
'silent':1,
'eta':1
}
bst = xgb.train(params, dtrain, num_boost_round=83,verbose_eval=False)
preds_prob = bst.predict(dtest)
preds_prob.shape
(1000,)
labels = dtest.get_label()
preds = preds_prob > 0.5 # threshold
correct = 0
for i in range(len(preds)):
if (labels[i] == preds[i]):
correct += 1
print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-correct/len(preds)))
Predicted correctly: 964/1000 Error: 0.0360