#!/usr/bin/env python
# coding: utf-8

# # Machine Learning Using Python (MEAFA Workshop)
#
# ## Lesson 6: Boosting
#
# Boosting is one of the most powerful learning algorithms developed in recent decades. In most settings, a carefully tuned boosting model is likely to be among the best algorithms immediately available for prediction, if not the best. A large-scale empirical study by [Caruana and Niculescu-Mizil (2006)](https://dl.acm.org/ft_gateway.cfm?id=1143865&ftid=364245&dwn=1&CFID=7573757&CFTOKEN=aea5af1b7b29f94-E2D75B9E-EE4F-186B-FF2FDBF1F8C435D1) found that boosting was overall the most accurate algorithm for supervised learning across a variety of datasets, among competitors that included random forests, support vector machines, and neural networks. Boosting is also behind the [winning solutions](https://github.com/dmlc/xgboost/tree/master/demo#machine-learning-challenge-winning-solutions) to several machine learning competitions.
#
# In this lesson we will study a regression problem to illustrate how to use boosting with Python. Due to the importance of this topic, we will move beyond Scikit-Learn to consider two specialised packages for gradient boosting: [XGBoost](http://xgboost.readthedocs.io/en/latest/model.html) and [LightGBM](https://github.com/Microsoft/LightGBM).
#
# California Housing Data
# Exploratory Data Analysis
# Boosting
# XGBoost
# LightGBM
# Benchmark Models
# Model Evaluation
#
# This notebook relies on the following libraries and settings.

# In[1]:


# Packages
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


# In[2]:


# Plot settings
sns.set_context('notebook')
sns.set_style('ticks')
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2', '#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7', '#F28E2C', '#E15759', '#76B7B2', '#59A14F', '#EDC949', '#AF7AA1', '#FF9DA7', '#9C755F', '#BAB0AB']
sns.set_palette(colours)
get_ipython().run_line_magic('matplotlib', 'inline')
plt.rcParams['figure.figsize'] = (9, 6)


# In[3]:


# Methods
from sklearn.linear_model import LinearRegression

# Model selection and evaluation tools
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


# ## California Housing Data
#
# ### Data Processing
#
# We use the California housing dataset, which we can obtain directly from the Scikit-Learn distribution.

# In[4]:


from sklearn.datasets import fetch_california_housing

raw = fetch_california_housing()
print(raw.DESCR)


# I prefer to work with pandas, so the next cell stores the dataset in a dataframe.

# In[5]:


data = pd.DataFrame(raw.data, columns=raw.feature_names)
data['MedianHouseValue'] = raw.target
data.head()


# Some of the predictors have severe outliers, which we remove for simplicity.

# In[6]:


# Remove observations with an unusually high average number of rooms
# (the exact cut-off is assumed here; adjust as appropriate)
data = data[data['AveRooms'] < 10]


# The response is very right skewed, so we work with its log transformation.

# In[7]:


response = data.columns[-1]  # last column in the dataframe
predictors = list(data.columns[:-1])  # all columns except the last

index_train, index_test = train_test_split(np.array(data.index), train_size=0.2, random_state=1)

train = data.loc[index_train, :].copy()
test = data.loc[index_test, :].copy()

y_train = np.log(train[response])
y_test = np.log(test[response])

X_train = train[predictors].copy()
X_test = test[predictors].copy()


# ## Exploratory Data Analysis
#
# A key feature of this dataset is the presence of geographical information. The next cell draws a map to allow us to visualise the relationship between location and median house prices. Warmer colours indicate higher prices, while the size of the bubbles indicates the population of the area.
#
# The house prices are higher near the coast, around the Bay Area, and around Los Angeles. As we will see, the geographical patterns will be crucial for predictive accuracy.
#
# Note the following technical details:
#
# (a) You need to install the basemap package to run the cell and render the map, with additional installation required for the full resolution option (you can remove it).
#
# (b) The cell will take nearly a minute to run because of the full resolution option.
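#
# If installing basemap proves difficult, a plain longitude/latitude scatter (an optional fallback, not part of the original lesson) conveys the same coastal price pattern without any extra dependencies.

# In[ ]:


# Fallback map: scatter the training observations by their coordinates,
# colouring by median house value and sizing by population
fig, ax = plt.subplots(figsize=(8, 8))
points = ax.scatter(train['Longitude'], train['Latitude'], s=train['Population']/30,
                    c=train['MedianHouseValue'], cmap='rainbow', alpha=0.5)
fig.colorbar(points, ax=ax, label='Median house value')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
sns.despine()
plt.tight_layout()
plt.show()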
# In[8]:


from mpl_toolkits.basemap import Basemap

def california_map(ax=None, lllat=31.5, urlat=42.5, lllon=-124, urlon=-113):
    # This function is based on "Data Analytics Using Open-Source Tools" by Jeffrey Strickland
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2,
                lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    m.drawstates()
    m.drawcountries()
    m.drawcoastlines(color='lightblue')
    return m

# Plot figure
fig, ax = plt.subplots(figsize=(9, 9))
m = california_map()
x, y = m(train['Longitude'].values, train['Latitude'].values)
cmap = sns.diverging_palette(220, 10, as_cmap=True)
m.scatter(x, y, s=train['Population']/30, c=train['MedianHouseValue'],
          edgecolors='none', cmap=plt.get_cmap('rainbow'), alpha=0.5)
ax.set_title('Median house values in California (training data)', fontsize=17, y=1.01, fontweight='bold')
ax.spines['bottom'].set_color('#DDDDDD')
ax.spines['top'].set_color('#DDDDDD')
ax.spines['right'].set_color('#DDDDDD')
ax.spines['left'].set_color('#DDDDDD')
plt.tight_layout()
plt.show()


# The two variables with the strongest linear relationships with house values are the median income of the area and the average number of rooms. Further exploration through the scatter plots below reveals nonlinear patterns for the median income, average rooms, and average occupancy. Population, housing average age, and average bedrooms seem to have only weak relationships with house values.

# In[9]:


train.corr().round(3)


# In[10]:


with sns.color_palette(crayon):
    fig, axes = plt.subplots(2, 3, figsize=(12, 8))
    for i, ax in enumerate(fig.axes):
        sns.regplot(X_train.iloc[:, i], y_train, scatter_kws={'s': 15, 'alpha': 0.5}, ax=ax, fit_reg=False)
        ax.set_ylabel('Log median house value')
    ax.set_xlim(1, 6)  # fixes a bug in the last plot
    sns.despine()
    plt.tight_layout()
    plt.show()


# ## Boosting
#
# We start with the Scikit-Learn implementation of boosting available in the [GradientBoostingRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html) class. Recall that boosting has three crucial tuning parameters:
#
# * The learning rate.
# * The number of trees.
# * The size of each tree.
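#
# To make these parameters concrete, here is a minimal hand-rolled sketch of gradient boosting for squared-error loss (an illustrative addition, not part of the original lesson): each shallow tree is fit to the residuals of the current ensemble, and its contribution is shrunk by the learning rate. The specialised libraries used below implement this idea far more efficiently.

# In[ ]:


from sklearn.tree import DecisionTreeRegressor

def simple_boosting(X, y, learning_rate=0.05, n_estimators=100, max_depth=4):
    """Fit shallow trees sequentially to the residuals of the current ensemble."""
    prediction = np.full(len(y), float(y.mean()))  # start from the sample mean
    trees = []
    for _ in range(n_estimators):
        residuals = y - prediction                         # negative gradient of the squared-error loss
        tree = DecisionTreeRegressor(max_depth=max_depth)  # the tree size controls the interaction depth
        tree.fit(X, residuals)
        prediction = prediction + learning_rate * tree.predict(X)  # shrink each tree's contribution
        trees.append(tree)
    return trees, prediction

# For example (kept small so that it runs quickly):
# trees, fitted = simple_boosting(X_train, y_train, learning_rate=0.1, n_estimators=50, max_depth=3)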
#
# In addition to these three parameters, we may want to use stochastic gradient boosting by fitting each tree based on a subsample of the training data.
#
# The basic syntax for fitting a gradient boosting regressor with Scikit-Learn is as follows.

# In[11]:


from sklearn.ensemble import GradientBoostingRegressor

gb = GradientBoostingRegressor(learning_rate=0.05, max_depth=4, n_estimators=750, subsample=1.0)
gb.fit(X_train, y_train)


# We use a randomised search to tune the model. Note that it is useful to keep track of the running time, as the presence of multiple tuning parameters can make this process slow.

# In[12]:


get_ipython().run_cell_magic('time', '', "\nmodel = GradientBoostingRegressor()\n\ntuning_parameters = {\n    'learning_rate': [0.01, 0.05, 0.1],\n    'n_estimators' : [250, 500, 750, 1000, 1500, 2000],\n    'max_depth' : [2, 3, 4],\n    'subsample' : [0.6, 0.8, 1.0]\n}\n\n# Using GridSearchCV would be too slow. Increase the number of iterations to explore more hyperparameter combinations.\ngb = RandomizedSearchCV(model, tuning_parameters, n_iter = 20, cv = 10, return_train_score=False, n_jobs=4)\ngb.fit(X_train, y_train)\n\nprint('Best parameters found by randomised search:', gb.best_params_, '\\n')\n")


# In[13]:


gb.best_estimator_


# I have separately coded a function to plot the importance of the variables.

# In[14]:


from statlearning import plot_feature_importance

plot_feature_importance(gb.best_estimator_, predictors)
plt.show()


# ## XGBoost
#
# [XGBoost](https://xgboost.readthedocs.io/en/latest/) is a state-of-the-art gradient boosting library that is very popular among [Kaggle](https://www.kaggle.com/) users. The easiest way to get started with XGBoost is to use the [Scikit-Learn API](https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn) provided by the package. That makes the syntax identical to what we did above, except that we call the [XGBRegressor](http://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBRegressor) class from the XGBoost package.

# In[15]:


import xgboost as xgb

bst = xgb.XGBRegressor(learning_rate=0.05, max_depth=4, n_estimators=750, subsample=1.0)
bst.fit(X_train, y_train)


# In[16]:


get_ipython().run_cell_magic('time', '', "\nmodel = xgb.XGBRegressor()\n\ntuning_parameters = {\n    'learning_rate': [0.01, 0.05, 0.1],\n    'n_estimators' : [250, 500, 750, 1000, 1500, 2000],\n    'max_depth' : [2, 3, 4],\n    'subsample' : [0.6, 0.8, 1.0]\n}\n\nbst = RandomizedSearchCV(model, tuning_parameters, n_iter = 20, cv = 10, return_train_score=False, n_jobs=4)\nbst.fit(X_train, y_train)\n\nprint('Best parameters found by randomised search:', bst.best_params_, '\\n')\n")


# XGBoost also supports regularisation of the tree weights (the alpha and lambda hyperparameters), even though we do not use these features here.

# In[17]:


bst.best_estimator_


# In[18]:


plot_feature_importance(bst.best_estimator_, labels=predictors)
plt.show()


# XGBoost's functionality extends well beyond the Scikit-Learn API. Below, we construct a pure XGBoost implementation to select the number of boosting iterations by cross-validation and early stopping.
# In[19]:


get_ipython().run_cell_magic('time', '', "\ndtrain = xgb.DMatrix(X_train, y_train)  # we need to convert the data to the format required by xgboost\ndtest = xgb.DMatrix(X_test)\n\nparam = {\n    'max_depth': 4,\n    'learning_rate': 0.05,\n    'subsample': 0.8,\n    'silent': 0,\n    'objective': 'reg:linear',\n}\n\ncv = xgb.cv(param, dtrain, num_boost_round = 2000, nfold=10, early_stopping_rounds=50)\n\nprint(f'Selected number of boosting iterations: {cv.shape[0]}')\nprint(f'RMSE (CV): {cv.iloc[-1,0]:.4f}')\n")


# In[20]:


fig, ax = plt.subplots(figsize=(8, 5))
plt.plot(cv.iloc[:, 0])
ax.set_ylabel('Cross-validation RMSE')
ax.set_xlabel('Boosting iterations')
sns.despine()
plt.show()


# ## LightGBM
#
# [LightGBM](http://lightgbm.readthedocs.io/en/latest/index.html) is a gradient boosting library developed by Microsoft, and a competitor to XGBoost. Similarly to XGBoost, it provides a [Scikit-Learn API](http://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api) that makes it simple to use. Notice how it runs much faster than the Scikit-Learn implementation of boosting.

# In[21]:


import lightgbm as lgb


# In[22]:


get_ipython().run_cell_magic('time', '', "\nmodel = lgb.LGBMRegressor(objective='regression')\n\ntuning_parameters = {\n    'learning_rate': [0.01, 0.05, 0.1],\n    'n_estimators' : [250, 500, 750, 1000, 1500, 2000],\n    'max_depth' : [2, 3, 4],\n    'subsample' : [0.6, 0.8, 1.0],\n}\n\nlbst = RandomizedSearchCV(model, tuning_parameters, n_iter = 20, cv = 10, return_train_score=False, n_jobs=4)\nlbst.fit(X_train, y_train)\n\nprint('Best parameters found by randomised search:', lbst.best_params_, '\\n')\n")


# ## Benchmark Models
#
# ### OLS

# In[23]:


ols = LinearRegression()
ols.fit(X_train, y_train)


# ### Regression Tree
#
# The basic syntax for fitting a regression tree by recursive binary splitting is the following, where we allow the tree to have a maximum depth of two for visualisation. I also specify the minimum number of samples in the terminal nodes to highlight the importance of explicitly controlling this tuning parameter.

# In[24]:


from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor(max_depth=2, min_samples_leaf=5)
tree.fit(X_train, y_train)


# To plot the decision tree, we need to use the python-graphviz package. You should experiment with letting the tree grow a few more steps and interpreting the result.

# In[25]:


from sklearn.tree import export_graphviz
import graphviz

dot_data = export_graphviz(tree, out_file=None, feature_names=predictors, impurity=False, rounded=True)
graph = graphviz.Source(dot_data)
graph.render('tree')
graph


# Next, we find a tree of optimal size for prediction.

# In[26]:


# This cell may take half a minute to run
model = DecisionTreeRegressor(min_samples_leaf=5)

tuning_parameters = {
    'min_samples_leaf': [1, 5, 10, 20],
    'max_depth': np.arange(1, 30),
}

tree = RandomizedSearchCV(model, tuning_parameters, n_iter=20, cv=10, return_train_score=False)
tree.fit(X_train, y_train)

print('Best parameters:', tree.best_params_)


# ### Random Forest

# In[27]:


from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=2000, max_features=3, min_samples_leaf=1)
rf.fit(X_train, y_train)


# Recall that the key tuning parameters for a random forest are (i) the number of randomly selected candidate variables for each split and (ii) the minimum node size. I selected the specification above with a grid search (omitted here to save time).
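#
# As a rough illustration, the omitted search might look like the sketch below (the grid is hypothetical, not the one actually used to select the specification above).

# In[ ]:


from sklearn.model_selection import GridSearchCV

# Illustrative tuning sketch for the random forest (hypothetical grid)
rf_model = RandomForestRegressor(n_estimators=500)

rf_tuning_parameters = {
    'max_features': [2, 3, 4, 6],
    'min_samples_leaf': [1, 5, 10],
}

rf_search = GridSearchCV(rf_model, rf_tuning_parameters, cv=5, return_train_score=False, n_jobs=4)
rf_search.fit(X_train, y_train)

print('Best parameters:', rf_search.best_params_)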
# ## Model Evaluation

# In[28]:


columns = ['Test RMSE', 'Test R2', 'Test MAE']
rows = ['Linear regression', 'Decision tree', 'Random forest', 'SKLearn Boost', 'XGBoost', 'LightGBM']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [ols, tree, rf, gb, bst, lbst]

for i, method in enumerate(methods):
    y_pred = method.predict(X_test)
    results.iloc[i, 0] = np.sqrt(mean_squared_error(y_test, y_pred))
    results.iloc[i, 1] = r2_score(y_test, y_pred)
    results.iloc[i, 2] = mean_absolute_error(y_test, y_pred)

results.round(3)
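#
# Since the models were fit to the log of the median house value, it can help to read the errors on the original scale as well. The cell below (an illustrative addition, not part of the original notebook) exponentiates the XGBoost predictions and reports the test MAE in the original units of the target (hundreds of thousands of dollars).

# In[ ]:


# Back-transform the log-scale predictions to the original scale (illustrative addition)
y_pred_level = np.exp(bst.predict(X_test))
y_test_level = np.exp(y_test)

print(f'XGBoost test MAE (original scale): {mean_absolute_error(y_test_level, y_pred_level):.3f}')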