In this lesson we revisit the Ames house pricing dataset of De Cock (2011) and the corresponding Kaggle competition. Our goal is to develop a machine learning system that performs well in the competition. Our final solution is based on model stacking, with linear regression, regularised linear models, and gradient boosting as components.
House Pricing Data
Linear Regression
Regularised Linear Models
Regression Tree
Random Forest
Gradient Boosting
Model Stacking
Model Evaluation
Making a Submission on Kaggle
This notebook relies on the following libraries and settings.
# Packages
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
data=pd.read_csv('Datasets/AmesHousing-Processed.csv')
data.head()
| | 1stFlrSF | 2ndFlrSF | 3SsnPorch | Age | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | EnclosedPorch | GarageArea | LotArea | ... | RoofMatl_Other | RoofStyle_Hip | RoofStyle_Other | ScreenPorchZero | WoodDeckSFZero | YrSold_2007 | YrSold_2008 | YrSold_2009 | YrSold_2010 | SalePrice |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1656 | 0 | 0 | 50 | 639.0 | 0.0 | 441.0 | 0 | 528.0 | 31770 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 215000 |
| 1 | 896 | 0 | 0 | 49 | 468.0 | 144.0 | 270.0 | 0 | 730.0 | 11622 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 105000 |
| 2 | 1329 | 0 | 0 | 52 | 923.0 | 0.0 | 406.0 | 0 | 312.0 | 14267 | ... | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 172000 |
| 3 | 2110 | 0 | 0 | 42 | 1065.0 | 0.0 | 1045.0 | 0 | 522.0 | 11160 | ... | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 244000 |
| 4 | 928 | 701 | 0 | 13 | 791.0 | 0.0 | 137.0 | 0 | 482.0 | 13830 | ... | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 189900 |

5 rows × 196 columns
We then split the data into training and test sets. We use a small training set to better illustrate the advantages of regularisation.
response='SalePrice'
predictors=list(data.columns.values[:-1])
# Randomly split indexes
index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=5)
# Write training and test sets
train = data.loc[index_train,:].copy()
test = data.loc[index_test,:].copy()
# Write training and test response vectors
y_train = np.log(train[response])
y_test = np.log(test[response])
# Write training and test design matrices
X_train = train[predictors].copy()
X_test = test[predictors].copy()
ols = LinearRegression()
ols.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', LassoCV(cv=5)),
])
lasso.fit(X_train, y_train)
Pipeline(memory=None, steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('estimator', LassoCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False, precompute='auto', random_state=None, selection='cyclic', tol=0.0001, verbose=False))])
alphas = list(np.logspace(-15, 15, 151, base=2))
ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', RidgeCV(alphas=alphas, cv=5)),
])
ridge.fit(X_train, y_train)
Pipeline(memory=None, steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('estimator', RidgeCV(alphas=[3.0517578125e-05, 3.5055491790680982e-05, 4.0268185753567341e-05, 4.6255998733837822e-05, 5.3134189654304478e-05, 6.103515625e-05, 7.0110983581361965e-05, 8.0536371507134683e-05, 9.251199746767...cv=5, fit_intercept=True, gcv_mode=None, normalize=False, scoring=None, store_cv_values=False))])
enet = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', ElasticNetCV(l1_ratio=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99], cv=5)),
])
enet.fit(X_train, y_train)
Pipeline(memory=None, steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('estimator', ElasticNetCV(alphas=None, copy_X=True, cv=5, eps=0.001, fit_intercept=True, l1_ratio=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99], max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False, precompute='auto', random_state=None, selection='cyclic', tol=0.0001, verbose=0))])
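It is worth checking how much regularisation cross-validation selected. The quick check below is a sketch that reads the chosen penalties off the fitted pipelines; alpha_ and l1_ratio_ are the standard attributes of the scikit-learn CV estimators used above.
# Regularisation parameters selected by cross-validation
print('Lasso alpha:', lasso.named_steps['estimator'].alpha_)
print('Ridge alpha:', ridge.named_steps['estimator'].alpha_)
print('Elastic net alpha:', enet.named_steps['estimator'].alpha_,
      'l1_ratio:', enet.named_steps['estimator'].l1_ratio_)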
%%time
model = DecisionTreeRegressor(min_samples_leaf=5)
tuning_parameters = {
'min_samples_leaf': [1,5,10,20],
'max_depth': np.arange(1,30),
}
tree = RandomizedSearchCV(model, tuning_parameters, n_iter=20, cv=5, return_train_score=False)
tree.fit(X_train, y_train)
print('Best parameters:', tree.best_params_)
Best parameters: {'min_samples_leaf': 5, 'max_depth': 7} Wall time: 2.17 s
%%time
model = RandomForestRegressor(n_estimators=100)
tuning_parameters = {
'min_samples_leaf': [1,5, 10, 20, 50],
'max_features': np.arange(1, X_train.shape[1], 5),
}
rf_search = RandomizedSearchCV(model, tuning_parameters, cv=5, n_iter=16, return_train_score=False,
                               n_jobs=4, random_state=20)
rf_search.fit(X_train, y_train)
rf = rf_search.best_estimator_
print('Best parameters found by randomised search:', rf_search.best_params_, '\n')
Best parameters found by randomised search: {'min_samples_leaf': 1, 'max_features': 176} Wall time: 22.5 s
rf.n_estimators = 500
rf.fit(X_train, y_train)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None, max_features=176, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, min_samples_leaf=1, min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=1, oob_score=False, random_state=None, verbose=0, warm_start=False)
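A fitted random forest also provides a rough ranking of the predictors. The snippet below is a small sketch that uses the estimator's feature_importances_ attribute to list the ten most influential variables.
# Ten most influential predictors according to the random forest
importances = pd.Series(rf.feature_importances_, index=predictors)
print(importances.sort_values(ascending=False).head(10))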
%%time
model = lgb.LGBMRegressor(objective='regression')
tuning_parameters = {
'learning_rate': [0.01, 0.05, 0.1],
'n_estimators' : [250, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000],
'max_depth' : [2, 3, 4],
'subsample' : [0.6, 0.8, 1.0],
}
gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=128, cv=5, return_train_score=False,
                               n_jobs=4, random_state=20)
gb_search.fit(X_train, y_train)
lbst = gb_search.best_estimator_
print('Best parameters found by randomised search:', gb_search.best_params_, '\n')
Best parameters found by randomised search: {'subsample': 1.0, 'n_estimators': 1500, 'max_depth': 2, 'learning_rate': 0.05} Wall time: 5min 58s
%%time
model = xgb.XGBRegressor()
tuning_parameters = {
'learning_rate': [0.01, 0.05, 0.1],
'n_estimators' : [250, 500, 750, 1000, 1500, 2000, 3000, 5000],
'max_depth' : [2, 3, 4],
'subsample' : [0.6, 0.8, 1.0],
}
gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=16, cv=5, return_train_score=False,
                               n_jobs=4, random_state=20)
gb_search.fit(X_train, y_train)
xbst = gb_search.best_estimator_
print('Best parameters found by randomised search:', gb_search.best_params_, '\n')
Best parameters found by randomised search: {'subsample': 0.6, 'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.05} Wall time: 4min 46s
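Randomised search also records the cross-validated score of the best configuration, which is worth looking at before turning to the test set. With the default scoring this is the R² of the best parameter combination; note that gb_search now refers to the XGBoost search, since the variable was reused. The check below is a sketch.
# Cross-validated R^2 of the best XGBoost configuration found by the randomised search
print('Best CV R2 (XGBoost):', gb_search.best_score_)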
This is a more advanced specification. Since gradient boosting is an additive model fit by forward stagewise additive modelling, nothing stops us from fitting a gradient boosting model to the residuals of a linear regression, thereby boosting the linear model with additive trees.
The only disadvantage is that there is no readily available function for adding this model to our stack; a possible workaround is sketched after the tuning cell below.
%%time
y_fit = lasso.predict(X_train)
resid = y_train - y_fit
model = lgb.LGBMRegressor(objective='regression')
tuning_parameters = {
'learning_rate': [0.01, 0.05, 0.1],
'n_estimators' : [250, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000],
'max_depth' : [2, 3, 4],
'subsample' : [0.6, 0.8, 1.0],
}
gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=16, cv=5, return_train_score=False,
                               n_jobs=4, random_state=20)
gb_search.fit(X_train, resid)
abst = gb_search.best_estimator_
print('Best parameters found by randomised search:', gb_search.best_params_, '\n')
Best parameters found by randomised search: {'subsample': 0.8, 'n_estimators': 1500, 'max_depth': 2, 'learning_rate': 0.01} Wall time: 1min 6s
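One way around the lack of a ready-made function is to wrap the two-stage fit in a small scikit-learn style estimator, which can then be passed to a stacker like any other model. The class below is only a sketch of this idea; the name LinearBoostRegressor and its design are illustrative rather than an existing library API.
from sklearn.base import BaseEstimator, RegressorMixin, clone

class LinearBoostRegressor(BaseEstimator, RegressorMixin):
    # Fit a linear model, then boost its residuals with a tree ensemble
    def __init__(self, linear_model, booster):
        self.linear_model = linear_model
        self.booster = booster

    def fit(self, X, y):
        # First stage: the linear model; second stage: boosting fitted to its residuals
        self.linear_model_ = clone(self.linear_model).fit(X, y)
        residuals = y - self.linear_model_.predict(X)
        self.booster_ = clone(self.booster).fit(X, residuals)
        return self

    def predict(self, X):
        # Prediction is the linear fit plus the boosted residual correction
        return self.linear_model_.predict(X) + self.booster_.predict(X)

# For example, LinearBoostRegressor(lasso, abst) could be fitted on (X_train, y_train)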
%%time
from mlxtend.regressor import StackingCVRegressor
models = [ols, lasso, ridge, xbst]
stack = StackingCVRegressor(models, meta_regressor=LinearRegression(), cv=10)
stack.fit(X_train.values, y_train.ravel())
Wall time: 2min 18s
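Since the stack was fitted on numpy arrays, we also pass .values when predicting with it, and its predictions are on the log-price scale. The line below is a small sketch of how it is used in the evaluation that follows.
# Stack predictions are on the log-price scale; exponentiate to obtain prices in dollars
stack_pred = np.exp(stack.predict(X_test.values))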
columns=['Test RMSE', 'Test R2', 'Test MAE']
rows=['OLS', 'Lasso', 'Ridge', 'Elastic Net', 'Tree', 'Random Forest', 'LightGBM', 'XGBoost', 'Additive Boost', 'Stack']
results=pd.DataFrame(0.0, columns=columns, index=rows)
methods=[ols, lasso, ridge, enet, tree, rf, lbst, xbst, abst, stack]
# Evaluate every model on the original price scale (dollars)
for i, method in enumerate(methods):
    if method is stack:
        # mlxtend's StackingCVRegressor expects numpy arrays rather than DataFrames
        y_pred = np.exp(method.predict(X_test.values))
    elif method is abst:
        # additive boost: combine the lasso prediction with the boosted residual correction
        y_pred = np.exp(lasso.predict(X_test) + method.predict(X_test))
    else:
        y_pred = np.exp(method.predict(X_test))
    results.iloc[i, 0] = np.sqrt(mean_squared_error(np.exp(y_test), y_pred))
    results.iloc[i, 1] = r2_score(np.exp(y_test), y_pred)
    results.iloc[i, 2] = mean_absolute_error(np.exp(y_test), y_pred)
results.round(3)
| | Test RMSE | Test R2 | Test MAE |
|---|---|---|---|
| OLS | 14875.800 | 0.950 | 10572.203 |
| Lasso | 14791.127 | 0.951 | 10671.239 |
| Ridge | 14704.635 | 0.951 | 10519.765 |
| Elastic Net | 14791.233 | 0.951 | 10671.241 |
| Tree | 30543.080 | 0.791 | 20826.741 |
| Random Forest | 24308.127 | 0.867 | 14746.956 |
| LightGBM | 17561.108 | 0.931 | 11675.938 |
| XGBoost | 16813.901 | 0.937 | 11535.039 |
| Additive Boost | 14022.916 | 0.956 | 10221.708 |
| Stack | 13482.407 | 0.959 | 10032.881 |
columns=['Test RMSE', 'Test R2', 'Test MAE']
rows=['OLS', 'Lasso', 'Ridge', 'Elastic Net', 'Tree', 'Random Forest', 'LightGBM', 'XGBoost', 'Additive Boost', 'Stack']
results=pd.DataFrame(0.0, columns=columns, index=rows)
methods=[ols, lasso, ridge, enet, tree, rf, lbst, xbst, abst, stack]
# Repeat the evaluation on the log-price scale, the scale used by the Kaggle score
for i, method in enumerate(methods):
    if method is stack:
        y_pred = method.predict(X_test.values)
    elif method is abst:
        y_pred = lasso.predict(X_test) + method.predict(X_test)
    else:
        y_pred = method.predict(X_test)
    results.iloc[i, 0] = np.sqrt(mean_squared_error(y_test, y_pred))
    results.iloc[i, 1] = r2_score(y_test, y_pred)
    results.iloc[i, 2] = mean_absolute_error(y_test, y_pred)
results.round(3)
| | Test RMSE | Test R2 | Test MAE |
|---|---|---|---|
| OLS | 0.083 | 0.945 | 0.063 |
| Lasso | 0.085 | 0.942 | 0.063 |
| Ridge | 0.084 | 0.944 | 0.063 |
| Elastic Net | 0.085 | 0.942 | 0.063 |
| Tree | 0.160 | 0.796 | 0.120 |
| Random Forest | 0.113 | 0.898 | 0.082 |
| LightGBM | 0.090 | 0.935 | 0.067 |
| XGBoost | 0.088 | 0.939 | 0.066 |
| Additive Boost | 0.081 | 0.947 | 0.062 |
| Stack | 0.078 | 0.951 | 0.059 |
Using the methods from this lesson would lead to a competitive score in the Kaggle competition. Note that the Kaggle competition scores submissions on the RMSE of the log sale prices.
If you would like to try it, you will need to download the training and test sets from Kaggle and reprocess the data accordingly. Details on how I processed the data are available on request.
The next cell shows how to generate a submission file (see the instructions on Kaggle regarding the Id column, which does not exist in our version of the dataset).
# Predict with the best model (the stack) and convert back from the log scale to prices
y_pred = np.exp(stack.predict(X_test.values))
submission = pd.DataFrame(np.c_[test.index, y_pred], columns=['Id', response])
submission.to_csv('kaggle_submission.csv', index=False)