#!/usr/bin/env python
# coding: utf-8

# # Machine Learning Using Python (MEAFA Workshop)
# ## Lesson 8: Regression Application

# In this lesson we revisit the house pricing dataset of [De Cock (2011)](http://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627) and the corresponding [Kaggle competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques). Our goal is to develop a machine learning system that performs well in the competition. Our final solution is based on model stacking, using a linear regression, regularised linear models, and gradient boosting as components.
#
# Contents:
#
# - House Pricing Data
# - Linear Regression
# - Regularised Linear Models
# - Regression Tree
# - Random Forest
# - Gradient Boosting
# - Model Stacking
# - Model Evaluation
# - Making a Submission on Kaggle
# This notebook relies on the following libraries and settings.

# In[1]:

# Packages
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')


# In[2]:

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb


# ## House Pricing Data

# In[3]:

data = pd.read_csv('Datasets/AmesHousing-Processed.csv')
data.head()


# We then split the data into training and test sets. We use a small training dataset to better illustrate the advantages of regularisation.

# In[4]:

response = 'SalePrice'
predictors = list(data.columns.values[:-1])

# Randomly split indexes
index_train, index_test = train_test_split(np.array(data.index), train_size=0.7, random_state=5)

# Write training and test sets
train = data.loc[index_train, :].copy()
test = data.loc[index_test, :].copy()

# Write training and test response vectors (log prices)
y_train = np.log(train[response])
y_test = np.log(test[response])

# Write training and test design matrices
X_train = train[predictors].copy()
X_test = test[predictors].copy()


# ## Linear Regression

# In[5]:

ols = LinearRegression()
ols.fit(X_train, y_train)


# ## Regularised Linear Models

# ### Lasso

# In[6]:

lasso = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', LassoCV(cv=5)),
])
lasso.fit(X_train, y_train)


# ### Ridge Regression

# In[7]:

alphas = list(np.logspace(-15, 15, 151, base=2))

ridge = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', RidgeCV(alphas=alphas, cv=5)),
])
ridge.fit(X_train, y_train)


# ### Elastic Net

# In[8]:

enet = Pipeline([
    ('scaler', StandardScaler()),
    ('estimator', ElasticNetCV(l1_ratio=[0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99], cv=5)),
])
enet.fit(X_train, y_train)
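# As an aside (not part of the original lesson), we can inspect the penalty levels selected by cross-validation. This is a minimal sketch that assumes the `lasso`, `ridge`, and `enet` pipelines above have already been fitted.

# In[ ]:

# Optional check: regularisation parameters chosen by cross-validation
print('Lasso alpha:         ', lasso.named_steps['estimator'].alpha_)
print('Ridge alpha:         ', ridge.named_steps['estimator'].alpha_)
print('Elastic net alpha:   ', enet.named_steps['estimator'].alpha_)
print('Elastic net l1 ratio:', enet.named_steps['estimator'].l1_ratio_)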
# ## Regression Tree

# In[9]:

model = DecisionTreeRegressor(min_samples_leaf=5)

tuning_parameters = {
    'min_samples_leaf': [1, 5, 10, 20],
    'max_depth': np.arange(1, 30),
}

tree = RandomizedSearchCV(model, tuning_parameters, n_iter=20, cv=5, return_train_score=False)
tree.fit(X_train, y_train)

print('Best parameters:', tree.best_params_)


# ## Random Forest Regression

# In[10]:

model = RandomForestRegressor(n_estimators=100)

tuning_parameters = {
    'min_samples_leaf': [1, 5, 10, 20, 50],
    'max_features': np.arange(1, X_train.shape[1], 5),
}

rf_search = RandomizedSearchCV(model, tuning_parameters, cv=5, n_iter=16, return_train_score=False,
                               n_jobs=4, random_state=20)
rf_search.fit(X_train, y_train)

rf = rf_search.best_estimator_

print('Best parameters found by randomised search:', rf_search.best_params_, '\n')


# In[11]:

# Refit the best configuration with more trees
rf.n_estimators = 500
rf.fit(X_train, y_train)


# ## Gradient Boosting

# ### LightGBM

# In[12]:

model = lgb.LGBMRegressor(objective='regression')

tuning_parameters = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [250, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000],
    'max_depth': [2, 3, 4],
    'subsample': [0.6, 0.8, 1.0],
}

gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=128, cv=5, return_train_score=False,
                               n_jobs=4, random_state=20)
gb_search.fit(X_train, y_train)

lbst = gb_search.best_estimator_

print('Best parameters found by randomised search:', gb_search.best_params_, '\n')


# ### XGBoost

# In[13]:

model = xgb.XGBRegressor()

tuning_parameters = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [250, 500, 750, 1000, 1500, 2000, 3000, 5000],
    'max_depth': [2, 3, 4],
    'subsample': [0.6, 0.8, 1.0],
}

gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=16, cv=5, return_train_score=False,
                               n_jobs=4, random_state=20)
gb_search.fit(X_train, y_train)

xbst = gb_search.best_estimator_

print('Best parameters found by randomised search:', gb_search.best_params_, '\n')


# ### Additive Boosting
#
# This is an advanced specification. Since gradient boosting is an additive model fit by forward stagewise additive modelling, nothing stops us from fitting a gradient boosting model to the residuals of a linear regression specification, thereby boosting the linear model with additive trees.
#
# The only disadvantage is that there is no immediately available function to add this combined model to our stack (one possible workaround is sketched below).

# In[14]:

# Fit LightGBM to the residuals of the lasso predictions
y_fit = lasso.predict(X_train)
resid = y_train - y_fit

model = lgb.LGBMRegressor(objective='regression')

tuning_parameters = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [250, 500, 750, 1000, 1500, 2000, 3000, 4000, 5000],
    'max_depth': [2, 3, 4],
    'subsample': [0.6, 0.8, 1.0],
}

gb_search = RandomizedSearchCV(model, tuning_parameters, n_iter=16, cv=5, return_train_score=False,
                               n_jobs=4, random_state=20)
gb_search.fit(X_train, resid)

abst = gb_search.best_estimator_

print('Best parameters found by randomised search:', gb_search.best_params_, '\n')
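# The additive specification above cannot be passed to the stacking regressor directly. As one possible workaround (not part of the original lesson), we could wrap the base model and the residual booster in a small scikit-learn style estimator. The sketch below assumes the fitted `lasso` pipeline and the tuned LightGBM settings from the previous cell; the wrapper class itself is hypothetical.

# In[ ]:

from sklearn.base import BaseEstimator, RegressorMixin, clone


class AdditiveBoostRegressor(BaseEstimator, RegressorMixin):
    """Hypothetical wrapper: fit a base regressor, then boost its residuals."""

    def __init__(self, base_estimator, residual_estimator):
        self.base_estimator = base_estimator
        self.residual_estimator = residual_estimator

    def fit(self, X, y):
        # Fit the base model, then fit the residual model on what it misses
        self.base_ = clone(self.base_estimator).fit(X, y)
        resid = y - self.base_.predict(X)
        self.resid_ = clone(self.residual_estimator).fit(X, resid)
        return self

    def predict(self, X):
        # Combined prediction: base model plus boosted residuals
        return self.base_.predict(X) + self.resid_.predict(X)


# Illustrative only: such an estimator could then be added to the models list below, e.g.
# abst_wrapped = AdditiveBoostRegressor(lasso, lgb.LGBMRegressor(objective='regression', **gb_search.best_params_))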
# ## Model Stacking

# In[15]:

from mlxtend.regressor import StackingCVRegressor

models = [ols, lasso, ridge, xbst]

stack = StackingCVRegressor(models, meta_regressor=LinearRegression(), cv=10)
stack.fit(X_train.values, y_train.values.ravel())


# ## Model Evaluation

# ### Original prices

# In[16]:

columns = ['Test RMSE', 'Test R2', 'Test MAE']
rows = ['OLS', 'Lasso', 'Ridge', 'Elastic Net', 'Tree', 'Random Forest', 'LightGBM', 'XGBoost',
        'Additive Boost', 'Stack']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [ols, lasso, ridge, enet, tree, rf, lbst, xbst, abst, stack]

for i, method in enumerate(methods):
    if method is not stack:
        y_pred = np.exp(method.predict(X_test))
        if method is abst:
            # The additive boost was fit to the lasso residuals, so we combine the predictions
            y_pred = np.exp(lasso.predict(X_test) + method.predict(X_test))
    else:
        y_pred = np.exp(method.predict(X_test.values))

    results.iloc[i, 0] = np.sqrt(mean_squared_error(np.exp(y_test), y_pred))
    results.iloc[i, 1] = r2_score(np.exp(y_test), y_pred)
    results.iloc[i, 2] = mean_absolute_error(np.exp(y_test), y_pred)

results.round(3)


# ### Log prices

# In[17]:

columns = ['Test RMSE', 'Test R2', 'Test MAE']
rows = ['OLS', 'Lasso', 'Ridge', 'Elastic Net', 'Tree', 'Random Forest', 'LightGBM', 'XGBoost',
        'Additive Boost', 'Stack']
results = pd.DataFrame(0.0, columns=columns, index=rows)

methods = [ols, lasso, ridge, enet, tree, rf, lbst, xbst, abst, stack]

for i, method in enumerate(methods):
    if method is not stack:
        y_pred = method.predict(X_test)
        if method is abst:
            # The additive boost was fit to the lasso residuals, so we combine the predictions
            y_pred = lasso.predict(X_test) + method.predict(X_test)
    else:
        y_pred = method.predict(X_test.values)

    results.iloc[i, 0] = np.sqrt(mean_squared_error(y_test, y_pred))
    results.iloc[i, 1] = r2_score(y_test, y_pred)
    results.iloc[i, 2] = mean_absolute_error(y_test, y_pred)

results.round(3)


# ## Making a Submission on Kaggle
#
# Using the methods from this lesson would lead to a competitive score in the [Kaggle competition](https://www.kaggle.com/c/house-prices-advanced-regression-techniques). Note that the Kaggle competition is based on predicting the log prices.
#
# If you would like to try it, you will need to download the training and test sets from Kaggle and reprocess the data accordingly. Details on how I processed the data are available on request.
#
# The next cell shows how to generate a submission file (see further instructions on Kaggle regarding the Id column, which does not exist in our version of the dataset).

# In[18]:

# y_pred holds the stacked model's log-price predictions from the last evaluation loop above
submission = pd.DataFrame(np.c_[test.index, y_pred], columns=['Id', response])
submission.to_csv('kaggle_submission.csv', index=False)
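# As a final check (not part of the original lesson), we can recompute the stacked model's predictions explicitly, rather than relying on the `y_pred` variable left over from the evaluation loop, and report the test RMSE on log prices, which mirrors the metric used to score this competition. This is a minimal sketch using only the objects defined above.

# In[ ]:

# Recompute the stacked model's log-price predictions for the held-out test set
y_pred_stack = stack.predict(X_test.values)

# RMSE on the log scale as a rough indication of the leaderboard score
rmse_log = np.sqrt(mean_squared_error(y_test, y_pred_stack))
print('Stack RMSE (log prices):', round(rmse_log, 4))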