In [325]:
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from statsmodels.tools.eval_measures import rmse
import matplotlib.pylab as plt
In [326]:
# Make pylab inline and set the theme to 'ggplot'
plt.style.use('ggplot')
%pylab inline
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['plt', 'mod']
`%pylab --no-import-all` prevents importing * from pylab and numpy
In [327]:
# Read Boston Housing Data
data = pd.read_csv('Datasets/Housing.csv')
In [328]:
# Create a data frame with all the independent features
data_indep = data.drop('medv', axis = 1)

# Create a target vector(vector of dependent variable, i.e. 'medv')
data_dep = data['medv']

# Split data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(data_indep, data_dep,
                                                    test_size = 0.20,
                                                    random_state = 42)

Regression without any Outliers:

In [329]:
# Now let's fit a GradientBoostingRegressor with a L1(Least Absolute Deviation) loss function
# Set a random seed so that we can reproduce the results
np.random.seed(32767)

# A GradientBoostingRegressor with L1(Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='lad')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
MSE -> 3.440147
In [330]:
# Suppress printing numpy array in scientific notation
np.set_printoptions(suppress=True)

error = predict - test_y

# Print squared errors in all test samples 
np.around(error ** 2, decimals = 2)
Out[330]:
array([   0.03,    0.09,   13.16,    0.08,    0.  ,    1.49,    0.52,
          0.31,    2.2 ,   11.76,    0.39,    0.  ,    0.71,    0.01,
          6.73,   62.99,    2.71,    0.08,   13.18,    3.96,    2.2 ,
         13.17,    0.18,    0.63,    3.91,    4.46,    2.2 ,    1.9 ,
          2.26,    6.1 ,    7.01,    0.24,  111.68,    0.44,   31.79,
          4.04,    0.06,    0.01,   11.08,    0.06,    6.94,    5.05,
         16.06,   12.71,    0.04,    0.  ,    4.9 ,    0.21,   10.28,
         10.61,    5.29,    0.04,    1.43,    6.06,    3.39,    0.11,
          7.18,   20.04,    1.45,    0.09,    4.82,    2.39,    2.19,
          0.05,    0.81,    0.63,    4.83,    4.14,    4.  ,    0.75,
          0.91,    0.65,    0.42,    1.17,    0.63,   12.26,    0.41,
          3.02,    0.89,   51.75,    0.16,   18.5 ,    0.36,    1.43,
          3.31,    8.56,    1.62,    2.8 ,    0.73,    1.59,    0.75,
          2.85,    0.55,    4.29,   48.63,    3.88,  476.98,   59.54,
         12.18,   22.68,    2.86,    0.45])
In [331]:
# A GradientBoostingRegressor with L2(Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='ls')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
MSE -> 2.542019
In [332]:
error = predict - test_y

# Print squared errors in all test samples 
np.around(error ** 2, decimals = 2)
Out[332]:
array([  0.02,   1.51,  17.29,   1.49,   2.5 ,   5.38,   0.12,   0.03,
         1.03,  18.  ,   2.44,   1.11,   5.32,   0.31,   1.68,  16.66,
         1.54,   1.86,  24.29,   3.47,   1.  ,  14.71,   0.03,   2.27,
         0.61,   2.82,   3.62,   2.67,   5.87,   9.84,  11.36,   0.11,
        39.7 ,   2.7 ,  21.12,   6.05,   2.57,   0.09,  13.48,   0.56,
         1.96,   4.68,  32.54,  11.9 ,   0.  ,   0.24,   6.82,   0.  ,
         3.35,  15.8 ,   1.78,   0.07,   2.35,   2.28,  18.14,   0.04,
         3.54,  14.06,   3.48,   0.17,   0.57,   0.92,   0.7 ,   1.01,
         0.33,   9.48,   1.98,   1.3 ,   7.62,   7.32,   1.62,   1.61,
         0.06,   0.58,   6.42,   1.81,   0.03,   8.83,   0.81,  34.72,
         0.51,  15.03,   0.94,   0.73,   0.89,   4.12,   0.92,   1.91,
         0.39,   0.2 ,   0.7 ,   0.16,   0.1 ,   3.2 ,  25.71,  12.62,
        84.07,  30.38,   5.81,   7.08,   3.31,   2.21])

As apparent from RMSE errors of L1 and L2 loss functions, Least Squares(L2) outperform L1, when there are no outliers in the data.

Regression with Outliers:

In [333]:
# Some statistics about the Housing Data
data.describe()
Out[333]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000
mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 12.653063 22.532806
std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 7.141062 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 6.950000 17.025000
50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 11.360000 21.200000
75% 3.677082 12.500000 18.100000 0.000000 0.624000 6.623500 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

After looking at the minimum and maximum values of 'medv' column, we can see that the range of values in 'medv' is [5, 50].
Let's add a few Outliers in this Dataset, so that we can see some significant differences with L1 and L2 loss functions.

In [334]:
# Get upper and lower bounds[min, max] of all the features
stats = data.describe()
extremes = stats.loc[['min', 'max'],:].drop('medv', axis = 1)
extremes
Out[334]:
crim zn indus chas nox rm age dis rad tax ptratio black lstat
min 0.00632 0 0.46 0 0.385 3.561 2.9 1.1296 1 187 12.6 0.32 1.73
max 88.97620 100 27.74 1 0.871 8.780 100.0 12.1265 24 711 22.0 396.90 37.97

Now, we are going to generate 5 random samples, such that their values lies in the [min, max] range of respective features.

In [335]:
# Set a random seed
np.random.seed(1234)

# Create 5 random values 
rands = np.random.rand(5, 1)
rands
Out[335]:
array([[ 0.19151945],
       [ 0.62210877],
       [ 0.43772774],
       [ 0.78535858],
       [ 0.77997581]])
In [336]:
# Get the 'min' and 'max' rows as numpy array
min_array = np.array(extremes.loc[['min'], :])
max_array = np.array(extremes.loc[['max'], :])

# Find the difference(range) of 'max' and 'min'
range = max_array - min_array
range
Out[336]:
array([[  88.96988,  100.     ,   27.28   ,    1.     ,    0.486  ,
           5.219  ,   97.1    ,   10.9969 ,   23.     ,  524.     ,
           9.4    ,  396.58   ,   36.24   ]])
In [337]:
# Generate 5 samples with 'rands' value
outliers_X = (rands * range) + min_array
outliers_X
Out[337]:
array([[  17.04578252,   19.15194504,    5.68465061,    0.19151945,
           0.47807845,    4.56054001,   21.49653863,    3.23572024,
           5.40494736,  287.356192  ,   14.40028283,   76.27278363,
           8.67066488],
       [  55.35526271,   62.2108771 ,   17.43112727,    0.62210877,
           0.68734486,    6.80778568,   63.30676167,    7.97086794,
          15.30850173,  512.98499602,   18.44782245,  247.03589642,
          24.27522186],
       [  38.95090441,   43.7727739 ,   12.40121272,    0.43772774,
           0.59773568,    5.84550107,   45.40336346,    5.94324817,
          11.067738  ,  416.36933524,   16.71464075,  173.91406674,
          17.59325326],
       [  69.87957895,   78.53585837,   21.88458216,    0.78535858,
           0.76668427,    7.65978645,   79.15831848,    9.76610981,
          19.06324743,  598.52789787,   19.98237069,  311.77750713,
          30.19139507],
       [  69.40067405,   77.99758081,   21.73774005,    0.77997581,
           0.76406824,    7.63169374,   78.63565097,    9.70691596,
          18.93944359,  595.70732345,   19.9317726 ,  309.64280598,
          29.99632329]])
In [338]:
# We will also create some hard coded outliers for 'medv', i.e. our target
medv_outliers = np.array([0, 0, 600, 700, 600])
In [339]:
# Let's have a look at the data types of all the columns
# so that we can create our new Dataset compatible with the original one
data_indep.dtypes
Out[339]:
crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
black      float64
lstat      float64
dtype: object
In [340]:
# Change the type of 'chas', 'rad' and 'tax' to rounded of Integers
outliers_X[:, [3, 8, 9]] = np.int64(np.round(outliers_X[:, [3, 8, 9]]))
In [341]:
# Finally concatenate our existing 'train_X' and 'train_y' with these outliers
train_X = np.append(train_X, outliers_X, axis = 0)
train_y = np.append(train_y, medv_outliers, axis = 0)
In [342]:
# Plot a histogram of 'medv' in train_y
fig = plt.figure(figsize=(13,7))
plt.hist(train_y, bins=50, range = (-10, 800))
fig.suptitle('medv Count', fontsize = 20)
plt.xlabel('medv', fontsize = 16)
plt.ylabel('count', fontsize = 16)
Out[342]:
<matplotlib.text.Text at 0x7f906589cb10>

You can see there are some clear outliers at 600, 700 and even one or two 'medv' values are 0.
Since, our outliers are in place now, we will once again fit the GradientBoostingRegressor with L1 and L2 Loss function to see the contrast in their performances with outliers.

In [343]:
# So let's fit a GradientBoostingRegressor with a L1(Least Absolute Deviation) loss function
np.random.seed(9876)

# A GradientBoostingRegressor with L1(Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='lad')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
MSE -> 7.055568
In [344]:
error = predict - test_y

# Print squared errors in all test samples 
np.around(error ** 2, decimals = 2)
Out[344]:
array([    0.04,     0.69,    17.13,     0.01,     0.62,     0.01,
           1.88,     0.17,     0.54,    16.12,     2.4 ,     0.23,
           0.04,     0.  ,     1.62,    83.27,     1.58,     0.23,
          17.81,     1.3 ,     2.77,    12.79,     0.07,     0.45,
           1.59,     7.27,     1.3 ,     2.45,     5.99,     7.88,
           9.02,     0.49,  3396.2 ,     0.01,    28.32,    12.75,
           0.45,     0.  ,    15.67,     0.  ,     4.2 ,     3.72,
          30.43,     5.71,     0.01,     0.18,     7.47,     0.09,
           1.28,    12.06,     2.49,     0.13,     6.22,     2.76,
           2.29,     0.02,     4.91,    16.74,     1.94,     0.27,
           0.25,   363.92,     2.42,     0.63,     0.58,     4.43,
           4.28,     3.15,     1.17,     0.97,     0.73,     1.43,
           0.53,     0.49,     0.81,    16.5 ,     0.07,    14.49,
           1.84,    38.94,     0.17,    16.95,     0.45,     1.42,
           1.97,     6.45,     1.08,     3.42,     0.36,     0.46,
           1.1 ,     3.07,     0.12,     2.18,    46.74,     1.48,
         652.32,    66.97,     9.32,    46.2 ,     3.05,     0.66])
In [345]:
# A GradientBoostingRegressor with L2(Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='ls')

fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print "RMSE -> %f" % rmse(predict, test_y)
MSE -> 9.806251
In [346]:
error = predict - test_y

# Print squared errors in all test samples 
np.around(error ** 2, decimals = 2)
Out[346]:
array([    0.  ,     6.18,    11.  ,     0.36,     0.45,     1.62,
           0.28,     0.02,     0.59,    22.96,     2.24,     0.06,
           0.98,     0.02,     0.64,     6.52,     1.72,     0.04,
           7.4 ,     3.76,     0.76,    15.28,     0.38,     1.95,
           3.7 ,     9.  ,     4.28,     6.69,     5.83,    10.83,
          17.79,     0.57,   121.92,     2.3 ,    15.77,     4.94,
           0.02,     0.6 ,    13.59,     0.03,    11.32,     1.51,
          16.45,     6.85,     0.07,     0.21,     6.32,     0.12,
           3.41,    12.18,     7.96,     2.12,     2.95,     9.69,
          17.15,     0.4 ,     2.43,     4.46,     1.58,     1.59,
           2.45,     0.66,     2.56,     3.33,     0.43,    13.59,
           6.76,     1.15,     2.87,     3.3 ,     4.7 ,     0.01,
           1.3 ,     0.02,     0.86,  8597.78,     0.07,     1.7 ,
           0.  ,    33.7 ,     0.71,    10.75,     0.03,     0.63,
           1.55,    13.66,     0.89,     2.61,     0.27,     0.05,
           1.12,     0.08,     0.  ,     3.76,    63.48,    10.78,
         475.58,   111.95,    12.11,     5.93,     1.53,     1.99])

With outliers in the dataset, a L2(Loss function) tries to adjust the coefficients features according to these outliers on the expense of other features, since the squared-error is going to be huge for these outliers(for error > 1). On the other hand L1(Least absolute deviation) is quite robust to outliers.
As a result, L2 loss function may result in huge deviations in some of the samples which results in reduced accuracy.