import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # 'sklearn.cross_validation' was removed in scikit-learn 0.20
from sklearn.ensemble import GradientBoostingRegressor
from statsmodels.tools.eval_measures import rmse
import matplotlib.pyplot as plt

# Render plots inline and set the theme to 'ggplot'
plt.style.use('ggplot')
%matplotlib inline

# Read the Boston Housing data
data = pd.read_csv('Datasets/Housing.csv')

# Create a data frame with all the independent features
data_indep = data.drop('medv', axis=1)

# Create a target vector (vector of the dependent variable, i.e. 'medv')
data_dep = data['medv']

# Split the data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(data_indep, data_dep,
                                                    test_size=0.20,
                                                    random_state=42)

# Now let's fit a GradientBoostingRegressor with an L1 (Least Absolute Deviation) loss function
# Set a random seed so that we can reproduce the results
np.random.seed(32767)

# A GradientBoostingRegressor with L1 (Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='absolute_error')  # named 'lad' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

# Suppress printing numpy arrays in scientific notation
np.set_printoptions(suppress=True)

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)

# A GradientBoostingRegressor with L2 (Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='squared_error')  # named 'ls' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)
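# Because the LAD model optimizes absolute error while rmse() squares errors,
# it can be informative to report MAE next to RMSE. A minimal sketch, assuming
# only scikit-learn's public metrics API; 'report_errors' is a hypothetical
# helper introduced here for illustration, not part of the original walkthrough.
from sklearn.metrics import mean_absolute_error, mean_squared_error

def report_errors(model, X, y):
    # Report RMSE and MAE for an already-fitted model on a held-out set
    pred = model.predict(X)
    print("RMSE -> %f" % np.sqrt(mean_squared_error(y, pred)))
    print("MAE  -> %f" % mean_absolute_error(y, pred))

# e.g. for the most recently fitted (L2) model
report_errors(fit, test_X, test_y)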
# Some statistics about the Housing data
data.describe()

# Get the lower and upper bounds [min, max] of all the features
stats = data.describe()
extremes = stats.loc[['min', 'max'], :].drop('medv', axis=1)
extremes

# Set a random seed
np.random.seed(1234)

# Create 5 random values
rands = np.random.rand(5, 1)
rands

# Get the 'min' and 'max' rows as numpy arrays
min_array = np.array(extremes.loc[['min'], :])
max_array = np.array(extremes.loc[['max'], :])

# Find the difference (range) between 'max' and 'min'
# (renamed from 'range' so we don't shadow the built-in)
value_range = max_array - min_array
value_range

# Generate 5 samples scaled into each feature's [min, max] range
outliers_X = (rands * value_range) + min_array
outliers_X

# We will also create some hard-coded outliers for 'medv', i.e. our target
medv_outliers = np.array([0, 0, 600, 700, 600])

# Let's have a look at the data types of all the columns,
# so that we can make our new dataset compatible with the original one
data_indep.dtypes

# Change the type of 'chas', 'rad' and 'tax' to rounded integers
outliers_X[:, [3, 8, 9]] = np.int64(np.round(outliers_X[:, [3, 8, 9]]))

# Finally, concatenate our existing 'train_X' and 'train_y' with these outliers
train_X = np.append(train_X, outliers_X, axis=0)
train_y = np.append(train_y, medv_outliers, axis=0)

# Plot a histogram of 'medv' in train_y
fig = plt.figure(figsize=(13, 7))
plt.hist(train_y, bins=50, range=(-10, 800))
fig.suptitle('medv Count', fontsize=20)
plt.xlabel('medv', fontsize=16)
plt.ylabel('count', fontsize=16)

# So let's fit a GradientBoostingRegressor with an L1 (Least Absolute Deviation) loss function
np.random.seed(9876)

# A GradientBoostingRegressor with L1 (Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='absolute_error')  # named 'lad' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)

# A GradientBoostingRegressor with L2 (Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='squared_error')  # named 'ls' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)
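# The fit/predict/RMSE sequence above is repeated four times, so a small helper
# makes the loss comparison easier to rerun. This is a sketch; 'fit_and_score'
# is a hypothetical name introduced here for illustration. GradientBoostingRegressor
# also supports loss='huber', which is quadratic for small residuals and linear
# for large ones, sitting between the two losses compared above.
def fit_and_score(loss, train_X, train_y, test_X, test_y, seed=9876):
    # Fit a GradientBoostingRegressor with the given loss and return test RMSE
    np.random.seed(seed)
    model = GradientBoostingRegressor(loss=loss)
    model.fit(train_X, train_y)
    return rmse(model.predict(test_X), test_y)

# Compare the losses on the outlier-contaminated training set
for loss in ['absolute_error', 'squared_error', 'huber']:
    print("%s RMSE -> %f" % (loss, fit_and_score(loss, train_X, train_y,
                                                 test_X, test_y)))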