import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split  # 'sklearn.cross_validation' was removed in scikit-learn 0.20
from sklearn.ensemble import GradientBoostingRegressor
from statsmodels.tools.eval_measures import rmse
import matplotlib.pyplot as plt

# Render plots inline and set the theme to 'ggplot'
plt.style.use('ggplot')
%matplotlib inline

# Read the Boston Housing data
data = pd.read_csv('Datasets/Housing.csv')

# Create a data frame with all the independent features
data_indep = data.drop('medv', axis=1)

# Create a target vector (vector of the dependent variable, i.e. 'medv')
data_dep = data['medv']

# Split the data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(data_indep, data_dep,
                                                    test_size=0.20,
                                                    random_state=42)

# Now let's fit a GradientBoostingRegressor with an L1 (Least Absolute Deviation) loss function
# Set a random seed so that we can reproduce the results
np.random.seed(32767)

# A GradientBoostingRegressor with L1 (Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='absolute_error')  # named 'lad' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

# Suppress printing numpy arrays in scientific notation
np.set_printoptions(suppress=True)

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)

# A GradientBoostingRegressor with L2 (Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='squared_error')  # named 'ls' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)
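# Because the LAD model optimizes absolute error while rmse() squares errors,
# it can be informative to report MAE next to RMSE. A minimal sketch, assuming
# only scikit-learn's public metrics API; 'report_errors' is a hypothetical
# helper introduced here for illustration, not part of the original walkthrough.
from sklearn.metrics import mean_absolute_error, mean_squared_error

def report_errors(model, X, y):
    # Report RMSE and MAE for an already-fitted model on a held-out set
    pred = model.predict(X)
    print("RMSE -> %f" % np.sqrt(mean_squared_error(y, pred)))
    print("MAE  -> %f" % mean_absolute_error(y, pred))

# e.g. for the most recently fitted (L2) model
report_errors(fit, test_X, test_y)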
# Some statistics about the Housing data
data.describe()

# Get the lower and upper bounds [min, max] of all the features
stats = data.describe()
extremes = stats.loc[['min', 'max'], :].drop('medv', axis=1)
extremes

# Set a random seed
np.random.seed(1234)

# Create 5 random values
rands = np.random.rand(5, 1)
rands

# Get the 'min' and 'max' rows as numpy arrays
min_array = np.array(extremes.loc[['min'], :])
max_array = np.array(extremes.loc[['max'], :])

# Find the difference (range) between 'max' and 'min'
# (renamed from 'range' so we don't shadow the built-in)
value_range = max_array - min_array
value_range

# Generate 5 samples scaled into each feature's [min, max] range
outliers_X = (rands * value_range) + min_array
outliers_X

# We will also create some hard-coded outliers for 'medv', i.e. our target
medv_outliers = np.array([0, 0, 600, 700, 600])

# Let's have a look at the data types of all the columns,
# so that we can make our new dataset compatible with the original one
data_indep.dtypes

# Change the type of 'chas', 'rad' and 'tax' to rounded integers
outliers_X[:, [3, 8, 9]] = np.int64(np.round(outliers_X[:, [3, 8, 9]]))

# Finally, concatenate our existing 'train_X' and 'train_y' with these outliers
train_X = np.append(train_X, outliers_X, axis=0)
train_y = np.append(train_y, medv_outliers, axis=0)

# Plot a histogram of 'medv' in train_y
fig = plt.figure(figsize=(13, 7))
plt.hist(train_y, bins=50, range=(-10, 800))
fig.suptitle('medv Count', fontsize=20)
plt.xlabel('medv', fontsize=16)
plt.ylabel('count', fontsize=16)

# So let's fit a GradientBoostingRegressor with an L1 (Least Absolute Deviation) loss function
np.random.seed(9876)

# A GradientBoostingRegressor with L1 (Least Absolute Deviation) as the loss function
mod = GradientBoostingRegressor(loss='absolute_error')  # named 'lad' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)

# A GradientBoostingRegressor with L2 (Least Squares) as the loss function
mod = GradientBoostingRegressor(loss='squared_error')  # named 'ls' in scikit-learn < 1.0
fit = mod.fit(train_X, train_y)
predict = fit.predict(test_X)

# Root Mean Squared Error
print("RMSE -> %f" % rmse(predict, test_y))

error = predict - test_y
# Print the squared errors for all test samples
np.around(error ** 2, decimals=2)
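# The fit/predict/RMSE sequence above is repeated four times, so a small helper
# makes the loss comparison easier to rerun. This is a sketch; 'fit_and_score'
# is a hypothetical name introduced here for illustration. GradientBoostingRegressor
# also supports loss='huber', which is quadratic for small residuals and linear
# for large ones, sitting between the two losses compared above.
def fit_and_score(loss, train_X, train_y, test_X, test_y, seed=9876):
    # Fit a GradientBoostingRegressor with the given loss and return test RMSE
    np.random.seed(seed)
    model = GradientBoostingRegressor(loss=loss)
    model.fit(train_X, train_y)
    return rmse(model.predict(test_X), test_y)

# Compare the losses on the outlier-contaminated training set
for loss in ['absolute_error', 'squared_error', 'huber']:
    print("%s RMSE -> %f" % (loss, fit_and_score(loss, train_X, train_y,
                                                 test_X, test_y)))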