#!/usr/bin/env python
# coding: utf-8

# #### Example of Regression Analysis Using the Boston Housing Data Set.

# In[1]:

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
import numpy as np
import pandas as pd
import pylab as pl

# In[2]:

# Note: load_boston is only available in scikit-learn versions prior to 1.2
from sklearn.datasets import load_boston
boston = load_boston()

# In[3]:

print(boston.DESCR)

# In[4]:

print(boston.feature_names)

# In[5]:

print(boston.data.shape)
print(boston.target.shape)

# In[6]:

bostonDF = pd.DataFrame(boston.data, columns=boston.feature_names)
bostonDF.head()

# In[7]:

np.set_printoptions(precision=2, linewidth=120, suppress=True, edgeitems=7)

# In[8]:

print(boston.data)

# In[9]:

# The attribute MEDV (Median Value) is the target attribute (response variable)
print(boston.target[:20])

# In[10]:

# In order to do multiple regression we need to add a column of 1s to serve as the coefficient for x0 (the intercept term)
x = np.array([np.concatenate((v, [1])) for v in boston.data])
y = boston.target

# In[11]:

# First 10 rows of the data
print(x[:10])

# In[12]:

# Create linear regression object
linreg = LinearRegression()

# Train the model using the training set
linreg.fit(x, y)

# In[13]:

# Let's see predictions for the first 10 instances and compare them to the actual MEDV values
for i in range(10):
    pred = linreg.predict(np.array([x[i]]))[0]
    print("%2d \t %2.2f \t %2.2f" % (i, pred, y[i]))

# #### Compute RMSE on training data

# In[14]:

# First, let's compute errors on all training instances
p = linreg.predict(x)   # p is the array of predicted values

# Now we can construct an array of errors
err = abs(p - y)

# Let's see the error on the first 10 predictions
print(err[:10])

# In[15]:

# Dot product of the error vector with itself gives us the sum of squared errors
total_error = np.dot(err, err)

# Finally compute RMSE
rmse_train = np.sqrt(total_error / len(p))
print("RMSE on Training Data: ", rmse_train)

# In[16]:

# We can view the regression coefficients
print('Regression Coefficients: \n', linreg.coef_)

# In[17]:

# Let's put some names to the faces
for i in range(len(boston.feature_names)):
    print("%7s %2.2f" % (boston.feature_names[i], linreg.coef_[i]))

# In[18]:

# The following function can be used to plot the model coefficients
get_ipython().run_line_magic('matplotlib', 'inline')

def plot_coefficients(model, n_features, feature_names):
    # Skip the last coefficient, which belongs to the added column of 1s
    pl.barh(range(n_features), model.coef_[:-1], align='center')
    pl.yticks(np.arange(n_features), feature_names)
    pl.xlabel("Coefficient Value")
    pl.ylabel("Feature")
    pl.ylim(-1, n_features)

plot_coefficients(linreg, len(boston.feature_names), boston.feature_names)

# In[19]:

print("Linear Regression Intercept: ", linreg.intercept_)

# In[20]:

# Plot predicted against actual values (in the training data)
get_ipython().run_line_magic('matplotlib', 'inline')
pl.plot(p, y, 'ro', markersize=5)
pl.plot([0, 50], [0, 50], 'g-')
pl.xlabel('Predicted')
pl.ylabel('Actual')
pl.show()

# #### Let's now compute RMSE using 10-fold cross-validation

# In[21]:

def cross_validate(model, X, y, n, verbose=False):
    # model: regression model to be trained
    # X: the data matrix
    # y: the target variable array
    # n: the number of folds for cross-validation
    # Returns mean RMSE across all folds
    from sklearn.model_selection import KFold
    # shuffle=True is required when a random_state is specified in recent scikit-learn versions
    kf = KFold(n_splits=n, shuffle=True, random_state=22)
    xval_err = 0
    f = 1
    for train, test in kf.split(X):
        model.fit(X[train], y[train])
        p = model.predict(X[test])
        e = p - y[test]
        rmse = np.sqrt(np.dot(e, e) / len(X[test]))
        if verbose:
            print("Fold %2d RMSE: %.4f" % (f, rmse))
        xval_err += rmse
        f += 1
    return xval_err / n
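# As a quick cross-check (a minimal sketch, assuming scikit-learn >= 0.22 for the
# 'neg_root_mean_squared_error' scorer), the built-in cross_val_score utility can
# compute per-fold RMSE much like the hand-rolled cross_validate() function above:
#
# from sklearn.model_selection import cross_val_score, KFold
# cv_scores = cross_val_score(LinearRegression(), x, y,
#                             cv=KFold(n_splits=10, shuffle=True, random_state=22),
#                             scoring='neg_root_mean_squared_error')
# print("Mean RMSE via cross_val_score:", -cv_scores.mean())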
# In[22]:

rmse_10cv = cross_validate(linreg, x, y, 10, verbose=True)

# In[23]:

method_name = 'Simple Linear Regression'
print('Method: %s' % method_name)
print('RMSE on training: %.4f' % rmse_train)
print('RMSE on 10-fold CV: %.4f' % rmse_10cv)

# #### Let's try Ridge Regression:

# In[24]:

# Create a ridge regression object with regularization coefficient alpha = 0.8
ridge = Ridge(alpha=0.8)

# Train the model using the training set
ridge.fit(x, y)

# In[25]:

# Compute RMSE on training data
p = ridge.predict(x)
err = p - y
total_error = np.dot(err, err)
rmse_train = np.sqrt(total_error / len(p))

# Compute RMSE using 10-fold x-validation
rmse_10cv = cross_validate(ridge, x, y, 10, verbose=True)

method_name = 'Ridge Regression'
print("\n")
print('Method: %s' % method_name)
print('RMSE on training: %.4f' % rmse_train)
print('RMSE on 10-fold CV: %.4f' % rmse_10cv)

# #### We can try different values of alpha and observe the impact on x-validation RMSE

# In[26]:

print('Ridge Regression')
print('alpha\t RMSE_train\t RMSE_10cv\n')
alpha = np.linspace(.01, 20, 50)
t_rmse = np.array([])
cv_rmse = np.array([])

for a in alpha:
    ridge = Ridge(alpha=a)

    # computing the RMSE on training data
    ridge.fit(x, y)
    p = ridge.predict(x)
    err = p - y
    total_error = np.dot(err, err)
    rmse_train = np.sqrt(total_error / len(p))

    # computing the RMSE for x-validation
    rmse_10cv = cross_validate(ridge, x, y, 10)

    t_rmse = np.append(t_rmse, [rmse_train])
    cv_rmse = np.append(cv_rmse, [rmse_10cv])
    print('{:.3f}\t {:.4f}\t\t {:.4f}'.format(a, rmse_train, rmse_10cv))

# In[27]:

fig = pl.figure(figsize=(10, 6))
ax = fig.add_subplot(111)
ax.plot(alpha, t_rmse, label='RMSE-Train')
ax.plot(alpha, cv_rmse, label='RMSE_XVal')
pl.legend(('RMSE-Train', 'RMSE_XVal'))
pl.ylabel('RMSE')
pl.xlabel('Alpha')
pl.show()

# #### To make comparisons across methods easier, let's parametrize the regression methods:

# In[28]:

a = 0.01
for name, met in [
        ('linear regression', LinearRegression()),
        ('lasso', Lasso(alpha=a)),
        ('ridge', Ridge(alpha=a)),
        ('elastic-net', ElasticNet(alpha=a))
        ]:
    # computing the RMSE on training data
    met.fit(x, y)
    p = met.predict(x)
    e = p - y
    total_error = np.dot(e, e)
    rmse_train = np.sqrt(total_error / len(p))

    # computing the RMSE for x-validation
    rmse_10cv = cross_validate(met, x, y, 10)

    print('Method: %s' % name)
    print('RMSE on training: %.4f' % rmse_train)
    print('RMSE on 10-fold CV: %.4f' % rmse_10cv)
    print("\n")
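# A related convenience (a minimal sketch; the alphas grid and cv=10 are
# illustrative choices): RidgeCV selects the regularization strength by
# built-in cross-validation instead of the manual alpha sweep above.
#
# from sklearn.linear_model import RidgeCV
# ridge_cv = RidgeCV(alphas=np.linspace(0.01, 20, 50), cv=10)
# ridge_cv.fit(x, y)
# print("alpha selected by RidgeCV:", ridge_cv.alpha_)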
# #### Now let's try to do regression via Stochastic Gradient Descent.

# In[29]:

# SGD is very sensitive to varying-sized feature values. So, first we need to do feature scaling.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x)
x_s = scaler.transform(x)

sgdreg = SGDRegressor(penalty='l2', alpha=0.01, max_iter=300)

# Compute RMSE on training data
sgdreg.fit(x_s, y)
p = sgdreg.predict(x_s)
err = p - y
total_error = np.dot(err, err)
rmse_train = np.sqrt(total_error / len(p))

# Compute RMSE using 10-fold x-validation
from sklearn.model_selection import KFold
n = 10
kf = KFold(n_splits=n, shuffle=True, random_state=22)
xval_err = 0
f = 1
for train, test in kf.split(x):
    scaler = StandardScaler()
    scaler.fit(x[train])   # Don't cheat - fit only on training data
    xtrain_s = scaler.transform(x[train])
    xtest_s = scaler.transform(x[test])   # apply the same transformation to test data
    sgdreg.fit(xtrain_s, y[train])
    p = sgdreg.predict(xtest_s)
    e = p - y[test]
    rmse = np.sqrt(np.dot(e, e) / len(x[test]))
    print("Fold %2d RMSE: %.4f" % (f, rmse))
    xval_err += rmse
    f += 1
rmse_10cv = xval_err / n

method_name = 'Stochastic Gradient Descent Regression'
print('Method: %s' % method_name)
print('RMSE on training: %.4f' % rmse_train)
print('RMSE on 10-fold CV: %.4f' % rmse_10cv)

# #### Instead of Scikit-learn, let's implement the closed form solution for linear regression

# In[30]:

def standRegres(xArr, yArr):
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T * yMat)
    return ws

# In[31]:

w = standRegres(x, y)

# In[32]:

print(w)

# In[33]:

def ridgeRegres(xArr, yArr, lam=0.2):
    xMat = np.mat(xArr)
    yMat = np.mat(yArr).T
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    if np.linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T * yMat)
    return ws

# In[34]:

w_ridge = ridgeRegres(x, y, 0.5)
print(w_ridge)

# #### Now that we have the regression coefficients, we can compute the predictions:

# In[35]:

xMat = np.mat(x)
yMat = np.mat(y)
yHat = xMat * w_ridge

# In[36]:

yHat.shape

# In[37]:

print(yHat[0:10])

# In[38]:

print(yMat.T[0:10])

# #### Model evaluation and cross-validation can be performed as before.

# In[39]:

# You can "ravel" the 2d matrix above to get a 1d NumPy array more suitable for use in the earlier functions.
print(yHat.A.ravel())
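# The same closed-form ridge solution can also be written with plain NumPy arrays
# (a minimal sketch; the helper name ridge_closed_form is ours, and np.linalg.solve
# is used instead of an explicit matrix inverse):

def ridge_closed_form(X, y, lam=0.2):
    # Solve (X^T X + lam * I) w = X^T y for the coefficient vector w
    XtX = X.T @ X + lam * np.eye(X.shape[1])
    return np.linalg.solve(XtX, X.T @ y)

w_np = ridge_closed_form(x, y, 0.5)
print(w_np)             # coefficients; the last entry corresponds to the added column of 1s
print((x @ w_np)[:10])  # predictions for the first 10 instances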