from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, SGDRegressor
import numpy as np
import pandas as pd
import pylab as pl
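# Note: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2;
# this notebook assumes an older scikit-learn version where it is still available.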
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.DESCR)
print(boston.feature_names)
print(boston.data.shape)
print(boston.target.shape)
bostonDF = pd.DataFrame(boston.data, columns=boston.feature_names)
bostonDF.head()
np.set_printoptions(precision=2, linewidth=120, suppress=True, edgeitems=7)
print(boston.data)
# The attribute MEDV (Median Values) is the target attribute (response variable)
print(boston.target[:20])
# To handle the intercept term in multiple regression, we add a column of 1s as a constant feature (x0)
x = np.array([np.concatenate((v,[1])) for v in boston.data])
y = boston.target
# First 10 elements of the data
print(x[:10])
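# An equivalent, vectorized way to build the same matrix (a sketch; np.hstack
# appends a constant column of 1s, matching the list comprehension above).
x_alt = np.hstack([boston.data, np.ones((boston.data.shape[0], 1))])
print(np.allclose(x, x_alt))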
# Create linear regression object
linreg = LinearRegression()
# Train the model using the training set
linreg.fit(x,y)
# Let's see predictions for the first 10 instances and compare to actual MEDV values
for i in range(10):
    pred = linreg.predict(np.array([x[i]]))[0]
    print("%2d \t %2.2f \t %2.2f" % (i, pred, y[i]))
# First, let's compute errors on all training instances
p = linreg.predict(x) # p is the array of predicted values
# Now we can construct an array of errors
err = abs(p-y)
# Let's see the error on the first 10 predictions
print(err[:10])
# Dot product of error vector with itself gives us the sum of squared errors
total_error = np.dot(err,err)
# Finally compute RMSE
rmse_train = np.sqrt(total_error/len(p))
print("RMSE on Training Data: ", rmse_train)
# We can view the regression coefficients
print('Regression Coefficients: \n', linreg.coef_)
# Let's put some names to the faces
for i in range(len(boston.feature_names)):
    print("%7s %2.2f" % (boston.feature_names[i], linreg.coef_[i]))
# The following function can be used to plot the model coefficients
%matplotlib inline
def plot_coefficients(model, n_features, feature_names):
    # The last entry of coef_ belongs to the appended column of 1s, so it is excluded
    pl.barh(range(n_features), model.coef_[:-1], align='center')
    pl.yticks(np.arange(n_features), feature_names)
    pl.xlabel("Coefficient Value")
    pl.ylabel("Feature")
    pl.ylim(-1, n_features)
plot_coefficients(linreg, len(boston.feature_names), boston.feature_names)
print("Linear Regression Intercept: ", linreg.intercept_)
# Plot predicted against actual (in the training data)
%matplotlib inline
pl.plot(p, y,'ro', markersize=5)
pl.plot([0,50],[0,50], 'g-')
pl.xlabel('Predicted')
pl.ylabel('Actual')
pl.show()
def cross_validate(model, X, y, n, verbose=False):
    # model: regression model to be trained
    # X: the data matrix
    # y: the target variable array
    # n: the number of folds for cross-validation
    # Returns mean RMSE across all folds
    from sklearn.model_selection import KFold
    # shuffle=True so that random_state takes effect (required by recent scikit-learn)
    kf = KFold(n_splits=n, shuffle=True, random_state=22)
    xval_err = 0
    f = 1
    for train, test in kf.split(X):
        model.fit(X[train], y[train])
        p = model.predict(X[test])
        e = p - y[test]
        rmse = np.sqrt(np.dot(e, e)/len(X[test]))
        if verbose:
            print("Fold %2d RMSE: %.4f" % (f, rmse))
        xval_err += rmse
        f += 1
    return xval_err/n
rmse_10cv = cross_validate(linreg, x, y, 10, verbose=True)
method_name = 'Simple Linear Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)
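# For comparison, scikit-learn's built-in helper computes per-fold scores directly
# (a sketch; 'neg_root_mean_squared_error' requires scikit-learn >= 0.22, and the
# fold assignment may differ from cross_validate above, so numbers need not match exactly).
from sklearn.model_selection import cross_val_score
scores = cross_val_score(linreg, x, y, cv=10, scoring='neg_root_mean_squared_error')
print('Mean RMSE via cross_val_score: %.4f' % -scores.mean())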
# Create a ridge regression object with regularization coefficient alpha = 0.8
ridge = Ridge(alpha=0.8)
# Train the model using the training set
ridge.fit(x,y)
# Compute RMSE on training data
p = ridge.predict(x)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))
# Compute RMSE using 10-fold x-validation
rmse_10cv = cross_validate(ridge, x, y, 10, verbose=True)
method_name = 'Ridge Regression'
print("\n")
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)
print('Ridge Regression')
print('alpha\t RMSE_train\t RMSE_10cv\n')
alpha = np.linspace(.01,20,50)
t_rmse = np.array([])
cv_rmse = np.array([])
for a in alpha:
    ridge = Ridge(alpha=a)
    # computing the RMSE on training data
    ridge.fit(x, y)
    p = ridge.predict(x)
    err = p - y
    total_error = np.dot(err, err)
    rmse_train = np.sqrt(total_error/len(p))
    rmse_10cv = cross_validate(ridge, x, y, 10)
    t_rmse = np.append(t_rmse, [rmse_train])
    cv_rmse = np.append(cv_rmse, [rmse_10cv])
    print('{:.3f}\t {:.4f}\t\t {:.4f}'.format(a, rmse_train, rmse_10cv))
fig = pl.figure(figsize=(10,6))
ax = fig.add_subplot(111)
ax.plot(alpha, t_rmse, label='RMSE-Train')
ax.plot(alpha, cv_rmse, label='RMSE_XVal')
pl.legend()
pl.ylabel('RMSE')
pl.xlabel('Alpha')
pl.show()
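# RidgeCV can search the same alpha grid automatically (a sketch; by default it uses
# efficient leave-one-out cross-validation, so the selected alpha may differ from the
# 10-fold results plotted above).
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas=alpha)
ridge_cv.fit(x, y)
print('Alpha selected by RidgeCV: %.3f' % ridge_cv.alpha_)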
a = 0.01
for name, met in [
        ('linear regression', LinearRegression()),
        ('lasso', Lasso(alpha=a)),
        ('ridge', Ridge(alpha=a)),
        ('elastic-net', ElasticNet(alpha=a))
]:
    # computing the RMSE on training data
    met.fit(x, y)
    p = met.predict(x)
    e = p - y
    total_error = np.dot(e, e)
    rmse_train = np.sqrt(total_error/len(p))
    # computing the RMSE for x-validation
    rmse_10cv = cross_validate(met, x, y, 10)
    print('Method: %s' % name)
    print('RMSE on training: %.4f' % rmse_train)
    print('RMSE on 10-fold CV: %.4f' % rmse_10cv)
    print("\n")
# SGD is very sensitive to features with widely differing scales, so first we need to do feature scaling.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x)
x_s = scaler.transform(x)
sgdreg = SGDRegressor(penalty='l2', alpha=0.01, max_iter=300)
# Compute RMSE on training data
sgdreg.fit(x_s,y)
p = sgdreg.predict(x_s)
err = p-y
total_error = np.dot(err,err)
rmse_train = np.sqrt(total_error/len(p))
# Compute RMSE using 10-fold x-validation
from sklearn.model_selection import KFold
n = 10
# shuffle=True so that random_state takes effect (required by recent scikit-learn)
kf = KFold(n_splits=n, shuffle=True, random_state=22)
xval_err = 0
f = 1
for train, test in kf.split(x):
    scaler = StandardScaler()
    scaler.fit(x[train])  # Don't cheat - fit only on training data
    xtrain_s = scaler.transform(x[train])
    xtest_s = scaler.transform(x[test])  # apply same transformation to test data
    sgdreg.fit(xtrain_s, y[train])
    p = sgdreg.predict(xtest_s)
    e = p - y[test]
    rmse = np.sqrt(np.dot(e, e)/len(x[test]))
    print("Fold %2d RMSE: %.4f" % (f, rmse))
    xval_err += rmse
    f += 1
rmse_10cv = xval_err/n
method_name = 'Stochastic Gradient Descent Regression'
print('Method: %s' %method_name)
print('RMSE on training: %.4f' %rmse_train)
print('RMSE on 10-fold CV: %.4f' %rmse_10cv)
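# The per-fold scaling above can also be expressed as a Pipeline, which re-fits the scaler
# on each training fold automatically (a sketch; results may differ slightly because
# cross_val_score uses unshuffled folds and SGD depends on its random initialization).
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
sgd_pipe = make_pipeline(StandardScaler(), SGDRegressor(penalty='l2', alpha=0.01, max_iter=300))
scores = cross_val_score(sgd_pipe, x, y, cv=10, scoring='neg_root_mean_squared_error')
print('Pipeline mean RMSE (10-fold CV): %.4f' % -scores.mean())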
def standRegres(xArr, yArr):
    # Ordinary least squares via the normal equation: w = (X^T X)^-1 X^T y
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xTx = xMat.T*xMat
    if np.linalg.det(xTx) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = xTx.I * (xMat.T*yMat)
    return ws
w = standRegres(x,y)
print(w)
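# The same least-squares fit can be obtained on plain arrays without np.mat, using
# np.linalg.lstsq (a sketch; rcond=None selects the default cutoff for small singular values).
w_lstsq, residuals, rank, sv = np.linalg.lstsq(x, y, rcond=None)
print(w_lstsq)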
def ridgeRegres(xArr, yArr, lam=0.2):
    # Ridge regression via the closed form: w = (X^T X + lam*I)^-1 X^T y
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xTx = xMat.T*xMat
    denom = xTx + np.eye(np.shape(xMat)[1])*lam
    if np.linalg.det(denom) == 0.0:
        print("This matrix is singular, cannot do inverse")
        return
    ws = denom.I * (xMat.T*yMat)
    return ws
w_ridge = ridgeRegres(x,y,0.5)
print(w_ridge)
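# The same ridge solution with plain NumPy arrays, solving (X^T X + lam*I) w = X^T y
# directly instead of forming an explicit inverse (a sketch).
lam = 0.5
w_ridge_np = np.linalg.solve(x.T @ x + lam * np.eye(x.shape[1]), x.T @ y)
print(w_ridge_np)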
xMat = np.mat(x)
yMat = np.mat(y)
yHat = xMat * w_ridge
yHat.shape
print(yHat[0:10])
print(yMat.T[0:10])
# You can "ravel" the 2d matrix above to get a 1d Numpy array more suitable for using in earlier functions.
print(yHat.A.ravel())
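# For example, the raveled predictions can be fed into the same RMSE calculation
# used earlier in this notebook (a sketch).
err = yHat.A.ravel() - y
print("RMSE (manual ridge, training data): %.4f" % np.sqrt(np.dot(err, err)/len(y)))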