by Alejandro Correa Bahnsen & Iván Torroledo
version 1.3, June 2018
This notebook is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. Special thanks goes to Kevin Markham
# Notebook setup: numerics (pandas/numpy) and plotting (matplotlib, 3D toolkit).
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
import itertools
plt.style.use('ggplot')
# Show which plot styles this Matplotlib install provides.
print(plt.style.available)
['seaborn-pastel', 'seaborn-dark-palette', 'seaborn-darkgrid', '_classic_test', 'bmh', 'seaborn-muted', 'fast', 'fivethirtyeight', 'seaborn-dark', 'ggplot', 'seaborn-bright', 'seaborn-talk', 'dark_background', 'seaborn-poster', 'seaborn-notebook', 'seaborn-ticks', 'seaborn-deep', 'seaborn-white', 'seaborn', 'Solarize_Light2', 'classic', 'grayscale', 'seaborn-whitegrid', 'seaborn-colorblind', 'seaborn-paper']
# Switch to the fivethirtyeight style for the rest of the notebook.
plt.style.use('fivethirtyeight')
# Test Dataset
# Load dataset: the Portland housing CSV is shipped inside a zip archive.
import zipfile
with zipfile.ZipFile('../datasets/houses_portland.csv.zip', 'r') as z:
    f = z.open('houses_portland.csv')
    # pd.io.parsers.read_table is a deprecated private import path;
    # pd.read_csv is the supported API (read_table with sep=',' == read_csv).
    data = pd.read_csv(f)
data.head()
area | bedroom | price | |
---|---|---|---|
0 | 2104 | 3 | 399900 |
1 | 1600 | 3 | 329900 |
2 | 2400 | 3 | 369000 |
3 | 1416 | 2 | 232000 |
4 | 3000 | 4 | 539900 |
data.columns
Index(['area', 'bedroom', ' price'], dtype='object')
# Target is the sale price, single feature is the house area.
# NOTE: the price column name carries a leading space (' price'), as shown
# by data.columns above — it comes from the CSV header.
y = data[' price'].values
X = data['area'].values
# Raw (unscaled) scatter of price vs. area.
plt.scatter(X, y)
plt.xlabel('Area')
plt.ylabel('Price')
Text(0,0.5,'Price')
# Standardize price and area to zero mean / unit variance.
# The means and standard deviations are kept so the transform can be
# inverted later when making predictions in original units.
y_mean = y.mean()
y_std = y.std()
X_mean = X.mean()
X_std = X.std()
y = (y - y_mean) / y_std
X = (X - X_mean) / X_std

# Same scatter as before, now in standardized units.
plt.scatter(X, y)
plt.xlabel('Area')
plt.ylabel('Price')
Text(0,0.5,'Price')
The $\beta$ values are called the model coefficients:
In the diagram above:
# Build the design matrix: a column of ones (intercept term) next to the
# standardized area feature.
n_samples = X.shape[0]
X_ = np.column_stack((np.ones(n_samples), X))
Let's suppose the following betas:
beta_ini = np.array([-1, 1])
# h: the linear-regression hypothesis
def lr_h(beta,x):
    """Return the hypothesis h(x) = x . beta.

    Works for a single row (1-D x, returns a scalar) or a design matrix
    (2-D x, returns one prediction per row).
    """
    return np.dot(x, beta)
# scatter plot
plt.scatter(X, y,c='b')
# Plot the linear regression
x = np.c_[np.ones(2), [X.min(), X.max()]]
plt.plot(x[:, 1], lr_h(beta_ini, x), 'r', lw=5)
plt.xlabel('Area')
plt.ylabel('Price')
Text(0,0.5,'Price')
Let's calculate the error of such a regression.
# Cost function
def lr_cost_func(beta, x, y):
    """Mean-squared-error cost J(beta) = 1/(2m) * sum_i (x_i . beta - y_i)^2.

    Vectorized: replaces the original per-row Python loop with dot
    products — identical result, no per-sample interpreter overhead.

    beta : (k,) parameter vector
    x    : (m, k) design matrix
    y    : (m,) targets
    """
    residuals = np.dot(x, beta) - y
    return np.dot(residuals, residuals) / (2 * x.shape[0])
lr_cost_func(beta_ini, X_, y)
0.6450124071218747
Let's see what the cost function looks like for different values of $\beta$
# Slice of the cost surface: vary beta_0 with beta_1 fixed at 2.
beta0 = np.arange(-15, 20, 1)
beta1 = 2
cost_func = [lr_cost_func(np.array([b0, beta1]), X_, y) for b0 in beta0]
plt.plot(beta0, cost_func)
plt.xlabel('beta_0')
plt.ylabel('J(beta)')
Text(0,0.5,'J(beta)')
# Slice of the cost surface: vary beta_1 with beta_0 fixed at 0.
beta0 = 0
beta1 = np.arange(-15, 20, 1)
cost_func = [lr_cost_func(np.array([beta0, b1]), X_, y) for b1 in beta1]
plt.plot(beta1, cost_func)
plt.xlabel('beta_1')
plt.ylabel('J(beta)')
Text(0,0.5,'J(beta)')
Analyzing both at the same time
# Evaluate J(beta) on a grid of (beta_0, beta_1) values.
beta0 = np.arange(-5, 7, 0.2)
beta1 = np.arange(-5, 7, 0.2)
cost_func = pd.DataFrame(index=beta0, columns=beta1)
for beta_0 in beta0:
    for beta_1 in beta1:
        cost_func.loc[beta_0, beta_1] = lr_cost_func(np.array([beta_0, beta_1]), X_, y)
# Pair every beta_0 with every beta_1 in the same order as the flattened
# (transposed) cost grid: beta_0 tiles, beta_1 repeats.
betas = np.transpose([np.tile(beta0, beta1.shape[0]), np.repeat(beta1, beta0.shape[0])])
fig = plt.figure(figsize=(10, 10))
# fig.gca(projection='3d') was deprecated in Matplotlib 3.4 and removed in
# 3.6; add_subplot is the supported way to create a 3D axes.
ax = fig.add_subplot(projection='3d')
ax.plot_trisurf(betas[:, 0], betas[:, 1], cost_func.T.values.flatten(), cmap=cm.jet, linewidth=0.1)
ax.set_xlabel('beta_0')
ax.set_ylabel('beta_1')
ax.set_zlabel('J(beta)')
plt.show()
It can also be seen as a contour plot
# Levels chosen to resolve both the flat bottom of the bowl and its steep rim.
contour_levels = [0, 0.1, 0.25, 0.5, 0.75, 1, 1.5, 2, 3, 4, 5, 7, 10, 12, 15, 20]
# cost_func has index=beta0, columns=beta1; contour wants Z[row=beta1, col=beta0],
# hence the transpose.
plt.contour(beta0, beta1, cost_func.T.values, contour_levels)
plt.xlabel('beta_0')
plt.ylabel('beta_1')
Text(0,0.5,'beta_1')
Let's see how different values of the betas appear on the contour plot
# Four hand-picked beta vectors; for each one show the fitted line over the
# data (left panel) and its position on the cost contours (right panel).
betas = np.array([[0, 0],
                  [-1, -1],
                  [-5, 5],
                  [3, -2]])
plt.style.use('seaborn-notebook')
for beta in betas:
    print('\n\nLinear Regression with betas ', beta)
    f, (ax1, ax2) = plt.subplots(1,2, figsize=(12, 6))
    # Right panel: where this beta sits on the cost-function contours.
    ax2.contour(beta0, beta1, cost_func.T.values, contour_levels)
    ax2.set_xlabel('beta_0')
    ax2.set_ylabel('beta_1')
    ax2.scatter(beta[0], beta[1],c='b', s=50)
    # scatter plot
    ax1.scatter(X, y,c='b')
    # Plot the linear regression line through the x-range endpoints.
    x = np.c_[np.ones(2), [X.min(), X.max()]]
    ax1.plot(x[:, 1], lr_h(beta, x), 'r', lw=5)
    ax1.set_xlabel('Area')
    ax1.set_ylabel('Price')
    plt.show()
Linear Regression with betas [0 0]
Linear Regression with betas [-1 -1]
Linear Regression with betas [-5 5]
Linear Regression with betas [ 3 -2]
Have some function $J(\beta_0, \beta_1)$
Want $\min_{\beta_0, \beta_1}J(\beta_0, \beta_1)$
Process:
Start with some $\beta_0, \beta_1$
Keep changing $\beta_0, \beta_1$ to reduce $J(\beta_0, \beta_1)$
until hopefully end up at a minimum
For the particular case of linear regression with one variable and one intercept the gradient is calculated as:
# gradient calculation
beta_ini = np.array([-1.5, 0.])
def gradient(beta, x, y):
    """Gradient of the cost J(beta): (1/m) * X^T (h - y).

    Vectorized and generalized: the original computed the two partial
    derivatives separately and only handled one feature plus an intercept
    (it assumed x[:, 0] == 1); this form works for any number of columns
    and returns the same values when the first column is all ones.
    """
    m = x.shape[0]
    return np.dot(np.dot(x, beta) - y, x) / m
gradient(beta_ini, X_, y)
array([-1.5 , -0.85498759])
def gradient_descent(x, y, beta_ini, alpha, iters):
    """Run `iters` steps of batch gradient descent.

    Returns an (iters, k+1) array: columns [:-1] hold beta at the start of
    each iteration, the last column holds J(beta) at that point.

    Bug fix: the original aliased `beta = beta_ini` and then used
    `beta -= ...`, which mutated the caller's array in place (and would
    silently truncate an integer-dtype beta_ini). We now work on a float
    copy and update out of place.
    """
    betas = np.zeros((iters, beta_ini.shape[0] + 1))
    beta = np.asarray(beta_ini, dtype=float).copy()
    for iter_ in range(iters):
        betas[iter_, :-1] = beta
        betas[iter_, -1] = lr_cost_func(beta, x, y)
        beta = beta - alpha * gradient(beta, x, y)
    return betas
# Run 100 iterations of batch gradient descent from beta = (-4, -4).
iters = 100
alpha = 0.05  # learning rate
beta_ini = np.array([-4., -4.])
betas = gradient_descent(X_, y, beta_ini, alpha, iters)
Let's see the evolution of the cost per iteration
plt.plot(range(iters), betas[:, -1])
plt.xlabel('iteration')
plt.ylabel('J(beta)');
Understanding what it is doing in each iteration
# Show every 10th iterate: fitted line (left) and the descent path on the
# cost contours (right; current beta in red, earlier iterates in blue).
betas_ = betas[range(0, iters, 10), :-1]
for i, beta in enumerate(betas_):
    print('\n\nLinear Regression with betas ', beta)
    f, (ax1, ax2) = plt.subplots(1,2, figsize=(12, 6))
    ax2.contour(beta0, beta1, cost_func.T.values, contour_levels)
    ax2.set_xlabel('beta_0')
    ax2.set_ylabel('beta_1')
    ax2.scatter(beta[0], beta[1], c='r', s=50)
    if i > 0:
        # Replot the earlier iterates so the descent path stays visible.
        for beta_ in betas_[:i]:
            ax2.scatter(beta_[0], beta_[1], c='b', s=50)
    # scatter plot
    ax1.scatter(X, y,c='b')
    # Plot the linear regression line through the x-range endpoints.
    x = np.c_[np.ones(2), [X.min(), X.max()]]
    ax1.plot(x[:, 1], lr_h(beta, x), 'r', lw=5)
    ax1.set_xlabel('Area')
    ax1.set_ylabel('Price')
    plt.show()
Linear Regression with betas [-4. -4.]
Linear Regression with betas [-2.39494776 -2.05187282]
Linear Regression with betas [-1.43394369 -0.88545711]
Linear Regression with betas [-0.85855506 -0.18708094]
Linear Regression with betas [-0.51404863 0.23106267]
Linear Regression with betas [-0.3077799 0.48142069]
Linear Regression with betas [-0.1842792 0.63131929]
Linear Regression with betas [-0.11033476 0.72106912]
Linear Regression with betas [-0.0660615 0.77480566]
Linear Regression with betas [-0.03955346 0.8069797 ]
Estimated Betas
betas[-1, :-1]
array([-0.02492854, 0.82473065])
# Closed-form OLS solution via the normal equations: (X'X) beta = X'y.
# np.linalg.solve is numerically better conditioned (and cheaper) than
# forming the explicit inverse with np.linalg.inv.
beta = np.linalg.solve(np.dot(X_.T, X_), np.dot(X_.T, y))
beta
array([-5.80624192e-17, 8.54987593e-01])
# import
from sklearn.linear_model import LinearRegression
# Initialize
# fit_intercept=False because X_ already carries the column of ones.
linreg = LinearRegression(fit_intercept=False)
# Fit
linreg.fit(X_, y)
LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)
# Coefficients: intercept ~ 0, slope ~ 0.855 on the standardized data.
linreg.coef_
array([-8.88030989e-17, 8.54987593e-01])
Stochastic gradient descent differs from batch gradient descent by updating the weights with each training example. This converges faster for large datasets.
# import
from sklearn.linear_model import SGDRegressor
# Initialize
# Stochastic gradient descent; the intercept is handled by the ones column
# in X_, hence fit_intercept=False.
linreg2 = SGDRegressor(fit_intercept=False, max_iter=100)
# Fit
linreg2.fit(X_,y)
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01, fit_intercept=False, l1_ratio=0.15, learning_rate='invscaling', loss='squared_loss', max_iter=100, n_iter=None, penalty='l2', power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0, warm_start=False)
# Coefficients are close to, but not exactly, the exact OLS solution.
linreg2.coef_
array([-3.01851855e-05, 8.54306989e-01])
Gradient Descent | Normal Equation |
---|---|
Need to choose $\alpha$ | No need to choose $\alpha$ |
Needs many iterations | Don't need to iterate |
Works well even when $k$ is large | Slow if $k$ is very large |
Need to compute $(X^TX)^{-1}$ |
Let's create a new feature, $area^2$
data['area2'] = data['area'] ** 2
data.head()
area | bedroom | price | area2 | |
---|---|---|---|---|
0 | 2104 | 3 | 399900 | 4426816 |
1 | 1600 | 3 | 329900 | 2560000 |
2 | 2400 | 3 | 369000 | 5760000 |
3 | 1416 | 2 | 232000 | 2005056 |
4 | 3000 | 4 | 539900 | 9000000 |
i = 2
data.loc[2, ['area', 'area2']]
area 2400 area2 5760000 Name: 2, dtype: int64
i = 2
j = 2
data.loc[2, 'area2']
5760000
X = data[['area', 'area2']].values
X[0:5]
array([[ 2104, 4426816], [ 1600, 2560000], [ 2400, 5760000], [ 1416, 2005056], [ 3000, 9000000]])
from sklearn.preprocessing import StandardScaler
# Standardize both features (area, area^2) to zero mean / unit variance.
ss = StandardScaler(with_mean=True, with_std=True)
# np.float was deprecated in NumPy 1.20 and removed in 1.24; the builtin
# float is the same dtype (float64).
ss.fit(X.astype(float))
X = ss.transform(X.astype(float))
ss.mean_, ss.scale_
(array([2.00068085e+03, 4.62083843e+06]), array([7.86202619e+02, 4.05394589e+06]))
X[0:5]
array([[ 0.13141542, -0.04786014], [-0.5096407 , -0.50835371], [ 0.5079087 , 0.28100069], [-0.74367706, -0.64524355], [ 1.27107075, 1.08022201]])
X_ = np.c_[np.ones(n_samples), X]
X_[0:5]
array([[ 1. , 0.13141542, -0.04786014], [ 1. , -0.5096407 , -0.50835371], [ 1. , 0.5079087 , 0.28100069], [ 1. , -0.74367706, -0.64524355], [ 1. , 1.27107075, 1.08022201]])
The goal became to estimate the parameters $\beta$ that minimize the sum of squared residuals
Note that $x^0$ refers to the column of ones
beta_ini = np.array([0., 0., 0.])
# gradient calculation
def gradient(beta, x, y):
    """Vectorized gradient of J(beta): (1/m) * X^T (h - y), any number of features."""
    residuals = lr_h(beta, x) - y
    return np.dot(residuals, x) / x.shape[0]
gradient(beta_ini, X_, y)
array([ 8.50383593e-17, -8.54987593e-01, -8.33162685e-01])
beta_ini = np.array([0., 0., 0.])
alpha = 0.005
iters = 100
betas = gradient_descent(X_, y, beta_ini, alpha, iters)
# Print iteration vs J
plt.plot(range(iters), betas[:, -1])
plt.xlabel('iteration')
plt.ylabel('J(beta)');
Apparently the cost function is not converging.
Let's change alpha and increase the number of iterations.
beta_ini = np.array([0., 0., 0.])
alpha = 0.5
iters = 1000
betas = gradient_descent(X_, y, beta_ini, alpha, iters)
# Print iteration vs J
plt.plot(range(1,iters), betas[1:, -1])
plt.xlabel('iteration')
plt.ylabel('J(beta)');
print('betas using gradient descent\n', betas[-1, :-1])
betas using gradient descent [-7.91329177e-17 8.91147493e-01 -3.70307030e-02]
# Exact OLS solution of the normal equations for comparison with gradient
# descent; solve() avoids the explicit (and less stable) matrix inverse.
betas_ols = np.linalg.solve(np.dot(X_.T, X_), np.dot(X_.T, y))
betas_ols
array([-7.26814831e-17, 8.91150925e-01, -3.70341353e-02])
Difference
betas_ols - betas[-1, :-1]
array([ 6.45143463e-18, 3.43234289e-06, -3.43234289e-06])
x = np.array([3000., 3000.**2])
# scale
x_scaled = ss.transform(x.reshape(1, -1))
x_ = np.c_[1, x_scaled]
x_
array([[1. , 1.27107075, 1.08022201]])
y_pred = lr_h(betas_ols, x_)
y_pred
array([1.09271078])
y_pred = y_pred * y_std + y_mean
y_pred
array([475583.75451797])
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
clf1 = LinearRegression()
clf2 = SGDRegressor(max_iter=10000)
When using sklearn there is no need to add the intercept column manually
Also sklearn works with pandas
clf1.fit(data[['area', 'area2']], data[' price'])
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
clf2.fit(X, y)
SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01, fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling', loss='squared_loss', max_iter=10000, n_iter=None, penalty='l2', power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0, warm_start=False)
Making predictions
clf1.predict(x.reshape(1, -1))
array([475583.75451797])
clf2.predict(x_scaled.reshape(1, -1)) * y_std + y_mean
array([475454.36969416])
Evaluation metrics for classification problems, such as accuracy, are not useful for regression problems. We need evaluation metrics designed for comparing continuous values.
Here are three common evaluation metrics for regression problems:
Mean Absolute Error (MAE) is the mean of the absolute value of the errors:
$$\frac 1n\sum_{i=1}^n|y_i-\hat{y}_i|$$
Mean Squared Error (MSE) is the mean of the squared errors:
$$\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2$$
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$
y_pred = clf1.predict(data[['area', 'area2']])
from sklearn import metrics
import numpy as np

# Evaluate the fit on the training data. MSE is computed once and reused
# for RMSE (the original called mean_squared_error twice).
mae = metrics.mean_absolute_error(data[' price'], y_pred)
mse = metrics.mean_squared_error(data[' price'], y_pred)
print('MAE:', mae)
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))
MAE: 51990.96151069319 MSE: 4115290102.0599403 RMSE: 64150.526903993086
Comparing these metrics:
All of these are loss functions, because we want to minimize them.
Advantages of linear regression:
Disadvantages of linear regression: