The objective of linear regression is to minimize the cost function
$J(\theta) = \frac{1}{2m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)})^2 $ (vectorial form $J(\theta) = \frac{1}{2m} (X\theta - y)^T (X\theta - y)$)
where the hypothesis $h_\theta$ is given by the linear model $h_\theta(x) = \theta^T x = \theta_0 + \theta_1 x_1$
The model's parameters are the $\theta_j$ values. These are the values that need to be adjusted to minimize cost $J(\theta)$. One way to do this is to use the batch gradient descent algorithm. In batch gradient descent, each iteration performs the update
$\theta_j := \theta_j - \frac{\alpha}{m} \sum_{i=1}^{m} (h_\theta(x^{(i)}) - y^{(i)}) x_j^{(i)}$ (simultaneously update $\theta_j$ for all $j$); vectorial form $\theta := \theta - \alpha \nabla J(\theta) = \theta - \frac{\alpha}{m} X^T (X\theta - y)$
With each step of gradient descent, the parameters $\theta_j$ come closer to the optimal values that will achieve the lowest cost $J(\theta)$.
Predicting Profits of restaurant chain based on population of city.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use ggplot styling for every figure in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Default figure size for all plots below.
plt.rcParams['figure.figsize'] = (10, 6)
# Load the univariate dataset: column 0 = city population, column 1 = profit.
data = np.loadtxt('C:\\Users\\pochetti\Machine_Learning\\mlclass-ex1-005\\mlclass-ex1\ex1data1.txt', delimiter=',')
X = data[:,0]
y = data[:,1]
# Scatter plot of the raw data to eyeball the population/profit relationship.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.plot(X, y, 'o', label='Raw Data')
plt.ylabel('Profit in $10Ks')
plt.xlabel('Population of City in 10Ks')
plt.show()
def computeCost(X, y, theta):
    """Compute the linear-regression cost J(theta) = 1/(2m) * ||X.theta - y||^2.

    Parameters
    ----------
    X : ndarray, shape (m, n+1)
        Design matrix whose first column is all ones (intercept term).
    y : ndarray, shape (m, 1)
        Target values as a column vector.
    theta : ndarray, shape (n+1, 1)
        Parameter column vector.

    Returns
    -------
    float
        The scalar cost.
    """
    m = y.shape[0]
    # (m, 1) vector of prediction errors: X.theta - y
    residual = np.dot(X, theta) - y
    # residual' . residual is a (1, 1) matrix; .item() collapses it to a plain
    # float so callers can store it in a scalar slot (e.g. J_history[i, 0])
    # without relying on NumPy's deprecated array->scalar coercion.
    return (1 / (2 * m)) * np.dot(residual.transpose(), residual).item()
def gradientDescent(X, y, theta, alpha, num_iters):
    """Minimize J(theta) by batch gradient descent.

    Parameters
    ----------
    X : ndarray, shape (m, n+1)
        Design matrix whose first column is all ones (intercept term).
    y : ndarray, shape (m, 1)
        Target values as a column vector.
    theta : ndarray, shape (n+1, 1)
        Initial parameter vector (not modified in place).
    alpha : float
        Learning rate.
    num_iters : int
        Number of gradient-descent iterations.

    Returns
    -------
    (theta, J_history) : tuple
        Final parameters, shape (n+1, 1), and the cost recorded after
        each iteration, shape (num_iters, 1).
    """
    m = y.shape[0]
    J_history = np.zeros((num_iters, 1))
    # 'i' rather than 'iter' so we do not shadow the builtin iter().
    for i in range(num_iters):
        residual = np.dot(X, theta) - y
        # Simultaneous update of all parameters:
        # theta := theta - (alpha/m) * X^T (X.theta - y)
        theta = theta - (alpha / m) * np.dot(X.transpose(), residual)
        # np.ravel(...)[0] accepts computeCost returning either a plain
        # float or a (1, 1) array, so the assignment is always scalar-safe.
        J_history[i, 0] = np.ravel(computeCost(X, y, theta))[0]
    return theta, J_history
# Reload the data and build the design matrix for gradient descent.
data = np.loadtxt('C:\\Users\\pochetti\Machine_Learning\\mlclass-ex1-005\\mlclass-ex1\ex1data1.txt', delimiter=',')
X = data[:,0]
y = data[:,1]
# Reshape y into an (m, 1) column vector so the matrix algebra lines up.
y = y.reshape(y.shape[0], 1)
X = np.c_[np.ones(X.shape[0]), X] # adding column of ones to X to account for theta_0 (the intercept)
theta = np.zeros((2, 1))
computeCost(X, y, theta) # cost at the very beginning with coefficients initialized at zero
array([[ 32.07273388]])
# Hyperparameters: 1500 iterations at learning rate 0.01.
iterations =1500
alpha = 0.01
optimal_theta, J_history = gradientDescent(X, y, theta, alpha, iterations)
print("Theta found by gradient descent: intercept={0}, slope={1}".format(optimal_theta[0],optimal_theta[1]))
Theta found by gradient descent: intercept=[-3.63029144], slope=[ 1.16636235]
# Plot the cost after each iteration: it should decrease monotonically
# if the learning rate alpha is small enough.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(range(iterations), J_history)
plt.ylabel('Cost Value')
plt.xlabel('Iteration of Gradient Descent')
plt.show()
# Overlay the fitted line (X . optimal_theta) on the raw data points.
fig, ax = plt.subplots()
ax.plot(X[:,1], y[:,0], 'o', label='Raw Data')
ax.plot(X[:,1], np.dot(X, optimal_theta), linestyle='-', label='Linear Regression')
plt.ylabel('Profit in $10Ks')
plt.xlabel('Population of City in 10Ks')
legend = ax.legend(loc='upper center', shadow=True)
plt.show()
Predicting House Prices based on Size (feet squared) and number of bedrooms.
# Load the multivariate dataset: column 0 = house size (sq ft),
# column 1 = number of bedrooms, column 2 = sale price.
dataMulti = np.loadtxt('C:\\Users\\pochetti\Machine_Learning\\mlclass-ex1-005\\mlclass-ex1\ex1data2.txt', delimiter=',')
X = dataMulti[:,0:2]
y = dataMulti[:,2]
# Reshape y into an (m, 1) column vector so the matrix algebra lines up.
y = y.reshape(y.shape[0], 1)
# Plot price against each of the two features separately.
fig = plt.figure()
plt.subplots_adjust(hspace=0.4)
ax1 = fig.add_subplot(2, 1, 1)
ax1.plot(X[:,0], y, 'o', label='Raw Data')
plt.ylabel('House Price')
plt.xlabel('House Size (feet squared)')
ax2 = fig.add_subplot(2, 1, 2)
ax2.plot(X[:,1], y, 'o', label='Raw Data')
plt.ylabel('House Price')
plt.xlabel('# Bedrooms')
plt.show()
def featureNormalize(X):
    """Standardize each column of X to zero mean and unit standard deviation.

    Parameters
    ----------
    X : ndarray, shape (m, n)
        Feature matrix; each column is one feature.

    Returns
    -------
    (X_norm, average, sigma) : tuple
        The normalized copy of X, the per-column means, and the per-column
        standard deviations actually used for scaling.
    """
    average = np.mean(X, axis=0)
    sigma = np.std(X, axis=0)
    # Guard against constant columns: a zero std would turn the division
    # below into NaNs. Scaling by 1 leaves the (already zero-mean) column
    # unchanged instead.
    sigma = np.where(sigma == 0, 1, sigma)
    X_norm = (X - average) / sigma
    return X_norm, average, sigma
X, average, sigma = featureNormalize(X) # scaling the dataset (mean = 0, std = 1)
X = np.c_[np.ones(X.shape[0]), X] # adding column of ones to X to account for theta_0 (the intercept)
theta = np.zeros((3, 1))
iterations = 1500
alpha = 0.01
# Gradient descent on the scaled features; the scaling is what lets one
# learning rate work for both features despite their different magnitudes.
optimal_theta, J_history = gradientDescent(X, y, theta, alpha, iterations)
optimal_theta
array([[ 340412.56301439], [ 109370.05670466], [ -6500.61509507]])
# Cost history for the multivariate run: confirms gradient descent converged.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
ax.scatter(range(iterations), J_history)
plt.ylabel('Cost Value')
plt.xlabel('Iteration of Gradient Descent')
plt.show()
It is possible to show that the same optimized parameters found by implementing Gradient Descent can be calculated in an elegant, efficient and closed form, using linear algebra. Specifically, the normal equation gives $\theta = (X^T X)^{-1} X^T y$.
Using this formula does not require any feature scaling, and you will get an exact solution in one calculation: there is no "loop until convergence" like in gradient descent.
def normalEquation(X, y):
    """Closed-form least-squares solution theta = (X^T X)^{-1} X^T y.

    Parameters
    ----------
    X : ndarray, shape (m, n+1)
        Design matrix whose first column is all ones (intercept term).
    y : ndarray, shape (m, 1)
        Target values as a column vector.

    Returns
    -------
    ndarray, shape (n+1, 1)
        The parameter vector minimizing the least-squares cost.
    """
    XT = X.transpose()
    # Solving the normal equations (X^T X) theta = X^T y directly is faster
    # and numerically more accurate than explicitly inverting X^T X.
    return np.linalg.solve(np.dot(XT, X), np.dot(XT, y))
# Closed-form solution on the (already scaled) multivariate data; it should
# match the gradient-descent parameters up to convergence error.
normalEquation(X, y)
array([[ 340412.65957447], [ 109447.79646964], [ -6578.35485416]])