Machine Learning Online Class - Exercise 2: Logistic Regression

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
## ==================== Part 1: Plotting ====================
## We start the exercise by first plotting the data to understand the 
#  the problem we are working with.
data = np.loadtxt('ex2data1.txt', delimiter=',')
X = data[:, 0:2]
y = data[:, 2]
def plot_data(X,y):
    pos, neg = y==0, y==1 
    plt.scatter(X[pos,0], X[pos, 1], marker='+', color='red', label='admitted')
    plt.scatter(X[neg,0], X[neg, 1], marker='o', color='blue', label='rejected')
    plt.xlabel('exam1 score')
    plt.ylabel('exam2 score')
    plt.legend()
plot_data(X,y)
In [4]:
## ============ Part 2: Compute Cost and Gradient ============
#  In this part of the exercise, you will implement the cost and gradient
#  for logistic regression. You neeed to complete the code in 
#  costFunction.m

# Setup the data matrix appropriately, and add ones for the intercept term
X = data[:, 0:2]
y = data[:, 2]
(m, n) = np.shape(X)
# Add intercept term to x and X_test
X = np.concatenate((np.ones((m, 1)), X), axis=1)
# Initialize fitting parameters
initial_theta = np.zeros(n+1)

# Compute and display initial cost and gradient
from scipy.special import expit
def sigmoid(z):
    return expit(z)
    #return 1./(1.+np.exp(-z))

def costFunction(theta, X, y):
    """ Logistic Regression cost function and Gradient.
        Denoting g(z) = 1/(1+exp(-z))
                    h = g(X theta) (m,)
    In:
        theta (n+1,) float array
        X (m, n+1) float array
        y (m, ) binary array 
    Out: 
        cost (float): 1/m (-y^T log(h) - (1-y)^T log(1-h))   
        grad (n+1, ): 1/m X^T(g(X theta)-y)
    """
    m = X.shape[0]
    h = sigmoid(np.dot(X, theta))
    cost = 1./float(m) * (-np.dot(y, np.log(h)) - np.dot((1.-y), np.log(1.-h)))
    grad = 1./float(m) * np.dot(X.T, h-y)
    return cost, grad
(cost, grad) = costFunction(initial_theta, X, y)

print 'Cost at initial theta (zeros): %f\n'%(cost)
print 'Gradient at initial theta (zeros):'
print grad
Cost at initial theta (zeros): 0.693147

Gradient at initial theta (zeros):
[ -0.1        -12.00921659 -11.26284221]
In [5]:
## ============= Part 3: Optimizing using fminunc  =============
#  In this exercise, you will use a built-in function (fminunc) to find the
#  optimal parameters theta.
from scipy.optimize import minimize

options = {'maxiter':400, 'disp':True}
res = minimize(costFunction, initial_theta, args=(X,y), 
               method='BFGS', jac=True, options=options)
theta_opt = res['x']
print "theta_opt="
print theta_opt
# Plot decision boundary
def plot_decision_bd(theta, X, y):
    x1min, x1max = X[:,1].min(), X[:,1].max()
    x2min, x2max = X[:,2].min(), X[:,2].max()
    yofx = lambda x1, theta : -1./theta[2] * (theta[0]+theta[1]*x1)  
    pt1, pt2 = [x1min, yofx(x1min, theta)], [x1max, yofx(x1max, theta)]
    plt.plot(pt1, pt2, label='decison_bd')
    pos, neg = y==0, y==1 
    plt.scatter(X[pos,1], X[pos, 2], marker='+', color='red', label='admitted')
    plt.scatter(X[neg,1], X[neg, 2], marker='o', color='blue', label='rejected')
    plt.xlabel('exam1 score')
    plt.ylabel('exam2 score')
    #plt.legend()
plot_decision_bd(theta_opt, X, y)
Optimization terminated successfully.
         Current function value: 0.203498
         Iterations: 23
         Function evaluations: 31
         Gradient evaluations: 31
theta_opt=
[-25.16133284   0.2062317    0.2014716 ]
/home/blumenta/anaconda/envs/tensorflow/lib/python2.7/site-packages/ipykernel/__main__.py:35: RuntimeWarning: divide by zero encountered in log
In [6]:
## ============== Part 4: Predict and Accuracies ==============
#  After learning the parameters, you'll like to use it to predict the outcomes
#  on unseen data. In this part, you will use the logistic regression model
#  to predict the probability that a student with score 45 on exam 1 and 
#  score 85 on exam 2 will be admitted.
#
#  Furthermore, you will compute the training and test set accuracies of 
#  our model.
#
#  Your task is to complete the code in predict.m

#  Predict probability for a student with score 45 on exam 1 
#  and score 85 on exam 2
def proba_admission(theta_opt, x):
    return sigmoid(np.dot(x,theta_opt))
def predict(theta, X):
    return np.double(proba_admission(theta, X)>= 0.5)
    
xtest = np.array([1., 45., 85.]) #test student score
print "Probability of admission : %f"%(proba_admission(theta_opt, xtest))
p = predict(theta_opt, X)
accuracy = np.double(p==y).mean() * 100
print "Training accuracy: %f percent"%accuracy
Probability of admission : 0.776291
Training accuracy: 89.000000 percent

Exercise 2: Logistic regression with regularization

In [8]:
data = np.loadtxt('ex2data2.txt', delimiter=',')
x1, x2, y = data[:,0], data[:,1], data[:,2]
def plot_data2(x1, x2, y):
    pos, neg = y==0, y==1
    fig = plt.figure()
    plt.scatter(x1[pos], x2[pos], marker='+', color='red')
    plt.scatter(x1[neg], x2[neg], marker='o', color='blue')
    plt.xlabel('Microchip Test 1')
    plt.ylabel('Microchip Test 2')
    return fig
fig = plot_data2(x1,x2,y)

Clearly, the data is not linearly separable so we need to use feature mapping to find a non linear decision boundary.

In [9]:
def map_feature(x1, x2, degree=6):
    """ Maps linear to non linear features
    In:
        x1 (m,) dim array
        x2 (m,) dim array
    Out:
        out (m, ncol) dim array  = (1,x1,x1^2,.., x1^6,x1x2,..,x2^6)
    """
    ncol = (degree+1)*(degree+2)/2
    out = np.ones((x1.shape[0], ncol))
    k=0
    for j in range(0, degree+1):
        for i in range(0,degree+1-j):
            out[:,k] = x2**j * x1**i
            k+=1
    return out

#Create data matrix 
X = map_feature(data[:,0], data[:,1], degree=6)
#Define initial theta:
theta0 = np.zeros(X.shape[1])
#Define regularization parameter
lam = 1.

#Define regularized cost function
def cost_function_reg(theta, X, y, lam):
    """ Regularized Logistic Regression cost function and Gradient.
        Denoting g(z) = 1/(1+exp(-z))
                    h = g(X theta) (m,1)
                    t = theta[1:n+1]
    In:
        theta (n+1,) float array
        X (m, n+1) float array
        y (m,) binary array 
        lam : scalar, regularization parameter
    Out: 
        cost (float): 1/m (-y^T log(h) - (1-y)^T log(1-h)) +lam/2m * t^T t   
        grad (n+1, 1): 1/m X^T(g(X theta)-y)
    """
    cost, grad = costFunction(theta, X, y)
    t = theta[1:]
    cost += lam/(2*m) * np.dot(t,t)
    grad[1:] += lam/m * t
    return cost, grad

(cost, grad) = cost_function_reg(theta0, X, y, lam)
print "cost at initial theta: %f"%cost

def learn_reg_logistic(X,y,lam,theta0):
    options = {'maxiter':800, 'disp':True}
    res = minimize(cost_function_reg, theta0, args=(X,y,lam), 
                   method='BFGS', jac=True, options=options)
    return res['x']

#Plotting the decision boundaries for the different regularization parameters
def f(xx1, xx2, theta, degree=6):
    ncol = (degree+1)*(degree+2)/2
    out = np.zeros_like(xx1)
    k=0
    for j in range(0, degree+1):
        for i in range(0,degree+1-j):
            out += theta[k] * xx2**j * xx1**i
            k+=1
    return out
def plot_nl_decision_bd(theta, lam, x1, x2, y, acc):
    x1min, x1max = x1.min(), x1.max()
    x2min, x2max = x2.min(), x2.max()
    nn = 256
    xs, ys = np.linspace(x1min, x1max, nn), np.linspace(x2min, x2max, nn)
    xx, yy = np.meshgrid(xs, ys)
    zz = f(xx,yy,theta)
    fig = plot_data2(x1, x2, y)
    C = plt.contour(xx, yy, zz, [0.], linewidth=0.5, colors='black')
    plt.title('Lam = %.1f, accuracy = %.1f'%(lam,acc))

##############################################
# Perform Regularized logistic Regression for 
# different values of lambda
##############################################
reg_vals = [0., 1., 10., 100.]
for lam in reg_vals:
    theta_opt = learn_reg_logistic(X,y,lam, theta0)
    p = predict(theta_opt, X)
    accuracy = np.double(p==y).mean() * 100
    plot_nl_decision_bd(theta_opt, lam, x1, x2, y, accuracy)
cost at initial theta: 0.693147
Optimization terminated successfully.
         Current function value: 0.224569
         Iterations: 546
         Function evaluations: 547
         Gradient evaluations: 547
Optimization terminated successfully.
         Current function value: 0.539956
         Iterations: 45
         Function evaluations: 46
         Gradient evaluations: 46
Optimization terminated successfully.
         Current function value: 0.653251
         Iterations: 19
         Function evaluations: 20
         Gradient evaluations: 20
Optimization terminated successfully.
         Current function value: 0.687424
         Iterations: 6
         Function evaluations: 7
         Gradient evaluations: 7