Logistic Regression

Implementation of logistic regression in pure Python using tools from scipy.optimize. This is a companion notebook to the blog post "Numerical optimizers for Logistic Regression".

Loss function

The loss function, using vector notation and abusing broadcasting slightly, is:

$$\mathcal{L}(w) = - \mathbb{1}^T \log(\phi(y * X w)) + \frac{\alpha}{2} w^T w$$

where $*$ is the component-wise product (numpy's $*$ operator) and $\phi(t) = 1 / (1 + \exp(-t))$ is the logistic function.
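
For reference in the code below, with labels $y_i \in \{-1, +1\}$ (as in the benchmark further down, where y is generated with np.sign) and writing $z = \phi(y * X w)$, the gradient and the Hessian-vector product implied by this loss are:

$$\nabla \mathcal{L}(w) = X^T \big((z - \mathbb{1}) * y\big) + \alpha w, \qquad \nabla^2 \mathcal{L}(w)\, s = X^T \big((z * (\mathbb{1} - z)) * (X s)\big) + \alpha s$$

These are exactly the quantities computed by the gradient, hessian and grad_hess functions below.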

In [1]:
from __future__ import print_function
import numpy as np
from scipy import linalg, optimize
from sklearn import cross_validation, metrics, linear_model
from datetime import datetime
import pytron
In [2]:
def phi(t):
    # logistic function, computed in a numerically stable way:
    # 1 / (1 + exp(-t)) for t > 0, exp(t) / (1 + exp(t)) otherwise
    idx = t > 0
    out = np.empty(t.size, dtype=np.float64)
    out[idx] = 1. / (1 + np.exp(-t[idx]))
    exp_t = np.exp(t[~idx])
    out[~idx] = exp_t / (1. + exp_t)
    return out

def loss(w, X, y, alpha):
    # logistic loss to be minimized; split by the sign of y*z to avoid overflow in exp
    z = X.dot(w)
    yz = y * z
    idx = yz > 0
    out = np.zeros_like(yz)
    out[idx] = np.log(1 + np.exp(-yz[idx]))
    out[~idx] = (-yz[~idx] + np.log(1 + np.exp(yz[~idx])))
    out = out.sum() + .5 * alpha * w.dot(w)
    return out

def gradient(w, X, y, alpha):
    # gradient of the logistic loss
    z = X.dot(w)
    z = phi(y * z)
    z0 = (z - 1) * y
    grad = X.T.dot(z0) + alpha * w
    return grad

def hessian(s, w, X, y, alpha):
    # Hessian-vector product of the logistic loss: returns H(w).dot(s)
    z = X.dot(w)
    z = phi(y * z)
    d = z * (1 - z)
    wa = d * X.dot(s)
    Hs = X.T.dot(wa)
    return Hs + alpha * s

def grad_hess(w, X, y, alpha):
    # gradient of the logistic loss AND a function computing Hessian-vector products
    z = X.dot(w)
    z = phi(y * z)
    z0 = (z - 1) * y
    grad = X.T.dot(z0) + alpha * w
    def Hs(s):
        d = z * (1 - z)
        wa = d * X.dot(s)
        return X.T.dot(wa) + alpha * s

    return grad, Hs
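
As a quick sanity check (not part of the original post), the analytical gradient can be compared against a finite-difference approximation using scipy.optimize.check_grad on a small random problem; the variable names below are only illustrative.

In [ ]:
# sanity check: analytical gradient vs. finite differences on a tiny random problem
rng = np.random.RandomState(0)
X_check = rng.randn(20, 5)
y_check = np.sign(rng.randn(20))
w_check = rng.randn(5)
print(optimize.check_grad(loss, gradient, w_check, X_check, y_check, 1.))
# the printed difference should be small (roughly 1e-6 or less)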

Benchmarks

The benchmark below generates a synthetic classification problem with 10,000 samples and 1,000 features at three levels of feature correlation; for each solver, a callback records the elapsed time and the infinity norm of the gradient at every iteration.

In [39]:
for corr in (0., 1., 10.):
    n_samples, n_features = 10 ** 4, 10 ** 3
    np.random.seed(0)
    X = np.random.randn(n_samples, n_features)
    w = np.random.randn(n_features)
    y = np.sign(X.dot(w))
    X += 0.8 * np.random.randn(n_samples, n_features)  # add noise
    X += corr  # adding a constant offset makes the features correlated
    X = np.hstack((X, np.ones((X.shape[0], 1))))  # add a column of ones for the intercept
    alpha = 1.

    # conjugate gradient
    print('CG')
    timings_cg = []
    precision_cg = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()
    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_cg.append(prec)
        timings_cg.append((datetime.now() - start).total_seconds())
    callback(w0)
    w = optimize.fmin_cg(loss, w0, fprime=gradient, 
            args=(X, y, alpha), gtol=1e-6, callback=callback, maxiter=200)

    # truncated Newton (Newton-CG)
    print('TNC')
    timings_tnc = []
    precision_tnc = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()
    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_tnc.append(prec)
        timings_tnc.append((datetime.now() - start).total_seconds())
    callback(w0)
    out = optimize.fmin_ncg(loss, w0, fprime=gradient,
            args=(X, y, alpha), avextol=1e-10,
            callback=callback, maxiter=40)

    # L-BFGS
    print('L-BFGS')
    timings_lbfgs = []
    precision_lbfgs = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()
    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_lbfgs.append(prec)
        timings_lbfgs.append((datetime.now() - start).total_seconds())
    callback(w0)
    out = optimize.fmin_l_bfgs_b(loss, w0, fprime=gradient, 
            args=(X, y, alpha), pgtol=1e-10, maxiter=200, 
            maxfun=250, factr=1e-30, callback=callback)

    # BFGS
    print('BFGS')
    timings_bfgs = []
    precision_bfgs = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()
    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_bfgs.append(prec)
        timings_bfgs.append((datetime.now() - start).total_seconds())
    callback(w0)
    out = optimize.fmin_bfgs(loss, w0, fprime=gradient, 
            args=(X, y, alpha), gtol=1e-10, maxiter=50, 
            callback=callback)
    
    # Trust Region
    print('PYTRON')
    timings_pytron = []
    precision_pytron = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()
    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_pytron.append(prec)
        timings_pytron.append((datetime.now() - start).total_seconds())
    callback(w0)
    w = pytron.minimize(loss, grad_hess, w0, args=(X, y, alpha), 
            gtol=1e-10, tol=1e-20, max_iter=100, callback=callback).x

    import pylab as pl
    pl.plot(timings_lbfgs, precision_lbfgs, label='L-BFGS', lw=5, color='green')
    pl.scatter(timings_lbfgs, precision_lbfgs, s=100, marker='o', color='green')
    pl.plot(timings_bfgs, precision_bfgs, label='BFGS', lw=5, color='red')
    pl.scatter(timings_bfgs, precision_bfgs, s=100, marker='o', color='red')
    pl.plot(timings_pytron, precision_pytron, label='Trust Region', lw=5, color='cyan')
    pl.scatter(timings_pytron, precision_pytron, s=100, marker='o', color='cyan')
    pl.plot(timings_cg, precision_cg, label='CG', lw=5, color='magenta')
    pl.scatter(timings_cg, precision_cg, s=100, marker='o', color='magenta')
    pl.plot(timings_tnc, precision_tnc, label='TNC', lw=5, color='blue')
    pl.scatter(timings_tnc, precision_tnc, s=100, marker='o', color='blue')
    pl.yscale('log')
    pl.ylabel(r'$\|\|\nabla f\|\|_\infty$', fontsize=20)
    pl.xlabel('Time (in seconds)', fontsize=20)
    fig = pl.gcf()
    fig.set_size_inches(8.5, 6.5)
    pl.legend(fontsize=20)
    if corr == 0.:
        pl.title('Numerical solvers for logistic regression\n(uncorrelated features)', fontsize=20)
        pl.xlim((0, 2))
    if corr == 1.:
        pl.title('Numerical solvers for logistic regression\n(correlated features)', fontsize=20)
        pl.xlim((0, 5))
    if corr == 10.:
        pl.title('Numerical solvers for logistic regression\n(highly correlated features)', fontsize=20)
        pl.xlim((0, 5))
    pl.ylim((10 ** -5, None))
    pl.savefig('comparison_logistic_corr_%s.png' % int(corr))
    pl.show()
CG
Optimization terminated successfully.
         Current function value: 3848.590812
         Iterations: 46
         Function evaluations: 285
         Gradient evaluations: 185
TNC
Optimization terminated successfully.
         Current function value: 3848.590812
         Iterations: 18
         Function evaluations: 21
         Gradient evaluations: 90
         Hessian evaluations: 0
L-BFGS
BFGS
Warning: Maximum number of iterations has been exceeded.
         Current function value: 3848.928087
         Iterations: 50
         Function evaluations: 79
         Gradient evaluations: 79
PYTRON
CG
Warning: Maximum number of iterations has been exceeded.
         Current function value: 3851.748082
         Iterations: 200
         Function evaluations: 478
         Gradient evaluations: 456
TNC
Optimization terminated successfully.
         Current function value: 3851.737228
         Iterations: 18
         Function evaluations: 19
         Gradient evaluations: 508
         Hessian evaluations: 0
L-BFGS
BFGS
Warning: Maximum number of iterations has been exceeded.
         Current function value: 3900.870018
         Iterations: 50
         Function evaluations: 84
         Gradient evaluations: 84
PYTRON
CG
Warning: Maximum number of iterations has been exceeded.
         Current function value: 1206599.572756
         Iterations: 200
         Function evaluations: 860
         Gradient evaluations: 574
TNC
Optimization terminated successfully.
         Current function value: 3856.768202
         Iterations: 19
         Function evaluations: 21
         Gradient evaluations: 515
         Hessian evaluations: 0
L-BFGS
BFGS
Warning: Maximum number of iterations has been exceeded.
         Current function value: 3900.796308
         Iterations: 50
         Function evaluations: 100
         Gradient evaluations: 89
PYTRON
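
As a further check (not in the original post), the trust-region solution from the last loop iteration can be roughly compared against scikit-learn's linear_model.LogisticRegression, which is already imported above. Exact agreement is not expected: with C = 1/alpha the objectives match up to the intercept term, which here is the appended column of ones and is penalized by alpha, while scikit-learn handles the intercept penalty differently depending on the solver.

In [ ]:
# rough cross-check against scikit-learn on the data from the last iteration above
clf = linear_model.LogisticRegression(C=1. / alpha, fit_intercept=True, tol=1e-10)
clf.fit(X[:, :-1], y)  # drop the column of ones; scikit-learn fits its own intercept
# maximum coefficient difference: should be small, but not exactly zero
print(np.max(np.abs(clf.coef_.ravel() - w[:-1])))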