Implementation of logistic regression in pure Python using tools from scipy.optimize. This is a companion notebook to the blog post Numerical optimizers for Logistic Regression.
The loss function, using vector notation and abusing broadcasting, is:

$$\mathcal{L}(w) = - \mathbb{1}^T \log(\phi(y * X w)) + \frac{\alpha}{2} w^T w$$

where $*$ is the component-wise product (numpy's $*$ operator) and $\phi(t) = 1 / (1 + \exp(-t))$ is the logistic function.
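For reference, differentiating the loss (and using $y^2 = 1$ for labels in $\{-1, 1\}$) gives the gradient and the Hessian-vector product that the functions below implement:

$$\nabla \mathcal{L}(w) = X^T \big( (\phi(y * X w) - \mathbb{1}) * y \big) + \alpha w$$

$$\nabla^2 \mathcal{L}(w) \, s = X^T \big( d * (X s) \big) + \alpha s, \qquad d = \phi(y * X w) * (\mathbb{1} - \phi(y * X w))$$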
from __future__ import print_function
import numpy as np
from scipy import linalg, optimize
from datetime import datetime
import pytron
import pylab as pl
def phi(t):
    # logistic function, computed so that np.exp never overflows:
    # for t > 0 use 1 / (1 + exp(-t)), otherwise exp(t) / (1 + exp(t))
    idx = t > 0
    out = np.empty(t.size, dtype=np.float64)
    out[idx] = 1. / (1 + np.exp(-t[idx]))
    exp_t = np.exp(t[~idx])
    out[~idx] = exp_t / (1. + exp_t)
    return out
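A quick sanity check (illustrative, not part of the benchmark): the branching above avoids the overflow that a naive `1. / (1 + np.exp(-t))` hits for large negative `t`:

# illustrative check: phi stays finite and warning-free across extreme inputs
print(phi(np.array([-800., 0., 800.])))  # approximately [0., 0.5, 1.]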
def loss(w, X, y, alpha):
    # logistic loss function to be minimized, evaluated in a
    # numerically stable way depending on the sign of y * Xw
    z = X.dot(w)
    yz = y * z
    idx = yz > 0
    out = np.zeros_like(yz)
    out[idx] = np.log(1 + np.exp(-yz[idx]))
    out[~idx] = -yz[~idx] + np.log(1 + np.exp(yz[~idx]))
    out = out.sum() + .5 * alpha * w.dot(w)
    return out
def gradient(w, X, y, alpha):
    # gradient of the logistic loss
    z = X.dot(w)
    z = phi(y * z)
    z0 = (z - 1) * y
    grad = X.T.dot(z0) + alpha * w
    return grad
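The analytic gradient can be verified against finite differences with `optimize.check_grad`; this is a small verification sketch on random data, with illustrative variable names:

# verification sketch (small random problem, not the benchmark data)
X_chk = np.random.randn(20, 5)
y_chk = np.sign(np.random.randn(20))
w_chk = np.random.randn(5)
# difference between analytic and finite-difference gradients;
# should be small, on the order of 1e-6
print(optimize.check_grad(loss, gradient, w_chk, X_chk, y_chk, 1.))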
def hessian(s, w, X, y, alpha):
    # Hessian of the logistic loss applied to the vector s
    # (a Hessian-vector product, so the full matrix is never formed)
    z = X.dot(w)
    z = phi(y * z)
    d = z * (1 - z)
    wa = d * X.dot(s)
    Hs = X.T.dot(wa)
    out = Hs + alpha * s
    return out
def grad_hess(w, X, y, alpha):
    # gradient of the logistic loss AND a function computing
    # Hessian-vector products, reusing the shared quantity phi(y * Xw)
    z = X.dot(w)
    z = phi(y * z)
    z0 = (z - 1) * y
    grad = X.T.dot(z0) + alpha * w

    def Hs(s):
        d = z * (1 - z)
        wa = d * X.dot(s)
        return X.T.dot(wa) + alpha * s

    return grad, Hs
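Similarly, the Hessian-vector product can be checked against a finite difference of the gradient, since $\nabla^2 \mathcal{L}(w) s \approx (\nabla \mathcal{L}(w + \epsilon s) - \nabla \mathcal{L}(w)) / \epsilon$ for small $\epsilon$ (again an illustrative sketch, reusing the random data from the previous check):

# verification sketch: hessian() should match a finite difference of gradient()
s_chk = np.random.randn(5)
eps = 1e-6
hs = hessian(s_chk, w_chk, X_chk, y_chk, 1.)
fd = (gradient(w_chk + eps * s_chk, X_chk, y_chk, 1.)
      - gradient(w_chk, X_chk, y_chk, 1.)) / eps
print(linalg.norm(hs - fd))  # should be small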
for corr in (0., 1., 10.):
    n_samples, n_features = 10 ** 4, 10 ** 3
    np.random.seed(0)
    X = np.random.randn(n_samples, n_features)
    w = np.random.randn(n_features)
    y = np.sign(X.dot(w))
    X += 0.8 * np.random.randn(n_samples, n_features)  # add noise
    X += corr  # adding a constant term makes the features correlated
    X = np.hstack((X, np.ones((X.shape[0], 1))))  # column of ones for the intercept
    alpha = 1.
    # conjugate gradient
    print('CG')
    timings_cg = []
    precision_cg = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()

    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_cg.append(prec)
        timings_cg.append((datetime.now() - start).total_seconds())

    callback(w0)
    w = optimize.fmin_cg(loss, w0, fprime=gradient,
                         args=(X, y, alpha), gtol=1e-6, callback=callback,
                         maxiter=200)
    # truncated Newton (scipy's Newton-CG implementation)
    print('TNC')
    timings_tnc = []
    precision_tnc = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()

    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_tnc.append(prec)
        timings_tnc.append((datetime.now() - start).total_seconds())

    callback(w0)
    out = optimize.fmin_ncg(loss, w0, fprime=gradient,
                            args=(X, y, alpha), avextol=1e-10,
                            callback=callback, maxiter=40)
    # L-BFGS
    print('L-BFGS')
    timings_lbfgs = []
    precision_lbfgs = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()

    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_lbfgs.append(prec)
        timings_lbfgs.append((datetime.now() - start).total_seconds())

    callback(w0)
    out = optimize.fmin_l_bfgs_b(loss, w0, fprime=gradient,
                                 args=(X, y, alpha), pgtol=1e-10, maxiter=200,
                                 maxfun=250, factr=1e-30, callback=callback)
    # BFGS
    print('BFGS')
    timings_bfgs = []
    precision_bfgs = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()

    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_bfgs.append(prec)
        timings_bfgs.append((datetime.now() - start).total_seconds())

    callback(w0)
    out = optimize.fmin_bfgs(loss, w0, fprime=gradient,
                             args=(X, y, alpha), gtol=1e-10, maxiter=50,
                             callback=callback)
    # Trust Region (pytron)
    print('PYTRON')
    timings_pytron = []
    precision_pytron = []
    w0 = np.zeros(X.shape[1])
    start = datetime.now()

    def callback(x0):
        prec = linalg.norm(gradient(x0, X, y, alpha), np.inf)
        precision_pytron.append(prec)
        timings_pytron.append((datetime.now() - start).total_seconds())

    callback(w0)
    w = pytron.minimize(loss, grad_hess, w0, args=(X, y, alpha),
                        gtol=1e-10, tol=1e-20, max_iter=100,
                        callback=callback).x
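    # A scipy-only alternative to pytron (a sketch, left commented out so the
    # benchmark is unchanged): minimize with method='trust-ncg' accepts a
    # Hessian-vector product via hessp.  scipy calls hessp(w, s, *args) while
    # hessian() above takes (s, w, ...), hence the argument-swapping wrapper.
    # res = optimize.minimize(loss, w0, args=(X, y, alpha),
    #                         method='trust-ncg', jac=gradient,
    #                         hessp=lambda w, s, *a: hessian(s, w, *a),
    #                         options={'gtol': 1e-10, 'maxiter': 100})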
    pl.plot(timings_lbfgs, precision_lbfgs, label='L-BFGS', lw=5, color='green')
    pl.scatter(timings_lbfgs, precision_lbfgs, s=100, marker='o', color='green')
    pl.plot(timings_bfgs, precision_bfgs, label='BFGS', lw=5, color='red')
    pl.scatter(timings_bfgs, precision_bfgs, s=100, marker='o', color='red')
    pl.plot(timings_pytron, precision_pytron, label='Trust Region', lw=5, color='cyan')
    pl.scatter(timings_pytron, precision_pytron, s=100, marker='o', color='cyan')
    pl.plot(timings_cg, precision_cg, label='CG', lw=5, color='magenta')
    pl.scatter(timings_cg, precision_cg, s=100, marker='o', color='magenta')
    pl.plot(timings_tnc, precision_tnc, label='TNC', lw=5, color='blue')
    pl.scatter(timings_tnc, precision_tnc, s=100, marker='o', color='blue')
    pl.yscale('log')
    pl.ylabel(r'$\|\nabla f\|_\infty$', fontsize=20)
    pl.xlabel('Time (in seconds)', fontsize=20)
    fig = pl.gcf()
    fig.set_size_inches(8.5, 6.5)
    pl.legend(fontsize=20)
    if corr == 0.:
        pl.title('Numerical solvers for logistic regression\n(uncorrelated features)', fontsize=20)
        pl.xlim((0, 2))
    if corr == 1.:
        pl.title('Numerical solvers for logistic regression\n(correlated features)', fontsize=20)
        pl.xlim((0, 5))
    if corr == 10.:
        pl.title('Numerical solvers for logistic regression\n(highly correlated features)', fontsize=20)
        pl.xlim((0, 5))
    pl.ylim((10 ** -5, None))
    pl.savefig('comparison_logistic_corr_%s.png' % int(corr))
    pl.show()
Console output of the three runs (fmin_l_bfgs_b and pytron.minimize print no convergence summary, so only their labels appear):

corr = 0.:
CG: Optimization terminated successfully. Current function value: 3848.590812. Iterations: 46, function evaluations: 285, gradient evaluations: 185.
TNC: Optimization terminated successfully. Current function value: 3848.590812. Iterations: 18, function evaluations: 21, gradient evaluations: 90, Hessian evaluations: 0.
L-BFGS: (no summary printed)
BFGS: Warning: Maximum number of iterations has been exceeded. Current function value: 3848.928087. Iterations: 50, function evaluations: 79, gradient evaluations: 79.
PYTRON: (no summary printed)

corr = 1.:
CG: Warning: Maximum number of iterations has been exceeded. Current function value: 3851.748082. Iterations: 200, function evaluations: 478, gradient evaluations: 456.
TNC: Optimization terminated successfully. Current function value: 3851.737228. Iterations: 18, function evaluations: 19, gradient evaluations: 508, Hessian evaluations: 0.
L-BFGS: (no summary printed)
BFGS: Warning: Maximum number of iterations has been exceeded. Current function value: 3900.870018. Iterations: 50, function evaluations: 84, gradient evaluations: 84.
PYTRON: (no summary printed)

corr = 10.:
CG: Warning: Maximum number of iterations has been exceeded. Current function value: 1206599.572756. Iterations: 200, function evaluations: 860, gradient evaluations: 574.
TNC: Optimization terminated successfully. Current function value: 3856.768202. Iterations: 19, function evaluations: 21, gradient evaluations: 515, Hessian evaluations: 0.
L-BFGS: (no summary printed)
BFGS: Warning: Maximum number of iterations has been exceeded. Current function value: 3900.796308. Iterations: 50, function evaluations: 100, gradient evaluations: 89.
PYTRON: (no summary printed)