import numpy as np
%load_ext rpy2.ipython
%R library(glmnet)
from selection.algorithms.logistic import instance
import regreg.api as rr
/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: Matrix res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: foreach res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: foreach: simple, scalable parallel programming from Revolution Analytics Use Revolution R for scalability, fault tolerance and more. http://www.revolutionanalytics.com res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loaded glmnet 2.0-2 res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/selection/algorithms/lasso.py:32: UserWarning: cvx not available warnings.warn('cvx not available')
X, Y, beta, active = instance()
n, p = X.shape
n, p
(100, 200)
Logistic regression with LASSO penalty is $$ \text{minimize}_{\beta} \frac{1}{n} \ell(\beta) + \lambda \|\beta\|_1 $$ where $\ell$ is negative-log of the logistic likelihood.
It should not be too hard to convince ourselves that the analog of the active block of the KKT conditions for logistic regression with LASSO penalty is $$ \text{sign}\left(\bar{\beta}_E - \lambda Q_E(\bar{\beta}_E)^{-1}z_E\right) = z_E $$ where $\bar{\beta}_E$ are the unpenalized logistic estimators for that active set and $$ \begin{aligned} Q_E(\beta_E) &= \frac{1}{n} X_E^TW_E(\beta_E;X)X_E \\ W_E(\beta_E) &= \text{diag}(\pi_E(\beta_E;X)(1 - \pi_E(\beta_E;X))) \\ \pi_E(\beta_E;X) &= \frac{\exp(X_E\beta_E)}{1 + \exp(X_E\beta_E)} \end{aligned} $$
def simulate(n=100, p=50, rho=0.3, s=5, snr=5, lam_frac=0.8,
choice='theoretical',
use_glmnet=True,
approx='unpenalized'):
X, Y, beta, true_active = instance(n=n, p=p, rho=rho, s=s, snr=snr)
n, p = X.shape
# add intercept
X1 = np.hstack([np.ones((n,1)), X])
X -= X.mean(0)[None,:]
if choice != 'theoretical':
lam = lam_frac * np.fabs(X.T.dot(Y)).max() / n
else:
Z = np.random.binomial(1, 0.5, size=(n, 5000))
lam = np.fabs(X.T.dot(Z)).max(0) / n
lam = lam.mean()
# solve the penalized problem
%R -i X,Y,lam
%R Y = as.numeric(Y)
%R G = glmnet(X, Y, family='binomial', standardize=FALSE)
%R soln = as.numeric(coef(G, lam, exact=TRUE))
soln_glmnet = %R soln
if use_glmnet:
soln = soln_glmnet
else: # use regreg
loss = rr.logistic_loss(X1, Y.copy(), coef=0.5)
weights = np.ones(p+1)
weights[0] = 0.
penalty = rr.weighted_l1norm(weights, lagrange=lam)
problem = rr.simple_problem(loss, penalty)
soln = problem.solve(min_its=200, tol=1.e-16)
active = (soln != 0)
active_signs = np.sign(soln[active])
active_signs[0] = 0.
# solve the unpenalized problem restricted
# to the active set
X_E = X1[:,active]
restricted_loss = rr.logistic_loss(X_E, Y)
unpenalized = restricted_loss.solve(min_its=200) # \bar{\beta}_E
if approx == 'unpenalized':
lin_pred = X_E.dot(unpenalized)
elif approx == 'truth':
lin_pred = X.dot(beta)
else:
lin_pred = X_E.dot(soln[active])
pi_E = np.exp(lin_pred) / (1 + np.exp(lin_pred))
W_E = pi_E * (1 - pi_E)
Q_E = np.dot(X_E.T, W_E[:, None] * X_E)
# check that the penalized coefficients satisfy active
# KKT conditions
affine_active = active_signs[1:] * (unpenalized - lam * np.linalg.solve(Q_E, active_signs))[1:]
# how many things violate the KKT conditions?
return (affine_active < 0).sum()
simulate(n=100, use_glmnet=False)
0
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=100, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 4
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(use_glmnet=False) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 4
Let's up the features to $p=200$.
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=200, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 28
Let's expand around $\beta_E^*$
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=50, use_glmnet=True, approx='truth') > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 4
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=200, use_glmnet=True, approx='truth') > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 29
Let's try expanding around $\hat{\beta}_E$.
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=200, use_glmnet=True, approx=None) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 29
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=500, p=50, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 0
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=500, p=200, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 3
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=1000, p=200, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 0
In principle, I think the code below should work but I keep getting a few errors. Maybe you can fix it?
%%R
library(glmnet)
simulate = function(n=100,
p=80,
s=5,
snr=5,
rho=0.3,
lam_frac=0.8,
X=NULL,
Z=NULL,
lam=NULL) {
# create some data
# where the logistic model is
# actually true
if (is.null(X)) {
X = sqrt(1-rho) * matrix(rnorm(n*p), n, p) + sqrt(rho) * outer(rnorm(n), rep(1, p))
X = scale(X)
X = X / sqrt(n)
}
if (is.null(Z)) {
beta = matrix(0, p)
beta[1:s] = snr
eta = X %*% beta
pi = exp(eta) / (1 + exp(eta))
Z = rbinom(n, 1, pi)
}
# choose a fixed value of \lambda
# if not provided
if (is.null(lam)) {
Z0 = matrix(rbinom(n*5000, 1, 0.5), n, 5000)
lam = as.numeric(lam_frac * mean(apply(abs(t(X) %*% Z0), 2, max)) / n)
}
G = glmnet(X,
Z,
family='binomial',
#intercept=FALSE,
standardize=FALSE)
# fails here for some reason
print('makes it to here')
soln = coef(G, s=lam, exact=TRUE)
print('but fails above')
soln = soln[2:length(soln)] # get rid of Intercept column
active = which(soln != 0)
X_E = X[,active]
unpenalized = glm.fit(X_E, Z, family=binomial('logit'))$coef
linpred = X_E %*% unpenalized
pi_E = exp(linpred) / (1 + exp(linpred))
W_E = as.numeric(pi_E * (1 - pi_E))
Q_E = t(X_E) %*% (diag(W_E) %*% X_E)
active_signs = sign(soln)[active]
affine_active = active_signs * (unpenalized - lam * solve(Q_E, active_signs))
return(sum(affine_active < 0))
}
simulate()
failures = 0
for (i in 1:20) {
failures = failures + (simulate() > 0)
}
print(failures)