import numpy as np
%load_ext rpy2.ipython
%R library(glmnet)
from selection.algorithms.logistic import instance
import regreg.api as rr
/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: Matrix res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: foreach res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: foreach: simple, scalable parallel programming from Revolution Analytics Use Revolution R for scalability, fault tolerance and more. http://www.revolutionanalytics.com res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loaded glmnet 2.0-2 res = super(Function, self).__call__(*new_args, **new_kwargs) /Users/jonathantaylor/anaconda/lib/python2.7/site-packages/selection/algorithms/lasso.py:32: UserWarning: cvx not available warnings.warn('cvx not available')
X, Y, beta, active = instance()
n, p = X.shape
n, p
(100, 200)
Logistic regression with LASSO penalty is minimizeβ1nℓ(β)+λ‖β‖1
It should not be too hard to convince ourselves that the analog of the active block of the KKT conditions for logistic regression with LASSO penalty is sign(ˉβE−λQE(ˉβE)−1zE)=zE
def simulate(n=100, p=50, rho=0.3, s=5, snr=5, lam_frac=0.8,
choice='theoretical',
use_glmnet=True,
approx='unpenalized'):
X, Y, beta, true_active = instance(n=n, p=p, rho=rho, s=s, snr=snr)
n, p = X.shape
# add intercept
X1 = np.hstack([np.ones((n,1)), X])
X -= X.mean(0)[None,:]
if choice != 'theoretical':
lam = lam_frac * np.fabs(X.T.dot(Y)).max() / n
else:
Z = np.random.binomial(1, 0.5, size=(n, 5000))
lam = np.fabs(X.T.dot(Z)).max(0) / n
lam = lam.mean()
# solve the penalized problem
%R -i X,Y,lam
%R Y = as.numeric(Y)
%R G = glmnet(X, Y, family='binomial', standardize=FALSE)
%R soln = as.numeric(coef(G, lam, exact=TRUE))
soln_glmnet = %R soln
if use_glmnet:
soln = soln_glmnet
else: # use regreg
loss = rr.logistic_loss(X1, Y.copy(), coef=0.5)
weights = np.ones(p+1)
weights[0] = 0.
penalty = rr.weighted_l1norm(weights, lagrange=lam)
problem = rr.simple_problem(loss, penalty)
soln = problem.solve(min_its=200, tol=1.e-16)
active = (soln != 0)
active_signs = np.sign(soln[active])
active_signs[0] = 0.
# solve the unpenalized problem restricted
# to the active set
X_E = X1[:,active]
restricted_loss = rr.logistic_loss(X_E, Y)
unpenalized = restricted_loss.solve(min_its=200) # \bar{\beta}_E
if approx == 'unpenalized':
lin_pred = X_E.dot(unpenalized)
elif approx == 'truth':
lin_pred = X.dot(beta)
else:
lin_pred = X_E.dot(soln[active])
pi_E = np.exp(lin_pred) / (1 + np.exp(lin_pred))
W_E = pi_E * (1 - pi_E)
Q_E = np.dot(X_E.T, W_E[:, None] * X_E)
# check that the penalized coefficients satisfy active
# KKT conditions
affine_active = active_signs[1:] * (unpenalized - lam * np.linalg.solve(Q_E, active_signs))[1:]
# how many things violate the KKT conditions?
return (affine_active < 0).sum()
simulate(n=100, use_glmnet=False)
0
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=100, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 4
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(use_glmnet=False) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 4
Let's up the features to p=200.
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=200, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 28
Let's expand around β∗E
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=50, use_glmnet=True, approx='truth') > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 4
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=200, use_glmnet=True, approx='truth') > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 29
Let's try expanding around ˆβE.
np.random.seed(0)
ntrial = 200
failed = 0
for _ in range(ntrial):
failed += simulate(p=200, use_glmnet=True, approx=None) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 29
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=500, p=50, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 0
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=500, p=200, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 3
np.random.seed(0)
ntrial = 200
failed = 0
for i in range(ntrial):
failed += simulate(n=1000, p=200, use_glmnet=True) > 0
print 'Total number of failures out of %d: %d' % (ntrial, failed)
Total number of failures out of 200: 0
In principle, I think the code below should work but I keep getting a few errors. Maybe you can fix it?
%%R
library(glmnet)
simulate = function(n=100,
p=80,
s=5,
snr=5,
rho=0.3,
lam_frac=0.8,
X=NULL,
Z=NULL,
lam=NULL) {
# create some data
# where the logistic model is
# actually true
if (is.null(X)) {
X = sqrt(1-rho) * matrix(rnorm(n*p), n, p) + sqrt(rho) * outer(rnorm(n), rep(1, p))
X = scale(X)
X = X / sqrt(n)
}
if (is.null(Z)) {
beta = matrix(0, p)
beta[1:s] = snr
eta = X %*% beta
pi = exp(eta) / (1 + exp(eta))
Z = rbinom(n, 1, pi)
}
# choose a fixed value of \lambda
# if not provided
if (is.null(lam)) {
Z0 = matrix(rbinom(n*5000, 1, 0.5), n, 5000)
lam = as.numeric(lam_frac * mean(apply(abs(t(X) %*% Z0), 2, max)) / n)
}
G = glmnet(X,
Z,
family='binomial',
#intercept=FALSE,
standardize=FALSE)
# fails here for some reason
print('makes it to here')
soln = coef(G, s=lam, exact=TRUE)
print('but fails above')
soln = soln[2:length(soln)] # get rid of Intercept column
active = which(soln != 0)
X_E = X[,active]
unpenalized = glm.fit(X_E, Z, family=binomial('logit'))$coef
linpred = X_E %*% unpenalized
pi_E = exp(linpred) / (1 + exp(linpred))
W_E = as.numeric(pi_E * (1 - pi_E))
Q_E = t(X_E) %*% (diag(W_E) %*% X_E)
active_signs = sign(soln)[active]
affine_active = active_signs * (unpenalized - lam * solve(Q_E, active_signs))
return(sum(affine_active < 0))
}
simulate()
failures = 0
for (i in 1:20) {
failures = failures + (simulate() > 0)
}
print(failures)