Notebook

In [1]:

import numpy as np
%load_ext rpy2.ipython
%R library(glmnet)
from selection.algorithms.logistic import instance
import regreg.api as rr

/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: Matrix

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loading required package: foreach

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: foreach: simple, scalable parallel programming from Revolution Analytics
Use Revolution R for scalability, fault tolerance and more.
http://www.revolutionanalytics.com

  res = super(Function, self).__call__(*new_args, **new_kwargs)
/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/rpy2/robjects/functions.py:106: UserWarning: Loaded glmnet 2.0-2


  res = super(Function, self).__call__(*new_args, **new_kwargs)
/Users/jonathantaylor/anaconda/lib/python2.7/site-packages/selection/algorithms/lasso.py:32: UserWarning: cvx not available
  warnings.warn('cvx not available')

In [2]:

X, Y, beta, active = instance()
n, p = X.shape
n, p

Out[2]:

(100, 200)

Analog to active KKT conditions¶

Logistic regression with LASSO penalty is

$\text{minimize}_{\beta} \frac{1}{n} \ell(\beta) + \lambda \|\beta\|_1$ where

$\ell$ is negative-log of the logistic likelihood.

It should not be too hard to convince ourselves that the analog of the active block of the KKT conditions for logistic regression with LASSO penalty is

$\text{sign}\left(\bar{\beta}_E - \lambda Q_E(\bar{\beta}_E)^{-1}z_E\right) = z_E$ where

$\bar{\beta}_E$ are the unpenalized logistic estimators for that active set and

$\begin{aligned} Q_E(\beta_E) &= \frac{1}{n} X_E^TW_E(\beta_E;X)X_E \\ W_E(\beta_E) &= \text{diag}(\pi_E(\beta_E;X)(1 - \pi_E(\beta_E;X))) \\ \pi_E(\beta_E;X) &= \frac{\exp(X_E\beta_E)}{1 + \exp(X_E\beta_E)} \end{aligned}$

In [3]:

def simulate(n=100, p=50, rho=0.3, s=5, snr=5, lam_frac=0.8, 
             choice='theoretical',
             use_glmnet=True, 
             approx='unpenalized'):
    X, Y, beta, true_active = instance(n=n, p=p, rho=rho, s=s, snr=snr)
    
    n, p = X.shape
    
    # add intercept
    X1 = np.hstack([np.ones((n,1)), X])
    X -= X.mean(0)[None,:]

    if choice != 'theoretical':
        lam = lam_frac * np.fabs(X.T.dot(Y)).max() / n
    else:
        Z = np.random.binomial(1, 0.5, size=(n, 5000))
        lam = np.fabs(X.T.dot(Z)).max(0) / n
        lam = lam.mean()
        
    # solve the penalized problem
    
    %R -i X,Y,lam
    %R Y = as.numeric(Y)
    %R G = glmnet(X, Y, family='binomial', standardize=FALSE)
    %R soln = as.numeric(coef(G, lam, exact=TRUE))
    soln_glmnet = %R soln

    if use_glmnet:
        soln = soln_glmnet
    else: # use regreg
        loss = rr.logistic_loss(X1, Y.copy(), coef=0.5)
        weights = np.ones(p+1)
        weights[0] = 0.
        penalty = rr.weighted_l1norm(weights, lagrange=lam)
        problem = rr.simple_problem(loss, penalty)
        soln = problem.solve(min_its=200, tol=1.e-16)
    
    active = (soln != 0)
    active_signs = np.sign(soln[active])
    active_signs[0] = 0.
    
    # solve the unpenalized problem restricted 
    # to the active set
    
    X_E = X1[:,active]
    restricted_loss = rr.logistic_loss(X_E, Y)
    unpenalized = restricted_loss.solve(min_its=200) # \bar{\beta}_E
    
    if approx == 'unpenalized':
        lin_pred = X_E.dot(unpenalized)
    elif approx == 'truth':
        lin_pred = X.dot(beta)
    else:
        lin_pred = X_E.dot(soln[active])
    
    pi_E = np.exp(lin_pred) / (1 + np.exp(lin_pred))
    W_E = pi_E * (1 - pi_E)
    Q_E = np.dot(X_E.T, W_E[:, None] * X_E)
    
    # check that the penalized coefficients satisfy active 
    # KKT conditions
    affine_active = active_signs[1:] * (unpenalized - lam * np.linalg.solve(Q_E, active_signs))[1:]
    
    # how many things violate the KKT conditions?
    
    return (affine_active < 0).sum()

simulate(n=100, use_glmnet=False)

Out[3]:

Using glmnet to solve the problem¶

In [6]:

np.random.seed(0)
ntrial = 200
failed = 0

for i in range(ntrial):
    failed += simulate(n=100, use_glmnet=True) > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 4

Using regreg¶

In [7]:

np.random.seed(0)
ntrial = 200
failed = 0

for _ in range(ntrial):
    failed += simulate(use_glmnet=False) > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 4

More features¶

Let's up the features to $p=200$ .

In [8]:

np.random.seed(0)
ntrial = 200
failed = 0

for _ in range(ntrial):
    failed += simulate(p=200, use_glmnet=True) > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 28

Using true parameters for quadratic¶

Let's expand around $\beta_E^*$

In [9]:

np.random.seed(0)
ntrial = 200
failed = 0

for _ in range(ntrial):
    failed += simulate(p=50, use_glmnet=True, approx='truth') > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 4

More features¶

In [10]:

np.random.seed(0)
ntrial = 200
failed = 0

for _ in range(ntrial):
    failed += simulate(p=200, use_glmnet=True, approx='truth') > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 29

Yet another quadratic¶

Let's try expanding around $\hat{\beta}_E$ .

In [11]:

np.random.seed(0)
ntrial = 200
failed = 0

for _ in range(ntrial):
    failed += simulate(p=200, use_glmnet=True, approx=None) > 0

print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 29

Increasing sample size improves the problem¶

In [13]:

np.random.seed(0)
ntrial = 200
failed = 0

for i in range(ntrial):
    failed += simulate(n=500, p=50, use_glmnet=True) > 0

print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 0

In [15]:

np.random.seed(0)
ntrial = 200
failed = 0

for i in range(ntrial):
    failed += simulate(n=500, p=200, use_glmnet=True) > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 3

In [16]:

np.random.seed(0)
ntrial = 200
failed = 0

for i in range(ntrial):
    failed += simulate(n=1000, p=200, use_glmnet=True) > 0
    
print 'Total number of failures out of %d: %d' % (ntrial, failed)

Total number of failures out of 200: 0

Pure R code¶

In principle, I think the code below should work but I keep getting a few errors. Maybe you can fix it?

In [ ]:

%%R
library(glmnet)

simulate = function(n=100,
                    p=80,
                    s=5,
                    snr=5,
                    rho=0.3,
                    lam_frac=0.8,
                    X=NULL,
                    Z=NULL,
                    lam=NULL) {

    # create some data
    # where the logistic model is
    # actually true
    
    if (is.null(X)) {
        X = sqrt(1-rho) * matrix(rnorm(n*p), n, p) + sqrt(rho) * outer(rnorm(n), rep(1, p))
        X = scale(X)
        X = X / sqrt(n)
    }
    
    if (is.null(Z)) {
        beta = matrix(0, p)
        beta[1:s] = snr

        eta = X %*% beta
        pi = exp(eta) / (1 + exp(eta))

        Z = rbinom(n, 1, pi)
    }
    
    # choose a fixed value of \lambda
    # if not provided

    if (is.null(lam)) {
        Z0 = matrix(rbinom(n*5000, 1, 0.5), n, 5000)
        lam = as.numeric(lam_frac * mean(apply(abs(t(X) %*% Z0), 2, max)) / n)
    }
    
    G = glmnet(X,
               Z,
               family='binomial',
               #intercept=FALSE,
               standardize=FALSE)

    # fails here for some reason

    print('makes it to here')
    soln = coef(G, s=lam, exact=TRUE)
    print('but fails above')
    soln = soln[2:length(soln)] # get rid of Intercept column
    active = which(soln != 0)

    X_E = X[,active]
    
    unpenalized = glm.fit(X_E, Z, family=binomial('logit'))$coef 
    linpred = X_E %*% unpenalized
    
    pi_E = exp(linpred) / (1 + exp(linpred))
    W_E = as.numeric(pi_E * (1 - pi_E))
    Q_E = t(X_E) %*% (diag(W_E) %*% X_E)

    active_signs = sign(soln)[active]
    affine_active = active_signs * (unpenalized - lam * solve(Q_E, active_signs))

    return(sum(affine_active < 0))
    
}

simulate()

failures = 0

for (i in 1:20) {
    failures = failures + (simulate() > 0)
}
print(failures)