#!/usr/bin/env python
# coding: utf-8

# # Regularization

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')

from scipy.special import expit
import scipy.sparse as sps
from scipy import stats
from pyglmnet import GLM


# In[2]:

def simpleaxis(ax):
    """Remove the top and right spines for a cleaner look."""
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()
    ax.tick_params(labelsize=14)


# In[3]:

data_source = 'pyglmnet'  # can also be 'sklearn' or 'crimedata'


# ### Simulate with `pyglmnet`

# In[4]:

if data_source == 'pyglmnet':
    N = 200
    p = 500
    corr_predictors = False

    D = dict()
    D['coef'] = dict()

    # Predictors
    D['X'] = np.random.randn(N, p)

    # Optionally mix the columns to make the predictors correlated
    if corr_predictors:
        W = sps.random(p, p, density=0.4).toarray()
        D['X'] = np.dot(D['X'], W)

    # Sparse coefficients: ~1% of the p weights are nonzero, drawn from N(0, 1)
    rvs = stats.norm(loc=0.0, scale=1.0).rvs
    D['coef']['beta0'] = 0.1
    tmp = sps.random(p, 1, density=0.01, data_rvs=rvs)
    D['coef']['beta'] = np.squeeze(tmp.toarray())

    # Targets
    glm_simulator = GLM(distr='binomial')
    D['y'] = glm_simulator.simulate(D['coef']['beta0'], D['coef']['beta'], D['X'])


# ### Split into training and validation sets

# In[5]:

from sklearn.model_selection import train_test_split
Xtrain, Xval, ytrain, yval = train_test_split(D['X'], D['y'].reshape(-1, 1),
                                              test_size=0.5)


# ### Fit the model

# In[11]:

# alpha=1.0 makes the elastic net penalty a pure lasso (L1) penalty
glm = GLM(distr='binomial',
          reg_lambda=np.logspace(np.log(5e-1), np.log(1e-3), 50, base=np.exp(1)),
          alpha=1.0)
glm.fit(Xtrain, ytrain)


# ### Regularization path

# In[12]:

R2r = list()
R2v = list()
for l, m in enumerate(glm.reg_lambda):
    ytrain_hat = glm[l].predict(Xtrain)
    yval_hat = glm[l].predict(Xval)
    R2r.append(glm[l].score(np.squeeze(ytrain), np.squeeze(ytrain_hat),
                            np.mean(ytrain), 'pseudo_R2'))
    R2v.append(glm[l].score(np.squeeze(yval), np.squeeze(yval_hat),
                            np.mean(ytrain), 'pseudo_R2'))

plt.figure(figsize=(7, 6))
ax = plt.subplot(111)
simpleaxis(ax)
plt.semilogx(glm.reg_lambda, R2r, 'ro-', alpha=0.5)
plt.semilogx(glm.reg_lambda, R2v, 'go-', alpha=0.5)
plt.ylim([-0.2, 1.2])
plt.title(r'Regularization path $\alpha = %4.2f$' % glm.alpha, fontsize=18)
plt.xlabel(r'$\lambda$', fontsize=16)
plt.ylabel(r'$R_D^2$', fontsize=16)
plt.legend(['train', 'val'], fontsize=16, frameon=False, loc='upper right')

R2r = np.array(R2r)
R2v = np.array(R2v)

# Mark the lambda with the best validation score
lopt = R2v.argmax()
plt.plot(glm.reg_lambda[lopt], 1.0, 'k*', ms=10)
plt.plot(glm.reg_lambda[lopt] * np.ones(10), np.linspace(-0.4, 1, 10), 'k--')
plt.show()
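# ### Aside: what $R_D^2$ measures

# The score plotted above is the deviance-based pseudo-$R^2$,
# $R_D^2 = 1 - D(\mathrm{model}) / D(\mathrm{null})$, where the null model
# predicts the training-set mean everywhere. Below is a minimal sketch of that
# computation for the binomial case; it re-derives the score outside pyglmnet
# purely for illustration, and the clipping constant `eps` is an assumption
# added here to avoid log(0).

# In[ ]:

def pseudo_R2_binomial(y, yhat, ynull, eps=1e-9):
    """Deviance-based pseudo-R2 for binary targets: 1 - D(model) / D(null)."""
    yhat = np.clip(yhat, eps, 1 - eps)
    # Log-likelihoods of the fitted model and the null (mean-only) model;
    # the saturated model's log-likelihood is 0 for binary y in {0, 1},
    # so D(model) = -2 * L1 and D(null) = -2 * L0
    L1 = np.sum(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))
    L0 = np.sum(y * np.log(ynull) + (1 - y) * np.log(1 - ynull))
    return 1 - L1 / L0

# Example: should be close to the value pyglmnet reports at the optimal lambda
# print(pseudo_R2_binomial(np.squeeze(yval),
#                          np.squeeze(glm[lopt].predict(Xval)),
#                          np.mean(ytrain)))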
# In[13]:

# Visualize predictions vs. (jittered) observations at the optimal lambda
plt.figure(figsize=(15, 5))

# Training set prediction
ax = plt.subplot(121)
simpleaxis(ax)
ytrain_hat = glm[lopt].predict(Xtrain)
plt.plot(ytrain + 0.05 * np.random.randn(int(0.5 * N), 1), 'ko-', alpha=0.5)
plt.plot(ytrain_hat, 'ro-', alpha=0.5)
plt.ylim([-0.5, 1.5])
plt.title("training $R_D^2 = %4.2f$" % R2r[lopt], fontsize=18)

# Validation set prediction
ax = plt.subplot(122)
simpleaxis(ax)
yval_hat = glm[lopt].predict(Xval)
plt.plot(yval + 0.05 * np.random.randn(int(0.5 * N), 1), 'ko-', alpha=0.5)
plt.plot(yval_hat, 'go-', alpha=0.5)
plt.ylim([-0.5, 1.5])
plt.title("val $R_D^2 = %4.2f$" % R2v[lopt], fontsize=18)
plt.show()


# In[14]:

# True coefficients (black stems) vs. recovered coefficients (red dots)
plt.figure(figsize=(7, 6))
ax = plt.subplot(111)
simpleaxis(ax)
plt.stem(np.arange(p), D['coef']['beta'],
         linefmt='k-', markerfmt='ko', basefmt='k-')
plt.plot(np.arange(p), glm[lopt].fit_['beta'], 'ro', alpha=0.5)
plt.xlabel('coefficient index', fontsize=16)
plt.show()


# In[10]:

# Correlation matrix of the predictors
plt.figure(figsize=(6, 5))
plt.imshow(np.corrcoef(np.transpose(D['X'])), interpolation='none')
plt.colorbar()
plt.show()


# In[ ]:
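# ### Aside: cross-checking with scikit-learn

# As a rough sanity check (not part of the original analysis), an L1-penalized
# logistic regression from scikit-learn should recover a similarly sparse
# coefficient pattern. scikit-learn parameterizes regularization with the
# inverse strength C, so C ~ 1 / (n_samples * lambda) is only an approximate
# correspondence to pyglmnet's reg_lambda; this mapping is an assumption here.

# In[ ]:

from sklearn.linear_model import LogisticRegression

# Approximate mapping from the optimal pyglmnet lambda to sklearn's C (assumed)
C_approx = 1.0 / (Xtrain.shape[0] * glm.reg_lambda[lopt])
logit_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=C_approx)
logit_l1.fit(Xtrain, np.squeeze(ytrain))
print('nonzero sklearn coefficients:', np.sum(logit_l1.coef_ != 0))
print('nonzero pyglmnet coefficients:', np.sum(glm[lopt].fit_['beta'] != 0))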