#!/usr/bin/env python
# coding: utf-8

# # Spam classification using logistic regression
# 
# We have 4601 emails, each described by 57 features. The data can be found on [GitHub](https://github.com/probml/pmtk3/tree/master/data/spamData). The column names can be found [here](ftp://ftp.ics.uci.edu/pub/machine-learning-databases/spambase/spambase.names). We want to predict whether an email is spam or not, so we have a binary response, where 1 indicates spam and 0 indicates non-spam.
# 
# You can find some utility functions on [GitHub](https://github.com/ppham27/MLaPP-solutions/blob/master/chap08/classifiers.py).

# In[1]:

get_ipython().run_line_magic('matplotlib', 'inline')
import numpy as np
import pandas as pd
from classifiers import read_spam_data, transform_log, transform_binary
#from sklearn.linear_model import LogisticRegression # reference sklearn implementation
from classifiers import LogisticRegression

train_data, test_data = read_spam_data()
train_data.head()

# In logistic regression, we let
# 
# \begin{equation}
# y \mid \mathbf{x} \sim \operatorname{Bernoulli}\left(\frac{1}{1 + \exp\left(-\left(w_0 + \mathbf{w}^\intercal \mathbf{x}\right)\right)}\right),
# \end{equation}
# 
# so if we make $N$ observations, the likelihood is
# 
# \begin{equation}
# L\left(w_0, \mathbf{w}\right) = \prod_{i=1}^N\left(\frac{1}{1 + \exp\left(-\left(w_0 + \mathbf{w}^\intercal \mathbf{x}_i\right)\right)}\right)^{y_i}\left(\frac{1}{1 + \exp\left(w_0 + \mathbf{w}^\intercal \mathbf{x}_i\right)}\right)^{1-y_i}.
# \end{equation}
# 
# Define $\tilde{y}_i = 2y_i - 1.$ We want to minimize the negative log-likelihood, which is
# 
# \begin{equation}
# J\left(w_0, \mathbf{w}\right) = \sum_{i=1}^N\log\left(1 + \exp\left(-\tilde{y}_i\left(w_0 + \mathbf{w}^\intercal \mathbf{x}_i\right)\right)\right).
# \end{equation}
# 
# This objective is convex, but it need not have a finite minimizer (with linearly separable data the weights diverge), and it can be poorly conditioned, so we add some regularization. Now, we minimize
# 
# \begin{equation}
# J_\lambda\left(w_0, \mathbf{w}\right) = \sum_{i=1}^N\log\left(1 + \exp\left(-\tilde{y}_i\left(w_0 + \mathbf{w}^\intercal \mathbf{x}_i\right)\right)\right) + \frac{\lambda}{2}\mathbf{w}^\intercal\mathbf{w}.
# \end{equation}
# 
# Note that there is no penalty on $w_0$. One way to minimize this is with Newton's method. To do so, we need the gradient $\mathbf{g}$ and the Hessian $\mathbf{H}$. In exercise 3 (not shown), I show that
# 
# \begin{align}
# \mathbf{g}\left(w_0, \mathbf{w}\right) &= \tilde{\mathbf{X}}^\intercal\left(\boldsymbol\mu - \mathbf{y}\right) + \lambda\begin{pmatrix}0 \\ \mathbf{w}\end{pmatrix} \\
# \mathbf{H}\left(w_0, \mathbf{w}\right) &= \tilde{\mathbf{X}}^\intercal \mathbf{M} \tilde{\mathbf{X}}
# + \lambda
# \begin{pmatrix}
# 0 & \mathbf{0} \\
# \mathbf{0} & I
# \end{pmatrix},
# \end{align}
# 
# where $\tilde{\mathbf{X}}$ is the data matrix with a column of 1s inserted as the first column. Also, we have defined
# 
# \begin{equation}
# \boldsymbol\mu = \begin{pmatrix}
# \left(1 + \exp\left(-\left(w_0 + \mathbf{w}^\intercal\mathbf{x}_1\right)\right)\right)^{-1}\\
# \left(1 + \exp\left(-\left(w_0 + \mathbf{w}^\intercal\mathbf{x}_2\right)\right)\right)^{-1}\\
# \vdots \\
# \left(1 + \exp\left(-\left(w_0 + \mathbf{w}^\intercal\mathbf{x}_N\right)\right)\right)^{-1}\\
# \end{pmatrix},
# ~\text{and}~
# \mathbf{M} = \operatorname{diag}\left(
# \mu_1(1-\mu_1), \mu_2(1-\mu_2), \ldots, \mu_N(1-\mu_N)
# \right).
# \end{equation}
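
# Before moving on to the Newton update, here is a small sanity-check sketch of how $\boldsymbol\mu$, $\mathbf{g}$, and $\mathbf{H}$ could be computed from these formulas. This is only an illustration, not the implementation in `classifiers.py`; the function name and signature are assumptions.

# In[ ]:

def gradient_hessian(theta, Xtil, y, lam):
    """Gradient and Hessian of the regularized negative log-likelihood.

    theta is the stacked (w_0, w) vector and Xtil is the data matrix with a
    leading column of 1s (a sketch, not the notebook's actual implementation).
    """
    mu = 1.0 / (1.0 + np.exp(-Xtil.dot(theta)))          # predicted probabilities
    penalty = lam * np.concatenate(([0.0], theta[1:]))   # w_0 is not penalized
    g = Xtil.T.dot(mu - y) + penalty
    M = np.diag(mu * (1.0 - mu))                         # diag of mu_i (1 - mu_i)
    reg = lam * np.eye(Xtil.shape[1])
    reg[0, 0] = 0.0                                      # again, no penalty on w_0
    H = Xtil.T.dot(M).dot(Xtil) + reg
    return g, H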

# Let $\boldsymbol\theta = (w_0, \mathbf{w})^\intercal$. Choose some initial estimate $\boldsymbol\theta_0$. Then, by Newton's method,
# 
# \begin{equation}
# \mathbf{H}\left(\boldsymbol\theta_k\right)\left(\boldsymbol\theta_{k+1} - \boldsymbol\theta_{k}\right) = -\mathbf{g}\left(\boldsymbol\theta_k\right).
# \end{equation}

# In[2]:

#logReg = LogisticRegression(solver='newton-cg', C=1) # sklearn implementation, only newton-cg converges
logReg = LogisticRegression(regularization = 1)
logReg.fit(train_data.drop('spam', axis = 1).as_matrix(), train_data['spam'].as_matrix())
print(logReg.intercept_)
print(logReg.coef_)
print(1 - logReg.score(train_data.drop('spam', axis = 1).as_matrix(), train_data['spam'].as_matrix()))
print(1 - logReg.score(test_data.drop('spam', axis = 1).as_matrix(), test_data['spam'].as_matrix()))

# The default settings work pretty well and give us a 7% misclassification rate. Let us try various transformations:
# 
# - Standardize, that is, $\displaystyle\frac{\mathbf{x}_j - \hat{\mu}_j}{\hat{\sigma}_j}$
# - Log, $\log\left(\mathbf{x}_j + 0.1\right)$
# - Binarize, $\mathbb{I}\left(\mathbf{x}_j > 0\right)$
# 
# You can see the transformations and the logistic regression class that I wrote by hand on [GitHub](https://github.com/ppham27/MLaPP-solutions/blob/master/chap08/classifiers.py); a rough sketch of the log and binarize helpers also appears after the next cell.

# In[3]:

from sklearn import preprocessing

# transform the data
ytrain = train_data['spam'].as_matrix()
ytest = test_data['spam'].as_matrix()
Xtrain_raw = train_data.drop('spam', axis = 1).as_matrix()
Xtest_raw = test_data.drop('spam', axis = 1).as_matrix()
Xtrain_standard = preprocessing.scale(Xtrain_raw, axis=0)
Xtest_standard = preprocessing.scale(Xtest_raw, axis=0)
Xtrain_log = np.apply_along_axis(transform_log, axis = 0, arr=Xtrain_raw)
Xtest_log = np.apply_along_axis(transform_log, axis = 0, arr=Xtest_raw)
Xtrain_binary = np.apply_along_axis(transform_binary, axis = 0, arr=Xtrain_raw)
Xtest_binary = np.apply_along_axis(transform_binary, axis = 0, arr=Xtest_raw)

data_transform = ['Raw', 'Standard', 'Log', 'Binary']
Xtrain = [Xtrain_raw, Xtrain_standard, Xtrain_log, Xtrain_binary]
Xtest = [Xtest_raw, Xtest_standard, Xtest_log, Xtest_binary]

## now run lots of models to find a good regularization parameter
regularization = np.linspace(0, 20, num=41)
misclassification_rates = pd.DataFrame(dtype=np.float64, index=np.arange(len(regularization)),
                                       columns=['Regularization']
                                       + list(map(lambda x: x + ' Train', data_transform))
                                       + list(map(lambda x: x + ' Test', data_transform)))
for i in range(len(regularization)):
    # .loc writes into the DataFrame directly (chained indexing may assign to a copy)
    misclassification_rates.loc[i, 'Regularization'] = regularization[i]
    if regularization[i] == 0:
        regularization[i] += 0.01  # hack when there's no convergence.
    logReg = LogisticRegression(regularization = regularization[i])
    for j in range(len(data_transform)):
        logReg.fit(Xtrain[j], ytrain)
        misclassification_rates.loc[i, data_transform[j] + ' Train'] = 1 - logReg.score(Xtrain[j], ytrain)
        misclassification_rates.loc[i, data_transform[j] + ' Test'] = 1 - logReg.score(Xtest[j], ytest)
misclassification_rates
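
# The helpers `transform_log` and `transform_binary` used above live in `classifiers.py` and are not shown in this notebook. The definitions below are only a plausible reconstruction from the formulas listed earlier, not the actual source.

# In[ ]:

# Hypothetical reconstructions of the transform helpers; the real code may differ.
def transform_log_sketch(col):
    """Log transform, log(x_j + 0.1), applied column-wise."""
    return np.log(col + 0.1)

def transform_binary_sketch(col):
    """Binarize, the indicator of x_j > 0, as floats."""
    return (col > 0).astype(np.float64)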

# Let's plot these results.

# In[4]:

import matplotlib.pyplot as plt

colors = ['#e41a1c', '#377eb8', '#4daf4a', '#984ea3']
plt.figure(figsize=(12, 8))
for i in range(len(data_transform)):
    # label each curve so the legend is populated
    plt.plot(misclassification_rates['Regularization'], misclassification_rates[data_transform[i] + ' Train'],
             color=colors[i], linestyle='--', linewidth=2, marker='.', label=data_transform[i] + ' Train')
    plt.plot(misclassification_rates['Regularization'], misclassification_rates[data_transform[i] + ' Test'],
             color=colors[i], linestyle='-', linewidth=2, marker='.', label=data_transform[i] + ' Test')
plt.grid(True)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), title="Dataset")
plt.ylabel("Misclassification Rate")
plt.xlabel(r"Regularization Parameter ($\lambda$)")
plt.title("Misclassification Rates for Various Transforms and Regularizations")
plt.show()

# The problem tells us to select the regularization parameter with cross-validation, but that seems unnecessary here since we already evaluate on a held-out test set. We see that the log transform performs best, with a misclassification rate of 5.8% when $\lambda = 8$.
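
# If we did want to choose the regularization strength by cross-validation, a sketch along the following lines would do it with scikit-learn's reference implementation, where `C` plays the role of (roughly) $1/\lambda$. The grid, fold count, and scoring choices here are assumptions, not part of the original exercise.

# In[ ]:

from sklearn.linear_model import LogisticRegression as SkLogisticRegression
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validation over a grid of C values on the log-transformed features.
param_grid = {'C': 1.0 / np.linspace(0.5, 20, num=40)}
grid = GridSearchCV(SkLogisticRegression(solver='newton-cg'), param_grid,
                    cv=5, scoring='accuracy')
grid.fit(Xtrain_log, ytrain)
print(grid.best_params_)
print(1 - grid.score(Xtest_log, ytest))  # misclassification rate on the test set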