#!/usr/bin/env python
# coding: utf-8

# # Logistic Regression
# 
# 
# In this notebook, we implement binary logistic regression from scratch in **`Python`/`Numpy`**. We first generate a two-class dataset, then derive the gradient of the cross-entropy loss, and finally implement and test our logistic regression algorithm.

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from util import print_result

sns.set()
get_ipython().run_line_magic('matplotlib', 'inline')


# ## Generate a Sample Dataset
# 
# First of all, let us generate a synthetic dataset. As you know, logistic regression is a binary classification algorithm, so the following code block generates a two-class dataset with two features.

# In[2]:


N = 250
np.random.seed(1024)

# Class 0: Gaussian blob centred at (-2, -2)
mean_cls_0 = np.array([-2, -2])
cov_cls_0 = np.array([[1.5, 0], [0, 1.5]])
x0 = np.random.multivariate_normal(mean=mean_cls_0, cov=cov_cls_0, size=N)
y0 = np.zeros(N)

# Class 1: Gaussian blob centred at (2, 2)
mean_cls_1 = np.array([2, 2])
cov_cls_1 = np.array([[1.5, 0], [0, 1.5]])
x1 = np.random.multivariate_normal(mean=mean_cls_1, cov=cov_cls_1, size=N)
y1 = np.ones(N)

X_train = np.vstack((x0, x1))
y_train = np.concatenate((y0, y1))

fig = plt.figure(figsize=(10, 6))
axes = plt.axes()
axes.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    s=45,
    cmap=plt.cm.Spectral,
    alpha=0.5,
    edgecolors='none')
axes.set_xlabel('first feature')
axes.set_ylabel('second feature')
axes.set_title('Sample Binary Classification Dataset')
plt.show()


# ## Derive Gradient
# 
# In logistic regression, we use the sigmoid function to squash responses between zero and one. Hence, we can write down the following probabilities.
# 
# $$P(t_n = 1|\mathbf{x_n}, \mathbf{w}) = \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}_n}}$$
# $$P(t_n = 0|\mathbf{x_n}, \mathbf{w}) = \frac{e^{-\mathbf{w}^T\mathbf{x}_n}}{1 + e^{-\mathbf{w}^T\mathbf{x}_n}}$$
# 
# Let $\mathbf{t} = (t_1, t_2, ..., t_N)$ and $\mathbf{X} = (\mathbf{x_1}, \mathbf{x_2}, ..., \mathbf{x_N})$ denote the full training dataset. Assuming the data points are i.i.d., we can write the following likelihood.
# 
# $$P(\mathbf{t}|\mathbf{X}, \mathbf{w}) = \prod_{n=1}^{N}\bigg[P(t_n = 1|\mathbf{x_n}, \mathbf{w})^{t_n}P(t_n = 0|\mathbf{x_n}, \mathbf{w})^{(1-t_n)}\bigg]$$
# 
# So our objective is to find the $\mathbf{w}$ that maximizes the above equation. We can simplify the optimization task by taking the $log()$ of the likelihood, which does not change the optimal value of $\mathbf{w}$. Also, instead of maximizing the log-likelihood, we minimize the negative log-likelihood $-log(P(\mathbf{t}|\mathbf{X}, \mathbf{w}))$. So our new objective function is:
# 
# $$-log(P(\mathbf{t}|\mathbf{X}, \mathbf{w})) = \sum_{n=1}^{N}-\bigg[log(P(t_n = 1|\mathbf{x_n}, \mathbf{w})^{t_n}) + log(P(t_n = 0|\mathbf{x_n}, \mathbf{w})^{(1-t_n)})\bigg]$$
# $$-log(P(\mathbf{t}|\mathbf{X}, \mathbf{w})) = \sum_{n=1}^{N}-\bigg[{t_n}log(P(t_n = 1|\mathbf{x_n}, \mathbf{w})) + (1-t_n)log(P(t_n = 0|\mathbf{x_n}, \mathbf{w}))\bigg]$$
# 
# Now let us introduce some new notation:
# $$\sigma_n = \frac{1}{1 + e^{-\mathbf{w}^T\mathbf{x}_n}}$$
# $$1 - \sigma_n = \frac{e^{-\mathbf{w}^T\mathbf{x}_n}}{1 + e^{-\mathbf{w}^T\mathbf{x}_n}}$$
# 
# $$-log(P(\mathbf{t}|\mathbf{X}, \mathbf{w})) = \sum_{n=1}^{N}-\bigg[{t_n}log(\sigma_n) + (1-t_n)log(1-\sigma_n)\bigg]$$
# 
# It is easy to show that $\frac{\partial \sigma_n}{\partial \mathbf{w}} = \sigma_n(1 -\sigma_n)\mathbf{x}_n$ and $\frac{\partial (1- \sigma_n)}{\partial \mathbf{w}} = -\frac{\partial \sigma_n}{\partial \mathbf{w}}$.
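# 
# As a quick, optional sanity check of the first identity, the small sketch below (an addition to the original derivation; the values of `w_check` and `x_check` are arbitrary) compares $\sigma_n(1 - \sigma_n)\mathbf{x}_n$ with a finite-difference approximation of $\frac{\partial \sigma_n}{\partial \mathbf{w}}$.

# In[ ]:


# Arbitrary weight vector and sample point for the numerical check.
w_check = np.array([0.3, -0.7])
x_check = np.array([1.2, 0.5])

sigma = 1 / (1 + np.exp(-np.dot(w_check, x_check)))
analytic_grad = sigma * (1 - sigma) * x_check

# Forward-difference approximation of the derivative w.r.t. each weight.
eps = 1e-6
numeric_grad = np.array([
    (1 / (1 + np.exp(-np.dot(w_check + eps * e, x_check))) - sigma) / eps
    for e in np.eye(2)
])

print(analytic_grad)
print(numeric_grad)  # should agree with analytic_grad to several decimals
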
# Substituting these identities and taking the derivative of the loss, we can show that:
# 
# $$\nabla_{\mathbf w}(-log(P(\mathbf{t}|\mathbf{X}, \mathbf{w}))) = \sum_{n=1}^{N}(\sigma_n - t_n)\mathbf{x_n}$$
# 
# Now we have everything we need to implement our own Logistic Regression classifier.

# ## Implementation

# In[3]:


class LogisticRegression:
    def __init__(self, learning_rate=1e-3, max_iter=250):
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.weights = None

    def fit(self, X_train, y_train):
        X_train = LogisticRegression._fit_intercept(X_train)
        self.weights = np.random.rand(X_train.shape[1])
        for iteration in range(self.max_iter):
            # Batch gradient descent: step against the gradient of the loss.
            self.weights = self.weights - self.learning_rate * LogisticRegression._grad(
                X_train, y_train, self.weights)
            cost = self._cost(X_train, y_train, self.weights)
            print_result(iteration, cost)

    def predict(self, X_test, cutoff=0.5):
        X_test = LogisticRegression._fit_intercept(X_test)
        prob = LogisticRegression.sigmoid(np.dot(X_test, self.weights))
        return prob, prob > cutoff

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-1 * x))

    @staticmethod
    def _cost(X_train, y_train, weights):
        """Mean cross-entropy loss over the training set."""
        cost = 0.0
        for n in range(len(y_train)):
            pos = (y_train[n] * np.log(
                LogisticRegression.sigmoid(np.inner(X_train[n, :], weights))))
            neg = ((1 - y_train[n]) * np.log(1 - LogisticRegression.sigmoid(
                np.inner(X_train[n, :], weights))))
            cct_cost = -1 * (pos + neg)
            cost += cct_cost / X_train.shape[0]
        return cost

    @staticmethod
    def _grad(X_train, y_train, weights):
        """Gradient of the summed cross-entropy loss w.r.t. the weights."""
        grad = np.zeros(X_train.shape[1])
        for n in range(len(y_train)):
            grad += (LogisticRegression.sigmoid(
                np.inner(X_train[n, :], weights)) - y_train[n]) * X_train[n, :]
        return grad

    @staticmethod
    def _fit_intercept(X_train):
        """Add the intercept term to the training dataset

        This simple helper method prepends a column of ones to the
        training dataset.

        Arguments:
            X_train: training dataset

        Returns:
            training dataset with the intercept column
        """
        return np.column_stack((np.ones(X_train.shape[0]), X_train))


# Now the Logistic Regression algorithm is ready. Let's train it using our synthetic 2-D dataset and plot the result.

# In[4]:


model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Evaluate the model on a dense grid so we can draw the decision regions.
x0_features = np.arange(X_train[:, 0].min() - 1, X_train[:, 0].max() + 1, 0.05)
x1_features = np.arange(X_train[:, 1].min() - 1, X_train[:, 1].max() + 1, 0.05)
x0_features, x1_features = np.meshgrid(x0_features, x1_features)

_, y_pred = model.predict(
    np.column_stack((x0_features.flatten(), x1_features.flatten())))
y_pred = y_pred.reshape(x0_features.shape)

_, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
ax1.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    s=45,
    cmap=plt.cm.Spectral,
    alpha=0.5,
    edgecolors='none')
ax2.contourf(x0_features, x1_features, y_pred, alpha=0.5, cmap=plt.cm.Spectral)
ax2.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    s=45,
    cmap=plt.cm.Spectral,
    alpha=0.5,
    edgecolors='none')
plt.show()


# ## Summary
# 
# In this notebook, we implemented Logistic Regression from scratch using Python/Numpy. We used maximum likelihood estimation to estimate the parameters of the model.
# 
# Actually, we can look at Logistic Regression from a Bayesian perspective as well. Later we will show how to do that from scratch :-).

# ## References and Further Reading
# 
# 1. https://www.youtube.com/watch?v=mz3j59aJBZQ [Really good introduction to Logistic Regression by Nando de Freitas]
# 
# 2. Pattern Recognition and Machine Learning, Christopher Bishop, https://www.amazon.com/Pattern-Recognition-Learning-Information-Statistics/dp/0387310738
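
# ## Appendix: Quick Prediction Check
# 
# As a small, optional sanity check (an addition to the original write-up), the cell below feeds two hand-picked points, one near each class mean used when generating the dataset, through the trained model. We would expect a probability close to 0 for the first point and close to 1 for the second.

# In[ ]:


# Two arbitrary test points near the class-0 and class-1 means respectively.
X_check = np.array([[-2.0, -2.0],
                    [2.0, 2.0]])

probs, labels = model.predict(X_check)
print('P(t=1):', probs)
print('predicted labels:', labels)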