#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# IPython line magic `%matplotlib inline`; only works when run under IPython.
get_ipython().run_line_magic("matplotlib", "inline")


# In[2]:


# Seed NumPy's global random generator so later random steps are repeatable.
np.random.seed(1)


# In[3]:


def probability_hist(probs):
    """Plot a relative-frequency histogram of ``probs`` on the current axes.

    Every observation is weighted by ``1 / len(probs)``, so the bar heights
    are fractions of the sample rather than raw counts. Both axes are then
    fixed to the range [0, 1].

    Parameters
    ----------
    probs : array-like
        Values to histogram (expected to lie in [0, 1], since the x-axis is
        clipped to that range).

    Returns
    -------
    None
        The function only draws through the ``pyplot`` state machine.
    """
    # NOTE: the previous version built an unused ``plt.Figure()`` here. That
    # constructs a detached Figure object which pyplot never draws on, so it
    # has been dropped; drawing still targets the current pyplot figure.
    weights = np.ones_like(probs) / float(len(probs))
    plt.hist(probs, weights=weights)
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    
def plot_roc_curve(fpr, tpr, roc_auc, lw=2):
    """Plot a ROC curve with a chance diagonal on the current axes.

    Parameters
    ----------
    fpr : array-like
        False positive rates, used as x coordinates.
    tpr : array-like
        True positive rates, used as y coordinates.
    roc_auc : float
        Area under the curve; shown in the legend with two decimals.
    lw : float, optional
        Line width of the ROC curve (default 2). The dashed diagonal is
        always drawn with width 2.

    Returns
    -------
    None
        The function only draws through the ``pyplot`` state machine.
    """
    # NOTE: the previous version had a no-op ``lw = lw`` and built an unused,
    # detached ``plt.Figure()``; neither affected the plot, so both are gone.
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
    # Reference line for a classifier with no discriminative power.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    # Small headroom above 1.0 so the curve's top edge stays visible.
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")


# In[4]:


from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

# Keep only the rows whose target is > 0 and the first two feature columns;
# the kept targets are shifted down by one so the labels start at 0.
X, y = X[y > 0, :2], y[y > 0] - 1


# In[5]:


# Mean of the labels (the share of label-1 rows when labels are 0/1).
y.mean()


# In[6]:


from sklearn import linear_model

# Fit on the full data and score the same rows (in-sample probabilities).
lg = linear_model.LogisticRegression()
lg.fit(X, y)
# Column 1 of predict_proba is the probability assigned to label 1.
probability = lg.predict_proba(X)[:, 1]


# In[7]:


# ROC points and area for the in-sample scores; thresholds are not needed.
fpr, tpr, _ = roc_curve(y_true=y, y_score=probability)
roc_auc = auc(x=fpr, y=tpr)

plot_roc_curve(fpr=fpr, tpr=tpr, roc_auc=roc_auc)


# In[8]:


def probability_histogram_class(probability, y):
    """Draw a 2x2 panel of per-class views of the predicted probabilities.

    Panels, drawn on the current pyplot figure:

    * top-left (221): stacked histogram of ``probability`` for ``y == 0``
      and ``y == 1``.
    * top-right (222): reverse-cumulative, density-normalised histogram of
      the ``y == 1`` probabilities.
    * bottom-right (224): the same for the ``y == 0`` probabilities.
    * bottom-left (223): ``1 - counts[0] / counts[1]`` per bin of the stacked
      histogram, plotted at each bin's left edge.

    All four panels share the x-range of the stacked histogram's bins.

    Parameters
    ----------
    probability : numpy.ndarray
        Predicted scores; indexed with boolean masks built from ``y``.
    y : numpy.ndarray
        Labels (0 or 1), aligned with ``probability``.

    Returns
    -------
    None
        The function only draws through the ``pyplot`` state machine.
    """
    negatives = probability[y == 0]
    positives = probability[y == 1]

    plt.subplot(221)
    counts, bins, _ = plt.hist([negatives, positives], stacked=True)
    # Reuse the stacked histogram's bin range for every panel.
    x_low, x_high = np.min(bins), np.max(bins)
    plt.xlim(x_low, x_high)
    plt.xticks([])

    # ``density=True`` replaces ``normed=True``, which was removed from
    # ``hist`` in Matplotlib 3.1 and now raises an error.
    plt.subplot(222)
    plt.hist(positives, cumulative=-1, density=True, color='tab:orange')
    plt.xlim(x_low, x_high)
    plt.xticks([])
    plt.ylim(0, 1)

    # The bare ``plt.xticks()`` that used to be here only queried the ticks
    # and changed nothing, so this panel simply keeps its default x ticks.
    plt.subplot(224)
    plt.hist(negatives, cumulative=-1, density=True, color='tab:blue')
    plt.xlim(x_low, x_high)
    plt.ylim(0, 1)

    plt.subplot(223)
    # With ``stacked=True`` the second row of ``counts`` is presumably the
    # running total over both classes (verify against the Matplotlib docs),
    # which would make this the per-bin share of label-1 rows. The
    # denominator is clamped to avoid division by zero; bins where
    # ``counts[0]`` is 0 therefore plot as 1.
    proportion = counts[0] / np.maximum(counts[1], 0.0001)
    plt.plot(bins[:-1], 1 - proportion)
    plt.xlim(x_low, x_high)
    plt.ylim(0, 1)


# In[9]:


# Four-panel per-class view of the scores fitted above.
probability_histogram_class(probability=probability, y=y)


# Now let's look at this data but this time with unbalanced classes.

# In[10]:


# Use every row this time, still restricted to the first two feature columns.
X = iris.data[:, :2]
# Cap the targets at 1, so every non-zero target becomes label 1.
y = np.minimum(iris.target, 1)


# In[11]:


# Number of labelled rows.
y.shape[0]


# In[12]:


# Mean of the 0/1 labels, i.e. the share of label-1 rows.
y.mean()


# In[13]:


# Refit the same estimator on the current X, y and score those same rows.
lg.fit(X, y)
probability = lg.predict_proba(X)[:, 1]


# In[14]:


# ROC points and area for the refitted scores; thresholds are not needed.
fpr, tpr, _ = roc_curve(y_true=y, y_score=probability)
roc_auc = auc(x=fpr, y=tpr)

plot_roc_curve(fpr=fpr, tpr=tpr, roc_auc=roc_auc)


# In[15]:


# Stacked 20-bin histogram of the scores: label-1 rows first, then label-0.
counts, _, _ = plt.hist(
    x=[probability[y == 1], probability[y == 0]],
    bins=20,
    stacked=True,
)


# In[16]:


# Four-panel per-class view of the scores from the refitted model.
probability_histogram_class(probability=probability, y=y)


# In[17]:


from sklearn.utils import shuffle

# Only X is passed to shuffle; y keeps its order, so feature rows no longer
# line up with their original labels.
X = shuffle(X)
lg.fit(X, y)
probability = lg.predict_proba(X)[:, 1]


# In[18]:


# ROC points and area for the model fitted on shuffled features.
fpr, tpr, _ = roc_curve(y_true=y, y_score=probability)
roc_auc = auc(x=fpr, y=tpr)

plot_roc_curve(fpr=fpr, tpr=tpr, roc_auc=roc_auc)


# In[19]:


# Four-panel per-class view of the scores from the shuffled-feature model.
probability_histogram_class(probability=probability, y=y)


# Beautiful! Again, the positive class appears uniformly throughout the probabilities, because shuffling the features prevents the model from finding pockets of the positive class.

# In[20]:


# Load the watermark IPython extension, then run it with the flags below,
# naming numpy, matplotlib and sklearn as the packages to report.
get_ipython().run_line_magic("load_ext", "watermark")
get_ipython().run_line_magic("watermark", "-v -m -p numpy,matplotlib,sklearn")