#!/usr/bin/env python
# coding: utf-8

# ## ## A/B Testing Part 2  :  Bootstrap resampling 
# 
# --------

# In[1]:


# ref 
# https://github.com/yennanliu/hackermath/blob/master/Module_2f_ABTesting.ipynb
# https://github.com/omoju/Fundamentals/blob/master/Data/data_Stats_4_ABTesting.ipynb


# In[2]:


# Import libraries
from __future__ import absolute_import, division, print_function

# Ignore warnings
import warnings
#warnings.filterwarnings('ignore')

import sys
sys.path.append('tools/')

# OP 
import pandas as pd
import numpy as np
#from scipy import stats
import scipy.stats as st
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('pylab', 'inline')
import seaborn  as sns 
#import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplt
from matplotlib import pyplot
import matplotlib as mpl
from IPython.display import display


# In[3]:


# set plot style
matplotlib.style.use('fivethirtyeight')
matplotlib.rcParams['font.size'] = 12
matplotlib.rcParams['figure.figsize'] = (10,10)


# In[69]:


# help func 

def axis_tick_frequency(ax, axis, freq):
    """The frequency of the y axis tick marks
        Attributes
        ----------
        ax: matplotlib axis object
        axis: char eithher 'y' or 'x'
        freq: int, the integer value of which the range moves
    """
    
    if axis == 'y':
        start, end = ax.get_ylim()
        ax.yaxis.set_ticks(np.arange(start, end, freq))
    elif axis == 'x':
        start, end = ax.get_xlim()
        ax.xaxis.set_ticks(np.arange(start, end, freq))
    else:
        raise ValueError('{argument} is not a valid axis object'.format(argument=repr(axis)))
        
        
def sample(num_sample, top, with_replacement=False):
    """
    Create a random sample from a table
    
    Attributes
    ---------
    num_sample: int
    top: dataframe
    with_replacement: boolean
    
    Returns a random subset of table index
    """
    df_index = []
    lst = np.arange(0, len(top), 1)

    for i in np.arange(0, num_sample, 1):

        # pick randomly from the whole table
        sample_index = np.random.choice(lst)

        if with_replacement:
            # store index
            df_index.append(sample_index)
        else:
            # remove the choice that was selected
            lst = np.setdiff1d(lst,[sample_index])
            df_index.append(sample_index)
            
    return df_index


def get_means(df, variable, classes):
    """
    Gets the means of a variable grouped by its class
    
    Attributes
    -------------
    df: a pandas dataframe
    variable: column
    classes: column (bool) 
    """
    class_a = df[classes] == True
    class_b = df[classes] == False

    df_class_b = df.ix[df[class_b].index, :]
    df_class_a = df.ix[df[class_a].index, :]

    a = df_class_b[variable].values
    b = df_class_a[variable].values
    
    # difference in the means
    a.mean() - b.mean()
    
    raw = {
        classes: [False, True],
        variable: [a.mean(), b.mean()]
    }
    means_table = pd.DataFrame(raw)
    
    return means_table


def bootstrap_ci_means(table, variable, classes, repetitions):

    """Bootstrap approximate 95% confidence interval
    for the difference between the means of the two classes
    in the population
    
    Attributes
    -------------
    table: a pandas dataframe
    variable: column
    classes: column (bool) 
    repetitions: int
    """

    t = table[[variable, classes]]

    mean_diffs = []
    for i in np.arange(repetitions):
        bootstrap_sampl = table.ix[sample(len(table), table, with_replacement=True), :]
        m_tbl = get_means(bootstrap_sampl, variable, classes)
        new_stat = m_tbl.ix[0, variable] - m_tbl.ix[1, variable]
        mean_diffs = np.append(mean_diffs, new_stat)

    left = np.percentile(mean_diffs, 2.5) 
    right = np.percentile(mean_diffs, 97.5)

    # Find the observed test statistic
    means_table = get_means(t, variable, classes)
    obs_stat = means_table.ix[0, variable] - means_table.ix[1, variable]

    df = pd.DataFrame()
    df['Difference Between Means'] = mean_diffs
    df.plot.hist(bins=20, normed=True)
    plot([left, right], [0, 0], color='yellow', lw=8);
    print('Observed difference between means:', obs_stat)
    print('Approximate 95% CI for the difference between means:')
    print(left, 'to', right)

    
# ## 0) Load data

# In[14]:


# titanic dataset 
# https://www.kaggle.com/c/titanic/data#

df=pd.read_csv('train.csv')


# In[8]:


df.head(1)


# In[9]:


df.hist()


# ## 1 ) Is "survived or not survived" associated  with fare  ?

# In[10]:


df.Survived.value_counts()


# In[21]:


df[df.Survived==1.0 ].head(1)


# In[32]:


# visualize Fare on Survived & non-Survived group 
df[df.Survived==1.0 ]['Fare'].hist(alpha=.5,bins=np.arange(0, 300, 10))
df[df.Survived==0.0 ]['Fare'].hist(alpha=.5,bins=np.arange(0, 300, 10))
plt.legend(["survived","non-survived" ])
pyplt.ylabel("percent per fare")
pyplt.xlabel("Fare")
plt.title('Fare histgram : Survived VS. non-Survived group  ')


# Both distributions are skew shaped while the "survived" group is 
# with median ~ fare 26.0 ; the "non-survived" group is with median ~ fare 10.5. We can nearly say  *** The distributions are not identical. ***
# But then it raises another question : *** whether the difference ("survived" and "non-survived") reflects just chance variation or a difference in the distributions in the population. ***
# 
# 
# So we have set up "Hypothesis Test"  
# 
# > * Null hypothesis (N0):  The difference in the sample is due to chance.
# i.e. The distribution of fare of "survived group" is the same for "non survived group" 
# > * Alternative hypothesis(N1): The two distributions are different in the population.
# > * Test statistic: T test
# 

# ## 2) Hypothesis Test

# In[33]:


a = df[df.Survived==1.0 ]['Fare'].values
b = df[df.Survived==0.0 ]['Fare'].values


# In[39]:


# mean value table 

raw = {
        'Survived': [True, False],
        'Fare mean': [a.mean(), b.mean()]
}
means_table = pd.DataFrame(raw)
means_table


# In[36]:


# difference in the means
a.mean() - b.mean()


# In[37]:


# t-test 
statistic, pvalue = st.ttest_ind(a, b)
print ('T statistic: %.2f'%statistic,'\nP-value:%.2f'%pvalue)


# > The P-value (0.00) is super small. As a result, we can reject the Null hypothesis (N0) and conclude that in the population, the distribution of 
# "survived" and "non-survived" are different.
# 
# 
# 

# ## 3) Bootstrap Confidence Interval For the Difference

# Our hypothesis Test (via t-test) conclude that the 2 distributions 
# are 2 different with very high possibility.
# But, in general cases, the difference above maybe is due to "randomness".
# To avoid the "randomness affects", we need to generate more samples
# via the the "Bootstrap resampling" tricks here. (It simply replicates the original random sample and computes new values of the statistic.)
# 
# PS. However maybe it not 100% sense to re-sample on the titanic dataset,
# since they may only happen once lol.
# 
# 
# 

# In[49]:


# !pip install scikits.bootstrap
import scikits.bootstrap as bootstrap  
import scipy


# In[58]:


# compute 95% confidence intervals around the mean  

### drop the nan here, since the scikits.bootstrap seems doesn't work with nan
### will fix later 

df_ = df[['Age', 'Survived']].dropna()
CIs = bootstrap.ci(df_[['Age', 'Survived']], scipy.mean) 
print ("Bootstrapped 95% confidence interval around the mean\nLow:",  CIs[0], "\nHigh:", CIs[1])


# In[61]:


print ('all data :  ', len(df))
print ('non null (Age) data :  ' , len(df_))


# In[62]:


# bootstrap 5000 samples instead of only 1174  
CIs = bootstrap.ci(df_[['Age', 'Survived']], scipy.mean, n_samples=5000)  

print ("Bootstrapped 95% confidence interval with 5,000 samples\nLow:",  CIs[0], "\nHigh:", CIs[1])


# In[70]:


bootstrap_ci_means(df_, 'Age', 'Survived', 5000)


# In[72]:


bootstrap_ci_means(df_, 'Age', 'Survived', 5000)


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[ ]:


# In[71]:


df.head(2)