#!/usr/bin/env python # coding: utf-8 # ## ## A/B Testing Part 2 : Bootstrap resampling # # -------- # In[1]: # ref # https://github.com/yennanliu/hackermath/blob/master/Module_2f_ABTesting.ipynb # https://github.com/omoju/Fundamentals/blob/master/Data/data_Stats_4_ABTesting.ipynb # In[2]: # Import libraries from __future__ import absolute_import, division, print_function # Ignore warnings import warnings #warnings.filterwarnings('ignore') import sys sys.path.append('tools/') # OP import pandas as pd import numpy as np #from scipy import stats import scipy.stats as st get_ipython().run_line_magic('matplotlib', 'inline') get_ipython().run_line_magic('pylab', 'inline') import seaborn as sns #import matplotlib.pyplot as plt import matplotlib.pyplot as pyplt from matplotlib import pyplot import matplotlib as mpl from IPython.display import display # In[3]: # set plot style matplotlib.style.use('fivethirtyeight') matplotlib.rcParams['font.size'] = 12 matplotlib.rcParams['figure.figsize'] = (10,10) # In[69]: # help func def axis_tick_frequency(ax, axis, freq): """The frequency of the y axis tick marks Attributes ---------- ax: matplotlib axis object axis: char eithher 'y' or 'x' freq: int, the integer value of which the range moves """ if axis == 'y': start, end = ax.get_ylim() ax.yaxis.set_ticks(np.arange(start, end, freq)) elif axis == 'x': start, end = ax.get_xlim() ax.xaxis.set_ticks(np.arange(start, end, freq)) else: raise ValueError('{argument} is not a valid axis object'.format(argument=repr(axis))) def sample(num_sample, top, with_replacement=False): """ Create a random sample from a table Attributes --------- num_sample: int top: dataframe with_replacement: boolean Returns a random subset of table index """ df_index = [] lst = np.arange(0, len(top), 1) for i in np.arange(0, num_sample, 1): # pick randomly from the whole table sample_index = np.random.choice(lst) if with_replacement: # store index df_index.append(sample_index) else: # remove the choice that was selected lst = np.setdiff1d(lst,[sample_index]) df_index.append(sample_index) return df_index def get_means(df, variable, classes): """ Gets the means of a variable grouped by its class Attributes ------------- df: a pandas dataframe variable: column classes: column (bool) """ class_a = df[classes] == True class_b = df[classes] == False df_class_b = df.ix[df[class_b].index, :] df_class_a = df.ix[df[class_a].index, :] a = df_class_b[variable].values b = df_class_a[variable].values # difference in the means a.mean() - b.mean() raw = { classes: [False, True], variable: [a.mean(), b.mean()] } means_table = pd.DataFrame(raw) return means_table def bootstrap_ci_means(table, variable, classes, repetitions): """Bootstrap approximate 95% confidence interval for the difference between the means of the two classes in the population Attributes ------------- table: a pandas dataframe variable: column classes: column (bool) repetitions: int """ t = table[[variable, classes]] mean_diffs = [] for i in np.arange(repetitions): bootstrap_sampl = table.ix[sample(len(table), table, with_replacement=True), :] m_tbl = get_means(bootstrap_sampl, variable, classes) new_stat = m_tbl.ix[0, variable] - m_tbl.ix[1, variable] mean_diffs = np.append(mean_diffs, new_stat) left = np.percentile(mean_diffs, 2.5) right = np.percentile(mean_diffs, 97.5) # Find the observed test statistic means_table = get_means(t, variable, classes) obs_stat = means_table.ix[0, variable] - means_table.ix[1, variable] df = pd.DataFrame() df['Difference Between Means'] = mean_diffs df.plot.hist(bins=20, normed=True) plot([left, right], [0, 0], color='yellow', lw=8); print('Observed difference between means:', obs_stat) print('Approximate 95% CI for the difference between means:') print(left, 'to', right) # ## 0) Load data # In[14]: # titanic dataset # https://www.kaggle.com/c/titanic/data# df=pd.read_csv('train.csv') # In[8]: df.head(1) # In[9]: df.hist() # ## 1 ) Is "survived or not survived" associated with fare ? # In[10]: df.Survived.value_counts() # In[21]: df[df.Survived==1.0 ].head(1) # In[32]: # visualize Fare on Survived & non-Survived group df[df.Survived==1.0 ]['Fare'].hist(alpha=.5,bins=np.arange(0, 300, 10)) df[df.Survived==0.0 ]['Fare'].hist(alpha=.5,bins=np.arange(0, 300, 10)) plt.legend(["survived","non-survived" ]) pyplt.ylabel("percent per fare") pyplt.xlabel("Fare") plt.title('Fare histgram : Survived VS. non-Survived group ') # Both distributions are skew shaped while the "survived" group is # with median ~ fare 26.0 ; the "non-survived" group is with median ~ fare 10.5. We can nearly say *** The distributions are not identical. *** # But then it raises another question : *** whether the difference ("survived" and "non-survived") reflects just chance variation or a difference in the distributions in the population. *** # # # So we have set up "Hypothesis Test" # # > * Null hypothesis (N0): The difference in the sample is due to chance. # i.e. The distribution of fare of "survived group" is the same for "non survived group" # > * Alternative hypothesis(N1): The two distributions are different in the population. # > * Test statistic: T test # # ## 2) Hypothesis Test # In[33]: a = df[df.Survived==1.0 ]['Fare'].values b = df[df.Survived==0.0 ]['Fare'].values # In[39]: # mean value table raw = { 'Survived': [True, False], 'Fare mean': [a.mean(), b.mean()] } means_table = pd.DataFrame(raw) means_table # In[36]: # difference in the means a.mean() - b.mean() # In[37]: # t-test statistic, pvalue = st.ttest_ind(a, b) print ('T statistic: %.2f'%statistic,'\nP-value:%.2f'%pvalue) # > The P-value (0.00) is super small. As a result, we can reject the Null hypothesis (N0) and conclude that in the population, the distribution of # "survived" and "non-survived" are different. # # # # ## 3) Bootstrap Confidence Interval For the Difference # Our hypothesis Test (via t-test) conclude that the 2 distributions # are 2 different with very high possibility. # But, in general cases, the difference above maybe is due to "randomness". # To avoid the "randomness affects", we need to generate more samples # via the the "Bootstrap resampling" tricks here. (It simply replicates the original random sample and computes new values of the statistic.) # # PS. However maybe it not 100% sense to re-sample on the titanic dataset, # since they may only happen once lol. # # # # In[49]: # !pip install scikits.bootstrap import scikits.bootstrap as bootstrap import scipy # In[58]: # compute 95% confidence intervals around the mean ### drop the nan here, since the scikits.bootstrap seems doesn't work with nan ### will fix later df_ = df[['Age', 'Survived']].dropna() CIs = bootstrap.ci(df_[['Age', 'Survived']], scipy.mean) print ("Bootstrapped 95% confidence interval around the mean\nLow:", CIs[0], "\nHigh:", CIs[1]) # In[61]: print ('all data : ', len(df)) print ('non null (Age) data : ' , len(df_)) # In[62]: # bootstrap 5000 samples instead of only 1174 CIs = bootstrap.ci(df_[['Age', 'Survived']], scipy.mean, n_samples=5000) print ("Bootstrapped 95% confidence interval with 5,000 samples\nLow:", CIs[0], "\nHigh:", CIs[1]) # In[70]: bootstrap_ci_means(df_, 'Age', 'Survived', 5000) # In[72]: bootstrap_ci_means(df_, 'Age', 'Survived', 5000) # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[ ]: # In[71]: df.head(2)