# ref
# https://github.com/yennanliu/hackermath/blob/master/Module_2f_ABTesting.ipynb
# https://github.com/omoju/Fundamentals/blob/master/Data/data_Stats_4_ABTesting.ipynb
# Import libraries
from __future__ import absolute_import, division, print_function
# Ignore warnings
import warnings
#warnings.filterwarnings('ignore')
import sys
sys.path.append('tools/')
# OP
import pandas as pd
import numpy as np
#from scipy import stats
import scipy.stats as st
%matplotlib inline
%pylab inline
import seaborn as sns
#import matplotlib.pyplot as plt
import matplotlib.pyplot as pyplt
from matplotlib import pyplot
import matplotlib as mpl
from IPython.display import display
Populating the interactive namespace from numpy and matplotlib
# set plot style
matplotlib.style.use('fivethirtyeight')
matplotlib.rcParams['font.size'] = 12
matplotlib.rcParams['figure.figsize'] = (10,10)
# help func
def axis_tick_frequency(ax, axis, freq):
"""The frequency of the y axis tick marks
Attributes
----------
ax: matplotlib axis object
axis: char eithher 'y' or 'x'
freq: int, the integer value of which the range moves
"""
if axis == 'y':
start, end = ax.get_ylim()
ax.yaxis.set_ticks(np.arange(start, end, freq))
elif axis == 'x':
start, end = ax.get_xlim()
ax.xaxis.set_ticks(np.arange(start, end, freq))
else:
raise ValueError('{argument} is not a valid axis object'.format(argument=repr(axis)))
def sample(num_sample, top, with_replacement=False):
"""
Create a random sample from a table
Attributes
---------
num_sample: int
top: dataframe
with_replacement: boolean
Returns a random subset of table index
"""
df_index = []
lst = np.arange(0, len(top), 1)
for i in np.arange(0, num_sample, 1):
# pick randomly from the whole table
sample_index = np.random.choice(lst)
if with_replacement:
# store index
df_index.append(sample_index)
else:
# remove the choice that was selected
lst = np.setdiff1d(lst,[sample_index])
df_index.append(sample_index)
return df_index
def get_means(df, variable, classes):
"""
Gets the means of a variable grouped by its class
Attributes
-------------
df: a pandas dataframe
variable: column
classes: column (bool)
"""
class_a = df[classes] == True
class_b = df[classes] == False
df_class_b = df.ix[df[class_b].index, :]
df_class_a = df.ix[df[class_a].index, :]
a = df_class_b[variable].values
b = df_class_a[variable].values
# difference in the means
a.mean() - b.mean()
raw = {
classes: [False, True],
variable: [a.mean(), b.mean()]
}
means_table = pd.DataFrame(raw)
return means_table
def bootstrap_ci_means(table, variable, classes, repetitions):
"""Bootstrap approximate 95% confidence interval
for the difference between the means of the two classes
in the population
Attributes
-------------
table: a pandas dataframe
variable: column
classes: column (bool)
repetitions: int
"""
t = table[[variable, classes]]
mean_diffs = []
for i in np.arange(repetitions):
bootstrap_sampl = table.ix[sample(len(table), table, with_replacement=True), :]
m_tbl = get_means(bootstrap_sampl, variable, classes)
new_stat = m_tbl.ix[0, variable] - m_tbl.ix[1, variable]
mean_diffs = np.append(mean_diffs, new_stat)
left = np.percentile(mean_diffs, 2.5)
right = np.percentile(mean_diffs, 97.5)
# Find the observed test statistic
means_table = get_means(t, variable, classes)
obs_stat = means_table.ix[0, variable] - means_table.ix[1, variable]
df = pd.DataFrame()
df['Difference Between Means'] = mean_diffs
df.plot.hist(bins=20, normed=True)
plot([left, right], [0, 0], color='yellow', lw=8);
print('Observed difference between means:', obs_stat)
print('Approximate 95% CI for the difference between means:')
print(left, 'to', right)
# titanic dataset
# https://www.kaggle.com/c/titanic/data#
df=pd.read_csv('train.csv')
df.head(1)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.25 | NaN | S |
df.hist()
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x116924a58>, <matplotlib.axes._subplots.AxesSubplot object at 0x11697cb70>, <matplotlib.axes._subplots.AxesSubplot object at 0x1169a6320>], [<matplotlib.axes._subplots.AxesSubplot object at 0x1169af160>, <matplotlib.axes._subplots.AxesSubplot object at 0x1169e3e10>, <matplotlib.axes._subplots.AxesSubplot object at 0x116a0b940>], [<matplotlib.axes._subplots.AxesSubplot object at 0x116a2bfd0>, <matplotlib.axes._subplots.AxesSubplot object at 0x116a54630>, <matplotlib.axes._subplots.AxesSubplot object at 0x116a73240>]], dtype=object)
df.Survived.value_counts()
0 549 1 342 Name: Survived, dtype: int64
df[df.Survived==1.0 ].head(1)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
# visualize Fare on Survived & non-Survived group
df[df.Survived==1.0 ]['Fare'].hist(alpha=.5,bins=np.arange(0, 300, 10))
df[df.Survived==0.0 ]['Fare'].hist(alpha=.5,bins=np.arange(0, 300, 10))
plt.legend(["survived","non-survived" ])
pyplt.ylabel("percent per fare")
pyplt.xlabel("Fare")
plt.title('Fare histgram : Survived VS. non-Survived group ')
Text(0.5,1,'Fare histgram : Survived VS. non-Survived group ')
Both distributions are skew shaped while the "survived" group is with median ~ fare 26.0 ; the "non-survived" group is with median ~ fare 10.5. We can nearly say *** The distributions are not identical. *** But then it raises another question : *** whether the difference ("survived" and "non-survived") reflects just chance variation or a difference in the distributions in the population. ***
So we have set up "Hypothesis Test"
- Null hypothesis (N0): The difference in the sample is due to chance.
i.e. The distribution of fare of "survived group" is the same for "non survived group"
- Alternative hypothesis(N1): The two distributions are different in the population.
- Test statistic: T test
a = df[df.Survived==1.0 ]['Fare'].values
b = df[df.Survived==0.0 ]['Fare'].values
# mean value table
raw = {
'Survived': [True, False],
'Fare mean': [a.mean(), b.mean()]
}
means_table = pd.DataFrame(raw)
means_table
Fare mean | Survived | |
---|---|---|
0 | 48.395408 | True |
1 | 22.117887 | False |
# difference in the means
a.mean() - b.mean()
26.277520717093282
# t-test
statistic, pvalue = st.ttest_ind(a, b)
print ('T statistic: %.2f'%statistic,'\nP-value:%.2f'%pvalue)
T statistic: 7.94 P-value:0.00
The P-value (0.00) is super small. As a result, we can reject the Null hypothesis (N0) and conclude that in the population, the distribution of
"survived" and "non-survived" are different.
Our hypothesis Test (via t-test) conclude that the 2 distributions are 2 different with very high possibility. But, in general cases, the difference above maybe is due to "randomness". To avoid the "randomness affects", we need to generate more samples via the the "Bootstrap resampling" tricks here. (It simply replicates the original random sample and computes new values of the statistic.)
PS. However maybe it not 100% sense to re-sample on the titanic dataset, since they may only happen once lol.
# !pip install scikits.bootstrap
import scikits.bootstrap as bootstrap
import scipy
# compute 95% confidence intervals around the mean
### drop the nan here, since the scikits.bootstrap seems doesn't work with nan
### will fix later
df_ = df[['Age', 'Survived']].dropna()
CIs = bootstrap.ci(df_[['Age', 'Survived']], scipy.mean)
print ("Bootstrapped 95% confidence interval around the mean\nLow:", CIs[0], "\nHigh:", CIs[1])
Bootstrapped 95% confidence interval around the mean Low: 14.5346638655 High: 15.6042787115
print ('all data : ', len(df))
print ('non null (Age) data : ' , len(df_))
all data : 891 non null (Age) data : 714
# bootstrap 5000 samples instead of only 1174
CIs = bootstrap.ci(df_[['Age', 'Survived']], scipy.mean, n_samples=5000)
print ("Bootstrapped 95% confidence interval with 5,000 samples\nLow:", CIs[0], "\nHigh:", CIs[1])
Bootstrapped 95% confidence interval with 5,000 samples Low: 14.5130182073 High: 15.5919677871
bootstrap_ci_means(df_, 'Age', 'Survived', 5000)
/Users/yliu/anaconda3/envs/zip_dev/lib/python3.5/site-packages/ipykernel_launcher.py:69: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated /Users/yliu/anaconda3/envs/zip_dev/lib/python3.5/site-packages/ipykernel_launcher.py:107: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated /Users/yliu/anaconda3/envs/zip_dev/lib/python3.5/site-packages/ipykernel_launcher.py:115: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
Observed difference between means: 2.28248959011 Approximate 95% CI for the difference between means: -1.56344938156 to 6.38324598383
bootstrap_ci_means(df_, 'Age', 'Survived', 5000)
/Users/yliu/anaconda3/envs/zip_dev/lib/python3.5/site-packages/ipykernel_launcher.py:69: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated /Users/yliu/anaconda3/envs/zip_dev/lib/python3.5/site-packages/ipykernel_launcher.py:107: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated /Users/yliu/anaconda3/envs/zip_dev/lib/python3.5/site-packages/ipykernel_launcher.py:115: DeprecationWarning: .ix is deprecated. Please use .loc for label based indexing or .iloc for positional indexing See the documentation here: http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
Observed difference between means: 2.28248959011 Approximate 95% CI for the difference between means: -1.51827737087 to 6.43301059337
df.head(2)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |