%matplotlib inline 

from scipy import stats
from scipy.stats import  bernoulli
b=bernoulli(.5) # fair coin distribution
nsamples = 100
xs = b.rvs(nsamples*200).reshape(nsamples,-1) # flip it nsamples times for 200 estimates
phat = mean(xs,axis=0) # estimated p
epsilon_n=sqrt(np.log(2/0.05)/2/nsamples) # edge of 95% confidence interval
print '--Interval trapped correct value ',np.logical_and(phat-epsilon_n<=0.5, 0.5 <= (epsilon_n +phat)).mean()*100,'% of the time'

se=sqrt(phat*(1-phat)/xs.shape[0]) # compute estimated se for all trials
rv=stats.norm(0, se[0]) # generate random variable for trial 0
array(rv.interval(0.95))+phat[0] # compute 95% confidence interval for that trial 0

def compute_CI(i):
    return stats.norm.interval(0.95,loc=i,scale=sqrt(i*(1-i)/xs.shape[0]))
lower,upper = compute_CI(phat)

fig,ax=subplots()
fig.set_size_inches((10,3))
ax.axis(ymin=0.2,ymax=0.9,xmax=100)
ax.plot(upper,label='upper asymptotic',lw=2.)
ax.plot(lower,label='lower asymptotic',lw=2.,color='b')
ax.plot(phat,'o',color='gray',label='point estimate')
ax.plot(phat+epsilon_n,label='Hoeffding upper',color='g')
ax.plot(phat-epsilon_n,label='Hoeffding lower',color='g')
ax.set_xlabel('trial index')
ax.set_ylabel('value of estimate')
ax.legend(loc=(1,0))

fig,ax=subplots()
xi=linspace(-3,3,100)
ax.plot(xi,stats.norm.pdf(xi))
ax.fill_between(xi[17:-17],stats.norm.pdf(xi[17:-17]),alpha=.3)
ax.text(-1,0.15,'95% probability',fontsize=18)

# resample with replacement
bs=[np.random.choice(xs[:,0],size=len(xs[:,0])).mean() for i in range(100)]

# use kernel density estimate to approximate empirical PDF
from scipy.stats import gaussian_kde
kbs=gaussian_kde(bs) # kernel density estimate
fig,ax=subplots()
ax.hist(bs,20,normed=True,alpha=.3);
i=linspace(.25,.7,100)
ax.plot(i,kbs.evaluate(i),lw=3.,label='kernel density\nestimate')
ax.vlines(phat[0],0,12,lw=4.,linestyle='--')
ax.legend()

delta=.1
kbs.integrate_box(phat[0]-delta,phat[0]+delta)

from scipy.optimize import fsolve
delta=fsolve(lambda delta:0.95-kbs.integrate_box(phat[0]-delta,phat[0]+delta) ,0.1)[0]

fig,ax=subplots()
ax.hist(bs,20,normed=True,alpha=.3);
i=linspace(.25,.95,100)
ax.plot(i,kbs.evaluate(i),lw=3.,label='kernel density\nestimate')
ax.vlines(phat[0],0,12,lw=4.,linestyle='--')
ax.vlines(phat[0]+delta,0,12,lw=4.,linestyle='--',color='gray')
ax.vlines(phat[0]-delta,0,12,lw=4.,linestyle='--',color='gray')
ii=i[np.where(logical_and(i < phat[0]+delta ,i>phat[0]-delta ))]
ax.fill_between(ii,kbs.evaluate(ii),alpha=.3,color='m')
ax.legend()

def compute_bootstrap_CI(x,nboot=100):
    phat = x.mean()
    bs=[np.random.choice(x,size=len(xs)).mean() for i in range(nboot)]
    kbs=gaussian_kde(bs) # kernel density estimate
    delta=fsolve(lambda delta:0.95-kbs.integrate_box(phat-delta,phat+delta) ,0.1)[0]
    return (phat-delta,phat+delta)

# compute bootstrap confidence intervals
upper_b,lower_b=zip(*[ compute_bootstrap_CI(xs[:,i]) for i in range(xs.shape[1]) ])

fig,ax=subplots()
fig.set_size_inches((10,3))
ax.axis(ymin=0.2,ymax=0.9,xmax=100)
ax.plot(upper,label='upper asymptotic',lw=2.)
ax.plot(lower,label='lower asymptotic',lw=2.,color='b')
ax.plot(phat,'o',color='gray',label='point estimate')
ax.plot(phat+epsilon_n,label='Hoeffding upper',color='g')
ax.plot(phat-epsilon_n,label='Hoeffding lower',color='g')
ax.plot(upper_b,label='upper bootstrap',lw=2.,color='m')
ax.plot(lower_b,label='lower bootstrap',lw=2.,color='m')
ax.set_xlabel('trial index')
ax.set_ylabel('value of estimate')
ax.legend(loc=(1,0))

# example from Good's "Common Errors in Statistics" book p.19 (which is, ironically, wrong)
from pandas import DataFrame
df=DataFrame(index=('male','female'))
df['survived']=(9,4)
df['died']=(1,10)
df

import itertools as it
import combinatorics # from pypi.org
from collections import Counter

patients = ['M']*10 + ['F']*14
sample= np.random.permutation(patients)[:13] # use the first 13 slots for survivors
print sample
print Counter(sample)

foo = lambda i: len(set(range(10)) & set( i[0] )) # count males in first group of 10 males total
o=[foo(i) for i in combinatorics.labeled_balls_in_unlabeled_boxes(24,[13,11]) ]

hist(o,10,align='left')
title('Surviving males under $H_0$')
xlabel('number of surviving males')
axis(xmin=0);

# probability of observing 9 male survivors under H_0 is the following
co=Counter(o)
print 'p-value= ',(co[9]+co[10])/sum(co.values())
print 'p-value= ',(co[0]+co[1]+co[2]+co[9]+co[10])/sum(co.values())

#using one-way chi-squared proportion test
from scipy import stats
print stats.chisquare( [9,4],[6.5,6.5])
print stats.fisher_exact(df.values)