# %matplotlib inline  -- IPython magic; kept as a comment so the file is valid Python
"""Coverage of 95% confidence intervals for the mean of Bernoulli(1/2) samples.

Runs 200 independent trials of nsamples coin flips each and compares three
interval constructions per trial: the Hoeffding bound, the asymptotic
(normal-approximation) interval, and a bootstrap interval obtained by
fitting a kernel density estimate to resampled means.
"""
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import bernoulli, gaussian_kde
from scipy.optimize import fsolve

b = bernoulli(0.5)                       # fair-coin distribution
nsamples = 100
# nsamples flips per trial, 200 independent trials (one trial per column)
xs = b.rvs(nsamples * 200).reshape(nsamples, -1)
phat = xs.mean(axis=0)                   # per-trial estimate of p
# Hoeffding half-width for a 95% interval: sqrt(log(2/alpha) / (2 n))
epsilon_n = np.sqrt(np.log(2 / 0.05) / 2 / nsamples)
print('--Interval trapped correct value ',
      np.logical_and(phat - epsilon_n <= 0.5,
                     0.5 <= (epsilon_n + phat)).mean() * 100,
      '% of the time')

# Asymptotic interval: estimated standard error for every trial
se = np.sqrt(phat * (1 - phat) / xs.shape[0])
rv = stats.norm(0, se[0])                # sampling distribution for trial 0
np.array(rv.interval(0.95)) + phat[0]    # 95% confidence interval, trial 0


def compute_CI(i):
    """Return the (lower, upper) 95% normal-approximation CI for estimate(s) ``i``."""
    return stats.norm.interval(0.95, loc=i,
                               scale=np.sqrt(i * (1 - i) / xs.shape[0]))


lower, upper = compute_CI(phat)

fig, ax = plt.subplots()
fig.set_size_inches((10, 3))
ax.axis(ymin=0.2, ymax=0.9, xmax=100)
ax.plot(upper, label='upper asymptotic', lw=2.)
ax.plot(lower, label='lower asymptotic', lw=2., color='b')
ax.plot(phat, 'o', color='gray', label='point estimate')
ax.plot(phat + epsilon_n, label='Hoeffding upper', color='g')
ax.plot(phat - epsilon_n, label='Hoeffding lower', color='g')
ax.set_xlabel('trial index')
ax.set_ylabel('value of estimate')
ax.legend(loc=(1, 0))

# Shade the central 95% mass of the standard normal for illustration
fig, ax = plt.subplots()
xi = np.linspace(-3, 3, 100)
ax.plot(xi, stats.norm.pdf(xi))
ax.fill_between(xi[17:-17], stats.norm.pdf(xi[17:-17]), alpha=.3)
ax.text(-1, 0.15, '95% probability', fontsize=18)

# Bootstrap: resample trial 0 with replacement, collect the resampled means
bs = [np.random.choice(xs[:, 0], size=len(xs[:, 0])).mean() for i in range(100)]
kbs = gaussian_kde(bs)                   # kernel density estimate of the means
fig, ax = plt.subplots()
# density= replaces the long-deprecated (now removed) normed= kwarg
ax.hist(bs, 20, density=True, alpha=.3)
i = np.linspace(.25, .7, 100)
ax.plot(i, kbs.evaluate(i), lw=3., label='kernel density\nestimate')
ax.vlines(phat[0], 0, 12, lw=4., linestyle='--')
ax.legend()

delta = .1
kbs.integrate_box(phat[0] - delta, phat[0] + delta)
# Solve for the half-width whose KDE mass around phat[0] equals 95%
delta = fsolve(lambda d: 0.95 - kbs.integrate_box(phat[0] - d, phat[0] + d),
               0.1)[0]

fig, ax = plt.subplots()
ax.hist(bs, 20, density=True, alpha=.3)
i = np.linspace(.25, .95, 100)
ax.plot(i, kbs.evaluate(i), lw=3., label='kernel density\nestimate')
ax.vlines(phat[0], 0, 12, lw=4., linestyle='--')
ax.vlines(phat[0] + delta, 0, 12, lw=4., linestyle='--', color='gray')
ax.vlines(phat[0] - delta, 0, 12, lw=4., linestyle='--', color='gray')
ii = i[np.where(np.logical_and(i < phat[0] + delta, i > phat[0] - delta))]
ax.fill_between(ii, kbs.evaluate(ii), alpha=.3, color='m')
ax.legend()


def compute_bootstrap_CI(x, nboot=100):
    """Return a (lower, upper) 95% bootstrap CI for the mean of ``x``.

    Resamples ``x`` with replacement ``nboot`` times, fits a Gaussian KDE
    to the resampled means, and solves for the half-width ``delta`` whose
    KDE mass around the point estimate is 95%.
    """
    phat = x.mean()
    # FIX: resample size must track len(x), not the outer xs
    # (they were only coincidentally equal in this script)
    bs = [np.random.choice(x, size=len(x)).mean() for i in range(nboot)]
    kbs = gaussian_kde(bs)
    delta = fsolve(lambda d: 0.95 - kbs.integrate_box(phat - d, phat + d),
                   0.1)[0]
    return (phat - delta, phat + delta)


# compute bootstrap confidence intervals for every trial.
# FIX: compute_bootstrap_CI returns (lower, upper) -- the original unpacked
# the pair as upper_b, lower_b, swapping the bootstrap bounds in the plot.
lower_b, upper_b = zip(*[compute_bootstrap_CI(xs[:, i])
                         for i in range(xs.shape[1])])

fig, ax = plt.subplots()
fig.set_size_inches((10, 3))
ax.axis(ymin=0.2, ymax=0.9, xmax=100)
ax.plot(upper, label='upper asymptotic', lw=2.)
ax.plot(lower, label='lower asymptotic', lw=2., color='b')
ax.plot(phat, 'o', color='gray', label='point estimate')
ax.plot(phat + epsilon_n, label='Hoeffding upper', color='g')
ax.plot(phat - epsilon_n, label='Hoeffding lower', color='g')
ax.plot(upper_b, label='upper bootstrap', lw=2., color='m')
ax.plot(lower_b, label='lower bootstrap', lw=2., color='m')
ax.set_xlabel('trial index')
ax.set_ylabel('value of estimate')
ax.legend(loc=(1, 0))

# Example from Good's "Common Errors in Statistics" book p.19
# (which is, ironically, wrong)
import itertools as it
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt
from pandas import DataFrame
from scipy import stats

df = DataFrame(index=('male', 'female'))
df['survived'] = (9, 4)
df['died'] = (1, 10)
df

patients = ['M'] * 10 + ['F'] * 14
sample = np.random.permutation(patients)[:13]  # first 13 slots = survivors
print(sample)
print(Counter(sample))

# Exact permutation null: every 13-subset of the 24 patient slots is an
# equally likely set of survivors under H_0. Enumerating subsets with
# stdlib itertools.combinations replaces the third-party `combinatorics`
# package's labeled_balls_in_unlabeled_boxes(24, [13, 11]) -- with two
# size-distinguishable boxes the enumerations are in bijection.
foo = lambda c: len(set(range(10)) & set(c))   # males occupy slots 0-9
o = [foo(c) for c in it.combinations(range(24), 13)]

plt.hist(o, 10, align='left')
plt.title('Surviving males under $H_0$')
plt.xlabel('number of surviving males')
plt.axis(xmin=0)

# Probability of observing 9 (or 10) male survivors under H_0.
# FIX: under Python 2 the original int/int division printed 0 for both
# p-values; true division gives the intended tail probabilities.
co = Counter(o)
print('p-value= ', (co[9] + co[10]) / sum(co.values()))
print('p-value= ', (co[0] + co[1] + co[2] + co[9] + co[10]) / sum(co.values()))

# Compare against the one-way chi-squared proportion test and
# Fisher's exact test on the 2x2 table.
print(stats.chisquare([9, 4], [6.5, 6.5]))
print(stats.fisher_exact(df.values))