#!/usr/bin/env python
# coding: utf-8

# We shall work on several concepts of statistics in this notebook, always in the context of data coming from independent Bernoulli trials with an unknown parameter $p$.

# Assume that we have a coin-flipping experiment with a coin, chosen from a pool of coins, whose success probability is $p$. Let us consider $1$ as success and $0$ as failure. The law which determines $p$ has a density $\pi_0(p)$. Let us then flip the coin $k$ times with results $x_1,\ldots,x_k$. What is the probability law for $p$ given $x_1,\ldots,x_k$?

# In[122]:


import numpy

p0 = 0.8  # numpy.random.uniform(0,1,1)

def pi0(p):
    # prior density of p: uniform on [0,1]
    return 1

k = 100
x = numpy.random.binomial(1, p0, k)  # k Bernoulli trials with success probability p0
y = numpy.cumsum(x)                  # running number of successes
print(p0)
print(x)
#print(y)


# # Bayesian Approach
# 
# This is a typical question about conditional distributions. We consider the random variables $X_1,\ldots,X_k$, which are Bernoulli distributed with success parameter $p$. Then
# $$
# P(X_1=x_1,\ldots,X_k=x_k|p) = p^{(x_1+\ldots+x_k)}(1-p)^{(k-(x_1+\ldots+x_k))}
# $$
# holds true. Bayes' theorem says
# $$
# P(p|x_1,\ldots,x_k)=\frac{p^{(x_1+\ldots+x_k)}(1-p)^{(k-(x_1+\ldots+x_k))}\pi_0(p)}{\int_0^1p^{(x_1+\ldots+x_k)}(1-p)^{(k-(x_1+\ldots+x_k))}\pi_0(p)dp} \, .
# $$

# In[123]:


import matplotlib.pyplot as plt

for i in range(k):
    p = numpy.linspace(0, 1, 1000)
    # unnormalized posterior density of p after i+1 flips with y[i] successes
    l = p**y[i]*(1-p)**(i+1-y[i])*pi0(p)
    l = l/numpy.sum(l)*1000          # normalize on the grid of 1000 points
    plt.plot(p, l)
    plt.scatter(numpy.array([p0]), numpy.array([0]))
    plt.show()
    print(numpy.mean(l*p))           # posterior mean of p


# # Maximum Likelihood Estimator
# 
# We now plot the sequence of likelihood functions; the red point marks the maximum likelihood estimator, i.e. the argument at which the likelihood function attains its maximum.

# In[124]:


for i in range(k):
    p = numpy.linspace(0, 1, 1000)
    l = p**y[i]*(1-p)**(i+1-y[i])
    plt.plot(p, l)
    plt.scatter(numpy.array([y[i]/(i+1)]), numpy.array([0]), color='r')
    plt.show()


# # Tests
# 
# In the sequel we test whether the null hypothesis $p_0$ or the alternative $p_1$ holds true: we first plot the likelihood for different values of $p$ given the data (Bayesian approach!), and we also plot the likelihood ratio (actually its logarithm) as a function of the data under the null and the alternative hypothesis.

# In[125]:


p0 = 0.5
p1 = 0.7
for i in range(k):
    p = numpy.linspace(0, 1, 1000)
    l = p**y[i]*(1-p)**(i+1-y[i])
    l = l/numpy.mean(l)
    plt.plot(p, l)
    plt.scatter(numpy.array([p0]), numpy.array([0]), color='r')
    plt.scatter(numpy.array([p1]), numpy.array([0]), color='g')
    print('Likelihood-Ratio: ', numpy.log(p0**y[i]*(1-p0)**(i+1-y[i])/(p1**y[i]*(1-p1)**(i+1-y[i]))))
    plt.show()


# In[126]:


from scipy import special

p0 = 0.5
p1 = 0.7
k = 100
for i in range(1, k):
    cs = numpy.linspace(0, i, i+1)       # possible success counts 0,...,i
    test = numpy.ones(i+1)
    helper = numpy.zeros(i+1)
    for l in range(i+1):
        # log-likelihood ratio of H0 against H1 for cs[l] successes out of i flips
        test[l] = numpy.log(p0**cs[l]*(1-p0)**(i-cs[l])/(p1**cs[l]*(1-p1)**(i-cs[l])))
        # binomial pmf of cs[l] successes out of i flips under the null hypothesis
        helper[l] = special.binom(i, l)*p0**cs[l]*(1-p0)**(i-cs[l])
    cdf = numpy.cumsum(helper)                     # CDF of the success count under the null
    threshold = test[numpy.argmax(cdf > 0.95)]     # LLR value at the 95% quantile under the null
    plt.plot(cs, test, color='blue')
    plt.plot(cs, numpy.ones(i+1)*threshold)
    plt.scatter(numpy.array([y[i-1]]), numpy.array([0]), color='r')
    plt.show()


# # $p$-value
# 
# We shall also calculate the $p$-value, i.e. the probability with which values larger than or equal to the actual observation would appear under the null hypothesis. It measures how 'extreme' an observation is, given that the null hypothesis is true.
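# As a minimal cross-check sketch (not part of the original computation), the one-sided tail probability $P(\#\text{successes} \ge y_{\mathrm{obs}})$ under the null hypothesis can also be read off the binomial survival function in `scipy.stats`; the flip count `n_flips` and the observation `y_obs` below are hypothetical placeholders. The next cell computes the same quantity by hand from the binomial pmf.

# In[ ]:


from scipy import stats

p_null = 0.5    # null hypothesis value of p, as in the cells above
n_flips = 20    # hypothetical number of flips
y_obs = 14      # hypothetical observed number of successes

# P(X >= y_obs) = P(X > y_obs - 1) for X ~ Binomial(n_flips, p_null)
p_value = stats.binom.sf(y_obs - 1, n_flips, p_null)
print("p-value:", p_value)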
# In[127]:


from scipy import special

p0 = 0.5
p1 = 0.7
k = 100
for i in range(1, k):
    cs = numpy.linspace(0, i, i+1)       # possible success counts 0,...,i
    c0 = numpy.zeros(i+1)
    for l in range(i+1):
        # binomial pmf of cs[l] successes out of i flips under the null hypothesis
        c0[l] = special.binom(i, l)*p0**(cs[l])*(1-p0)**(i-cs[l])
    plt.plot(cs, c0, color='blue')
    plt.scatter(numpy.array([y[i-1]]), numpy.array([0]), color='r')
    # p-value: probability of at least y[i-1] successes under the null hypothesis
    print(numpy.sum(c0[y[i-1]:]))
    plt.show()


# In[117]:


numpy.mean(x)


# # $t$-test
# 
# We collect two samples of normally distributed data in an unpaired way. We test whether the difference of expectations is $0$ (null hypothesis).

# In[128]:


## Import the packages
import numpy
from scipy import stats

## Define 2 random distributions
# Sample size
N = 10
# Gaussian distributed data with mean = 1 and var = 1
a = numpy.random.randn(N) + 1
# Gaussian distributed data with mean = 0 and var = 1
b = numpy.random.randn(N)


# In[129]:


# Calculate the empirical variances with unbiased estimators
var_a = a.var(ddof=1)
var_b = b.var(ddof=1)

# Pooled standard deviation (equal sample sizes)
s = numpy.sqrt((var_a + var_b)/2)

# Calculate the t-statistic
t = (a.mean() - b.mean())/(s*numpy.sqrt(2/N))

# Compare with the critical t-value
# Degrees of freedom
df = 2*N - 2

# One-sided tail probability from the t-distribution; the two-sided p-value is 2*p
p = 1 - stats.t.cdf(t, df=df)

print("t = " + str(t))
print("p = " + str(2*p))
# Comparing the t statistic with the critical t value (computed internally) typically yields
# a small p-value (in one run about 0.0005), so we reject the null hypothesis: the difference
# between the two sample means is statistically significant.

## Cross-checking with the internal scipy function
t2, p2 = stats.ttest_ind(a, b)
print("t = " + str(t2))
print("p = " + str(p2))

x = numpy.linspace(-5, 5, 1000)
plt.plot(x, stats.t.pdf(x, df=df))
plt.scatter(numpy.array([t]), numpy.array([0]), color='red')
plt.show()


# # Bayesian Approach in a Gaussian world -- the Kalman filter

# In[10]:


import matplotlib.pyplot as plt
import numpy

mu = 1
sigmaX = 1
sigmaZ = 1
n = 100
X = numpy.random.normal(mu, sigmaX, 1)*numpy.ones(n)   # one draw of the hidden state, held constant
Z = numpy.random.normal(0, sigmaZ, n)                  # observation noise
Y = X + Z                                              # noisy observations
Xhat = numpy.ones(n)
for i in range(n):
    x = numpy.linspace(-10, 10, 1000)
    # unnormalized posterior density: Gaussian likelihood of Y[i] times Gaussian prior N(mu, sigmaX^2)
    l = numpy.exp(-0.5*(Y[i]-x)**2/sigmaZ**2)*numpy.exp(-0.5*(x-mu)**2/sigmaX**2)
    l = l/numpy.sum(l)*1000/20       # normalize with respect to the grid spacing 20/1000
    plt.plot(x, l)
    plt.scatter(numpy.array([X[0]]), numpy.array([0]))
    plt.show()
    Xhat[i] = numpy.sum(l*x)*20/1000     # posterior mean as a Riemann sum
    # conjugate Gaussian update of the prior for the next step
    helper = (mu/sigmaX**2 + Y[i]/sigmaZ**2)
    sigmaX = numpy.sqrt(1/(1/sigmaX**2 + 1/sigmaZ**2))
    mu = sigmaX**2*helper


# In[11]:


plt.plot(X)
plt.plot(Y)
plt.plot(Xhat)
plt.show()


# In[ ]:
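# The loop above computes the posterior on a grid, but in this Gaussian-conjugate setting the update is available in closed form: with prior $N(\mu,\sigma_X^2)$ and observation $Y = X + Z$, $Z\sim N(0,\sigma_Z^2)$, the posterior is $N(\mu',\sigma_X'^2)$ with $1/\sigma_X'^2 = 1/\sigma_X^2 + 1/\sigma_Z^2$ and $\mu' = \sigma_X'^2(\mu/\sigma_X^2 + Y/\sigma_Z^2)$, which is exactly the update at the end of the loop. The following standalone sketch (an addition, with a hypothetical helper `gaussian_posterior_update` and a fixed random seed) iterates this closed-form recursion without the grid.

# In[ ]:


import numpy

def gaussian_posterior_update(mu, sigmaX, Y, sigmaZ):
    # One conjugate update: prior N(mu, sigmaX^2), observation Y = X + Z with Z ~ N(0, sigmaZ^2)
    sigma_post = numpy.sqrt(1/(1/sigmaX**2 + 1/sigmaZ**2))
    mu_post = sigma_post**2*(mu/sigmaX**2 + Y/sigmaZ**2)
    return mu_post, sigma_post

rng = numpy.random.default_rng(0)
X_true = rng.normal(1, 1)                  # hidden state, drawn once and held fixed
mu, sigmaX, sigmaZ = 1, 1, 1
for _ in range(100):
    Y = X_true + rng.normal(0, sigmaZ)     # noisy observation of the hidden state
    mu, sigmaX = gaussian_posterior_update(mu, sigmaX, Y, sigmaZ)
print("true X:", X_true, " posterior mean:", mu, " posterior std:", sigmaX)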