import pymc as mc parameter = mc.Exponential( "poisson_param", 1 ) data_generator = mc.Poisson("data_generator", parameter ) data_plus_one = data_generator + 1 print "Children of `parameter`: " print parameter.children print "\nParents of `data_generator`: " print data_generator.parents print "\nChildren of `data_generator`: " print data_generator.children print "parameter.value =",parameter.value print "data_generator.value =",data_generator.value print "data_plus_one.value =", data_plus_one.value lambda_1 = mc.Exponential( "lambda_1", 1 ) #prior on first behaviour lambda_2 = mc.Exponential( "lambda_2", 1 ) #prior on second behaviour tau = mc.DiscreteUniform( "tau", lower = 0, upper = 10 ) #prior on behaviour change print "lambda_1.value = %.3f"%lambda_1.value print "lambda_2.value = %.3f"%lambda_2.value print "tau.value = %.3f"%tau.value print lambda_1.random(), lambda_2.random(), tau.random() print "After calling random() on the variables..." print "lambda_1.value = %.3f"%lambda_1.value print "lambda_2.value = %.3f"%lambda_2.value print "tau.value = %.3f"%tau.value type( lambda_1 + lambda_2 ) n_data_points = 5 # in CH1 we had ~70 data points @mc.deterministic def lambda_( tau = tau, lambda_1 = lambda_1, lambda_2 = lambda_2 ): out = np.zeros(n_data_points) out[:tau] = lambda_1 #lambda before tau is lambda1 out[tau:] = lambda_2 #lambda after tau is lambda1 return out %pylab inline figsize(12.5, 4) samples = [ lambda_1.random() for i in range( 20000) ] hist( samples, bins = 70, normed=True ) plt.title( "Prior distribution for $\lambda_1$") plt.xlim( 0, 8); data = np.array( [10, 5] ) fixed_variable = mc.Poisson( "fxd", 1, value = data, observed = True ) print "value: ",fixed_variable.value print "calling .random()" fixed_variable.random() print "value: ",fixed_variable.value #we're using some fake data here data = np.array( [ 10, 25, 15, 20, 35] ) obs = mc.Poisson( "obs", lambda_, value = data, observed = True ) print obs.value model = mc.Model( [obs, lambda_, lambda_1, lambda_2, tau] ) tau = mc.rdiscrete_uniform(0, 80) print tau alpha = 1./20. lambda_1, lambda_2 = mc.rexponential( alpha, 2 ) print lambda_1, lambda_2 data = np.r_[ mc.rpoisson( lambda_1, tau ), mc.rpoisson( lambda_2,80 - tau) ] plt.bar( np.arange( 80 ), data, color ="#348ABD" ) plt.bar( tau-1, data[tau-1], color = "r", label = "user behaviour changed" ) plt.xlabel( "Time (days)") plt.ylabel("count of text-msgs received") plt.title("Artifical dataset") plt.xlim( 0, 80 ); plt.legend(); def plot_artificial_sms_dataset(): tau = mc.rdiscrete_uniform(0, 80) alpha = 1./20. lambda_1, lambda_2 = mc.rexponential( alpha, 2 ) data = np.r_[ mc.rpoisson( lambda_1, tau ), mc.rpoisson( lambda_2,80- tau) ] plt.bar( np.arange( 80 ), data, color ="#348ABD" ) plt.bar( tau-1, data[tau-1], color = "r", label = "user behaviour changed" ) plt.xlim( 0, 80 ); figsize( 12.5,5) plt.title("More example of artificial datasets" ) for i in range(4): plt.subplot(4,1,i) plot_artificial_sms_dataset() figsize( 12.5, 4) import scipy.stats as stats binomial = stats.binom parameters = [ (10, .4) , (10, .9) ] colors = ["#348ABD", "#A60628"] for i in range(2): N, p = parameters[i] _x = np.arange( N+1 ) plt.bar( _x - 0.5, binomial.pmf( _x, N, p ), color = colors[i], edgecolor=colors[i], alpha = 0.6, label = "$N$: %d, $p$: %.1f"%(N,p), linewidth=3) plt.legend(loc="upper left") plt.xlim(0, 10.5) plt.xlabel("$k$") plt.ylabel("$P(X = k)$") plt.title("Probability mass distributions of binomial random variables"); import pymc as mc N = 100 p = mc.Uniform( "freq_cheating", 0, 1) true_answers = mc.Bernoulli( "truths", p, size = N) first_coin_flips = mc.Bernoulli( "first_flips", 0.5, size = N) print first_coin_flips.value second_coin_flips = mc.Bernoulli("second_flips", 0.5, size = N) @mc.deterministic def observed_proportion( t_a = true_answers, fc = first_coin_flips, sc = second_coin_flips ): observed = fc*t_a + (1-fc)*sc return observed.sum()/float(N) observed_proportion.value X = 35 observations = mc.Binomial("obs", N, observed_proportion, observed = True, value = X) model = mc.Model( [p, true_answers, first_coin_flips, second_coin_flips, observed_proportion, observations] ) ### To be explained in Chapter 3! mcmc = mc.MCMC( model ) mcmc.sample( 150000, 120000,4 ) figsize(12.5, 3 ) from scipy.stats.mstats import mquantiles p_trace = mcmc.trace("freq_cheating")[:] quantiles =mquantiles( p_trace, prob=[0.05, 0.95] ) plt.title("Posterior distribution of frequency of cheaters") plt.hist( p_trace, histtype="stepfilled" , normed = True, alpha = 0.85, bins = 30, label = "posterior distribution", color = "#348ABD") plt.vlines( quantiles, [0,0], [5,5], linestyles = "--" ) plt.xlim(0,1) plt.legend(); p = mc.Uniform( "freq_cheating", 0, 1) @mc.deterministic def p_skewed( p = p ): return 0.5*p + 0.25 yes_responses = mc.Binomial( "number_cheaters", 100, p_skewed, value = 35, observed = True ) model = mc.Model( [yes_responses, p_skewed, p ] ) ### To Be Explained in Chapter 3! mcmc = mc.MCMC(model) mcmc.sample( 12500, 2500 ) figsize(12.5, 3 ) p_trace = mcmc.trace("freq_cheating")[:] quantiles =mquantiles( p_trace, prob=[0.05, 0.95] ) plt.title("Posterior distribution of frequency of cheaters") plt.hist( p_trace, histtype="stepfilled" , normed = True, alpha = 0.85, bins = 30, label = "posterior distribution", color = "#348ABD") plt.vlines( quantiles, [0,0], [5,5], linestyles = "--" ) plt.xlim(0,1) plt.legend(); N = 10 x = np.empty( N , dtype=object ) for i in range(0, N): x[i] = mc.Exponential('x_%i' % i, (i+1)**2) figsize( 12.5, 3.5 ) np.set_printoptions(precision=3, suppress= True) challenger_data = np.genfromtxt("data/challenger_data.csv", skip_header = 1, \ usecols=[1,2], missing_values="NA", delimiter=",") #drop the NA values challenger_data = challenger_data[ ~np.isnan(challenger_data[:,1]) ] #plot it, as a function of tempature (the first column) print "Temp (F), O-Ring failure?" print challenger_data plt.scatter( challenger_data[:,0], challenger_data[:,1], s = 75, color="k", alpha = 0.5) plt.yticks([0,1]) plt.ylabel("Damage Incident?") plt.xlabel("Outside temperature (Fahrenheit)" ) plt.title("Defects of the Space Shuttle O-Rings vs temperature"); figsize(12,3) def logistic( x, beta): return 1.0/( 1.0 + np.exp( beta*x) ) x = np.linspace( -4, 4, 100 ) plt.plot(x, logistic( x, 1), label = r"$\beta = 1$") plt.plot(x, logistic( x, 3), label = r"$\beta = 3$") plt.plot(x, logistic( x, -5), label = r"$\beta = -5$") plt.legend(); def logistic( x, beta, alpha=0): return 1.0/( 1.0 + np.exp( np.dot(beta, x) + alpha) ) x = np.linspace( -4, 4, 100 ) plt.plot(x, logistic( x, 1), label = r"$\beta = 1$", ls= "--", lw =1 ) plt.plot(x, logistic( x, 3), label = r"$\beta = 3$", ls= "--", lw =1) plt.plot(x, logistic( x, -5), label = r"$\beta = -5$", ls= "--", lw =1) plt.plot(x, logistic( x, 1,1), label = r"$\beta = 1, \alpha = 1$", color = "#348ABD") plt.plot(x, logistic( x, 3, -2), label = r"$\beta = 3, \alpha = -2$", color="#A60628") plt.plot(x, logistic( x, -5, 7), label = r"$\beta = -5, \alpha = 7$", color = "#7A68A6") plt.legend(loc = "lower left"); import scipy.stats as stats nor = stats.norm x = np.linspace( -8, 7, 150 ) mu = (-2, 0, 3) tau = (.7, 1, 2.8 ) colors = ["#348ABD", "#A60628", "#7A68A6"] parameters = zip( mu, tau, colors ) for _mu, _tau, _color in parameters: plt.plot( x, nor.pdf( x, _mu , scale = 1./_tau ), \ label ="$\mu = %d,\;\\tau = %.1f$"%(_mu, _tau), color = _color ) plt.fill_between( x, nor.pdf( x, _mu, scale =1./_tau ), color = _color, \ alpha = .33) plt.legend(loc = "upper right") plt.xlabel("$x$") plt.ylabel("density function at $x$") plt.title("Probability distribution of three different Normal random variables"); import pymc as mc temperature = challenger_data[:,0] D = challenger_data[:,1] #defect or not? #notice the`value` here. We explain why below. beta = mc.Normal( "beta", 0, 0.001, value = 0 ) alpha = mc.Normal( "alpha", 0, 0.001, value = 0 ) @mc.deterministic def p( t = temperature, alpha = alpha, beta = beta): return 1.0/( 1. + np.exp( beta*t + alpha) ) p.value # connect the probabilities in `p` with our observations through a # Bernoulli random variable. observed = mc.Bernoulli( "bernoulli_obs", p, value = D, observed=True) model = mc.Model( [observed, beta, alpha] ) #mysterious code to be explained in Chapter 3 map_ = mc.MAP(model) map_.fit() mcmc = mc.MCMC( model ) mcmc.sample( 120000, 100000, 2 ) alpha_samples = mcmc.trace( 'alpha' )[:, None] #best to make them 1d beta_samples = mcmc.trace( 'beta' )[:, None] figsize(12.5, 6) #histogram of the samples: plt.subplot(211) plt.title(r"Posterior distributions of the variables $\alpha, \beta$") plt.hist( beta_samples, histtype='stepfilled', bins = 35, alpha = 0.85, \ label = r"posterior of $\beta$", color = "#7A68A6",normed = True ) plt.legend() plt.subplot(212) plt.hist( alpha_samples, histtype='stepfilled', bins = 35, alpha = 0.85, \ label = r"posterior of $\alpha$", color = "#A60628",normed = True ) plt.legend(); t = np.linspace( temperature.min() - 5, temperature.max()+5, 50 )[:,None] p_t= logistic( t.T, beta_samples, alpha_samples ) mean_prob_t = p_t.mean(axis=0) figsize( 12.5, 4) plt.plot( t, mean_prob_t, lw = 3, label = "average posterior \nprobability \ of defect") plt.plot( t, p_t[0, :], ls="--",label="realization from posterior" ) plt.plot( t, p_t[-2, :], ls="--", label="realization from posterior" ) plt.scatter( temperature, D, color = "k", s = 50, alpha = 0.5 ) plt.title("Posterior expected value of probability of defect; plus realizations") plt.legend(loc= "lower left") plt.ylim( -0.1, 1.1 ) plt.xlim( t.min(), t.max() ) plt.ylabel("probability") plt.xlabel("temperature"); from scipy.stats.mstats import mquantiles # vectorized bottom and top 5% quantiles for "confidence interval" qs = mquantiles(p_t,[0.05,0.95],axis=0) plt.fill_between(t[:,0],*qs,alpha = 0.7, color = "#7A68A6") plt.plot(t[:,0], qs[0], label="95% CI", color = "#7A68A6", alpha =0.7) plt.plot( t, mean_prob_t, lw = 1, ls= "--", color = "k", label = "average posterior \nprobability of defect") plt.xlim( t.min(), t.max() ) plt.ylim( -0.02, 1.02 ) plt.legend(loc="lower left") plt.scatter( temperature, D, color = "k", s = 50, alpha = 0.5 ) plt.xlabel("temp, $t$") plt.ylabel("probability estimate" ) plt.title( "Posterior probability estimates given temp. $t$" ); figsize(12.5, 2.5) prob_31 = logistic( 31, beta_samples, alpha_samples ) plt.xlim( 0.995, 1) plt.hist( prob_31, bins = 1000, normed = True, histtype='stepfilled' ) plt.title( "Posterior distribution of probability of defect, given $t = 31$") plt.xlabel( "probability of defect occuring in O-ring" ); simulated = mc.Bernoulli( "bernoulli_sim", p) N = 10000 mcmc = mc.MCMC( [simulated, alpha, beta, observed ] ) mcmc.sample( N ) figsize(12.5, 5) simulations = mcmc.trace("bernoulli_sim")[:] print simulations.shape plt.title( "Simulated dataset using posterior parameters") figsize( 12.5, 6) for i in range(4): ax = subplot( 4, 1, i+1) plt.scatter( temperature, simulations[1000*i,:], color = "k", s = 50, alpha = 0.6 ); posterior_probability = simulations.mean(axis=0) print "posterior prob of defect | realized defect " for i in range( len(D) ): print "%.2f | %d"%(posterior_probability[i], D[i]) ix = np.argsort( posterior_probability ) print "probb | defect " for i in range( len(D) ): print "%.2f | %d"%(posterior_probability[ix[i]], D[ix[i]]) from separation_plot import separation_plot figsize( 11., 1.5 ) separation_plot(posterior_probability, D ) figsize( 11., 1.25 ) # our temperature-dependent model separation_plot(posterior_probability, D ) plt.title("Temperature-dependent model") # perfect model # i.e. the probability of defect is equal to if a defect occurred or not. p = D separation_plot( p, D) plt.title("Perfect model") # random predictions p = np.random.rand( 23 ) separation_plot( p, D) plt.title("Random model") # constant model constant_prob = 7./23*np.ones(23) separation_plot(constant_prob, D ) plt.title("Constant-prediction model"); #type your code here. figsize(12.5, 4 ) plt.scatter( alpha_samples, beta_samples, alpha = 0.1 ) plt.title( "Why does the plot look like this?" ) plt.xlabel( r"$\alpha$") plt.ylabel( r"$\beta$"); from IPython.core.display import HTML def css_styling(): styles = open("../styles/custom.css", "r").read() return HTML(styles) css_styling()