import scipy.stats as stats
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

sns.set(context='notebook', style='whitegrid')

# Probability mass functions of two binomial random variables
binomial = stats.binom
parameters = [(10, 0.4), (10, 0.9)]
colors = ["#9b59b6", "#3498db"]

for i in range(2):
    N, p = parameters[i]
    _x = np.arange(N + 1)
    plt.bar(_x - 0.5, binomial.pmf(_x, N, p), color=colors[i], alpha=0.6,
            label='$N$: {}, $p$: {}'.format(N, p))

plt.legend(loc='upper left')
plt.xlim(0, 10.5)
plt.xlabel('$k$')
plt.ylabel('$P(X=k)$')
plt.title('Probability mass functions of binomial random variables')

import pymc as pm

# Privacy-preserving "cheating students" model: each student flips a coin;
# heads -> answer truthfully, tails -> flip again and answer "yes" on heads.
N = 100
p = pm.Uniform('freq_cheating', 0, 1)
true_answers = pm.Bernoulli('truths', p, size=N)

first_coin_flip = pm.Bernoulli('first_flip', 0.5, size=N)
first_coin_flip.value  # inspect the currently sampled values
second_coin_flips = pm.Bernoulli('second_flips', 0.5, size=N)

@pm.deterministic
def observed_proportion(t_a=true_answers, fc=first_coin_flip,
                        sc=second_coin_flips):
    observed = fc * t_a + (1 - fc) * sc
    return observed.sum() / float(N)

observed_proportion.value

X = 35  # number of "yes" responses observed
observations = pm.Binomial('obs', N, observed_proportion,
                           observed=True, value=X)

model = pm.Model([p, true_answers, first_coin_flip, second_coin_flips,
                  observed_proportion, observations])

# MCMC is explained in a later notebook
mcmc = pm.MCMC(model)
mcmc.sample(40000, 15000)

p_trace = mcmc.trace('freq_cheating')[:]
plt.hist(p_trace, histtype='stepfilled', normed=True, alpha=0.8, bins=30,
         label='posterior distribution', color='#348ABD')
plt.vlines([0.05, 0.35], [0, 0], [5, 5], alpha=0.5)
plt.xlim(0, 1)
plt.legend()

# Simpler, equivalent model: P("yes") = 0.5*p + 0.25
p = pm.Uniform('freq_cheating', 0, 1)

@pm.deterministic
def p_skewed(p=p):
    return 0.5 * p + 0.25

yes_responses = pm.Binomial('number_cheats', 100, p_skewed,
                            value=35, observed=True)

model = pm.Model([yes_responses, p_skewed, p])
mcmc = pm.MCMC(model)
mcmc.sample(25000, 2500)

p_trace = mcmc.trace('freq_cheating')[:]
plt.hist(p_trace, histtype='stepfilled', normed=True, alpha=0.8, bins=30,
         label='posterior distribution', color='#348ABD')
plt.vlines([0.05, 0.35], [0, 0], [5, 5], alpha=0.5)
plt.xlim(0, 1)
plt.legend();

# Same model with more data: 300 "yes" responses out of 500
yes_responses = pm.Binomial('number_cheats', 500, p_skewed,
                            value=300, observed=True)

model = pm.Model([yes_responses, p_skewed, p])
mcmc = pm.MCMC(model)
mcmc.sample(25000, 2500)

p_trace = mcmc.trace('freq_cheating')[:]
plt.hist(p_trace, histtype='stepfilled', normed=True, alpha=0.8, bins=30,
         label='posterior distribution', color='#348ABD')
plt.vlines([p_trace.mean() + 2 * p_trace.std(),
            p_trace.mean() - 2 * p_trace.std()], [0, 0], [10, 10], alpha=0.5)
plt.xlim(0, 1)
plt.ylim(0, 13)
plt.legend();

print(p_trace.mean() + 2 * p_trace.std())
print(p_trace.mean() - 2 * p_trace.std())

# Width of the mean +/- 2 std interval
(p_trace.mean() + 2 * p_trace.std()) - (p_trace.mean() - 2 * p_trace.std())

# An array of PyMC variables: ten Exponentials with different rates
N = 10
x = np.empty(N, dtype=object)
for i in range(0, N):
    x[i] = pm.Exponential('x_%i' % i, (i + 1) ** 2)

# Challenger O-ring data
import pandas as pd

np.set_printoptions(precision=3, suppress=True)
data = pd.read_csv('https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv',
                   header=0, names=['date', 'temperature', 'incident'])
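# Not part of the original notebook: a minimal sketch of one way to finish
# cleaning the pandas frame, assuming a pandas version that provides
# pd.to_numeric and assuming the incident column holds 0/1 values, 'NA'
# markers, and a final 'Challenger Accident' text row. The name `cleaned`
# is introduced here for illustration only.
cleaned = data.copy()
cleaned['incident'] = pd.to_numeric(cleaned['incident'], errors='coerce')
cleaned = cleaned.dropna(subset=['incident'])  # drops 'NA' rows and the accident row
print(cleaned[['temperature', 'incident']].head())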
# Other import method: load with NumPy directly, since the pandas
# formatting needs the extra cleanup sketched above
challenger_data = np.genfromtxt("https://raw.githubusercontent.com/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/master/Chapter2_MorePyMC/data/challenger_data.csv",
                                skip_header=1, usecols=[1, 2],
                                missing_values="NA", delimiter=",")
challenger_data = challenger_data[~np.isnan(challenger_data[:, 1])]  # drop NA values

data = data[data['incident'].notnull()]
print(data)

# Plot the data, excluding the actual Challenger incident (last row)
plt.scatter(data['temperature'][:-1], data['incident'][:-1],
            s=75, color='k', alpha=0.5)
plt.yticks([0, 1])
plt.ylabel('Incident?')
plt.xlabel('Outside Temperature (F)')

# Logistic function with a single slope parameter
def logistic(x, beta):
    return 1.0 / (1.0 + np.exp(beta * x))

x = np.linspace(-4, 4, 100)
plt.plot(x, logistic(x, 1), label=r"$\beta = 1$")
plt.plot(x, logistic(x, 3), label=r"$\beta = 3$")
plt.plot(x, logistic(x, -5), label=r"$\beta = -5$")
plt.legend();

# Logistic function with an added bias term alpha
def logistic(x, beta, alpha=0):
    return 1.0 / (1.0 + np.exp(np.dot(beta, x) + alpha))

x = np.linspace(-4, 4, 100)
plt.plot(x, logistic(x, 1), label=r"$\beta = 1$", ls="--", lw=1)
plt.plot(x, logistic(x, 3), label=r"$\beta = 3$", ls="--", lw=1)
plt.plot(x, logistic(x, -5), label=r"$\beta = -5$", ls="--", lw=1)
plt.plot(x, logistic(x, 1, 1), label=r"$\beta = 1, \alpha = 1$",
         color="#348ABD")
plt.plot(x, logistic(x, 3, -2), label=r"$\beta = 3, \alpha = -2$",
         color="#A60628")
plt.plot(x, logistic(x, -5, 7), label=r"$\beta = -5, \alpha = 7$",
         color="#7A68A6")
plt.legend(loc="lower left");

# temp = data['temperature'][:-1].values
# D = data['incident'][:-1].values
temperature = challenger_data[:, 0]
D = challenger_data[:, 1]  # defect or not?

# Vague Normal priors on the logistic parameters (precision 0.001)
beta = pm.Normal("beta", 0, 0.001, value=0)
alpha = pm.Normal("alpha", 0, 0.001, value=0)

@pm.deterministic
def p(t=temperature, alpha=alpha, beta=beta):
    return 1.0 / (1.0 + np.exp(beta * t + alpha))

print(p.value)

# connect the probabilities in `p` with our observations through
# a Bernoulli random variable
obs = pm.Bernoulli('bernoulli_obs', p, value=D, observed=True)

model = pm.Model([obs, beta, alpha])
map_ = pm.MAP(model)
map_.fit()
mcmc = pm.MCMC(model)
mcmc.sample(120000, 100000, 2)

# reshape the traces to (n_samples, 1) columns so they broadcast against
# the temperature grid below
alpha_samples = mcmc.trace('alpha')[:, None]
beta_samples = mcmc.trace('beta')[:, None]

# histogram of the samples:
plt.subplot(211)
plt.title(r"Posterior distributions of the variables $\alpha, \beta$")
plt.hist(beta_samples, histtype='stepfilled', bins=35, alpha=0.85,
         label=r"posterior of $\beta$", color="#7A68A6", normed=True)
plt.legend()
plt.subplot(212)
plt.hist(alpha_samples, histtype='stepfilled', bins=35, alpha=0.85,
         label=r"posterior of $\alpha$", color="#A60628", normed=True)
plt.legend();

# Expected probability of defect across a temperature grid
t = np.linspace(temperature.min() - 5, temperature.max() + 5, 50)[:, None]
p_t = logistic(t.T, beta_samples, alpha_samples)
mean_prob_t = p_t.mean(axis=0)

from IPython.core.pylabtools import figsize
# figsize(18, 5)

plt.plot(t, mean_prob_t, lw=3,
         label="average posterior \nprobability of defect")
plt.plot(t, p_t[0, :], ls="--", label="realization from posterior")
plt.plot(t, p_t[-2, :], ls="--", label="realization from posterior")
plt.scatter(temperature, D, color="k", s=50, alpha=0.5)
plt.title("Posterior expected value of probability of defect; "
          "plus realizations")
plt.legend(loc="lower left")
plt.ylim(-0.1, 1.1)
plt.xlim(t.min(), t.max())
plt.ylabel("probability")
plt.xlabel("temperature");
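# Not in the original notebook: a quick numerical summary of the alpha/beta
# posteriors with plain NumPy, to complement the histograms above. Because
# beta_samples and alpha_samples are (n_samples, 1) column vectors,
# logistic(t.T, beta_samples, alpha_samples) broadcasts to an
# (n_samples, 50) array of posterior curves.
for name, samples in [('alpha', alpha_samples), ('beta', beta_samples)]:
    low, high = np.percentile(samples, [2.5, 97.5])
    print('%s: mean %.3f, 95%% interval (%.3f, %.3f)'
          % (name, samples.mean(), low, high))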
from scipy.stats.mstats import mquantiles

# vectorized bottom and top 2.5% quantiles for the 95% credible interval
qs = mquantiles(p_t, [0.025, 0.975], axis=0)
plt.fill_between(t[:, 0], *qs, alpha=0.7, color='#7A68A6')
plt.plot(t[:, 0], qs[0], label="95% CI", color="#7A68A6", alpha=0.7)
plt.plot(t, mean_prob_t, lw=1, ls="--", color="k",
         label="average posterior \nprobability of defect")
plt.xlim(t.min(), t.max())
plt.ylim(-0.02, 1.02)
plt.legend(loc="lower left")
plt.scatter(temperature, D, color="k", s=50, alpha=0.5)
plt.xlabel("temp, $t$")
plt.ylabel("probability estimate")
plt.title("Posterior probability estimates given temp. $t$");

# Posterior probability of a defect at 31 degrees F, the launch temperature
prob_31 = logistic(31, beta_samples, alpha_samples)

plt.xlim(0.995, 1)
plt.hist(prob_31, bins=10000, normed=True, histtype='stepfilled')
plt.title('Posterior distribution of the probability of a defect at 31 degrees F')
plt.xlabel('probability of a defect occurring in the O-ring');
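# Not in the original notebook: summarize the 31 F posterior numerically.
# The histogram above is crowded against 1, so report the posterior mean and
# the fraction of samples above 0.99 (the 0.99 threshold is an arbitrary
# illustration, not from the source).
print('mean P(defect | t = 31): %.4f' % prob_31.mean())
print('P(defect probability > 0.99): %.4f' % (prob_31 > 0.99).mean())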