from IPython.display import Image
import prettyplotlib as ppl
from prettyplotlib import plt
import numpy as np
from matplotlib import rc

rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica'], 'size': 22})
rc('xtick', labelsize=14)
rc('ytick', labelsize=14)
## for Palatino and other serif fonts use:
#rc('font',**{'family':'serif','serif':['Palatino']})
rc('text', usetex=True)

%matplotlib inline

Image('backbox_ml.png')

Image('openbox_pp.png')

from scipy import stats

# grid of hypotheses for the chance of heads
x_coin = np.linspace(0, 1, 101)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xlabel=r'Hypothesis for chance of heads',
                     ylabel=r'Probability of hypothesis',
                     title=r'Prior probability distribution after no coin tosses')
ppl.plot(ax, x_coin, stats.beta(2, 2).pdf(x_coin), linewidth=3.)
ax.set_xticklabels([r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%']);
# fig.savefig('coin1.png')

stats.beta(2, 2).rvs(1)

import random

def choose_coin():
    return stats.beta(2, 2).rvs(1)
    # random.uniform(0, 1)
    # np.random.normal(0, 1)

# pylab.hist([choose_coin() for dummy in range(100000)], normed=True, bins=100)

successes = []
returns = 0.
for i in range(10000):
    prob_heads = choose_coin()
    results = [random.uniform(0, 1) < prob_heads for dummy in range(10)]
    # only coins that came up heads 9 times out of 10 count; bet on tails once
    if results.count(True) == 9:
        successes.append(prob_heads)
        if random.uniform(0, 1) < prob_heads:
            returns += -1
        else:
            returns += 1

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)
r = ax.hist(np.array(successes), normed=True, bins=20)
print len(successes)
print "Average return", returns / len(successes)

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xlabel='Hypothesis for chance of heads',
                     ylabel='Probability of hypothesis',
                     title='Posterior probability distribution after first heads')
# Beta(2, 2) prior updated with a single head gives a Beta(3, 2) posterior
ppl.plot(ax, x_coin, stats.beta(3, 2).pdf(x_coin), linewidth=3.)
ax.set_xticklabels([r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%']);
fig.savefig('coin2.png')

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xlabel='Hypothesis for chance of heads',
                     ylabel='Probability of hypothesis',
                     title='Posterior probability distribution after 1 head, 1 tail')
ppl.plot(ax, x_coin, stats.beta(3, 3).pdf(x_coin), linewidth=3.)
ax.set_xticklabels([r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%']);
fig.savefig('coin3.png')

Image('coin3.png')

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xlabel='Hypothesis for chance of heads',
                     ylabel='Probability of hypothesis',
                     title='Posterior probability distribution after 20 heads and 20 tails')
ppl.plot(ax, x_coin, stats.beta(22, 22).pdf(x_coin), linewidth=3.)
ax.set_xticklabels([r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%']);
fig.savefig('coin4.png')

Image('coin4.png')
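# Added sketch (not in the original notebook): the Beta prior is conjugate to the
# binomial likelihood, so the Beta(22, 22) posterior above can be checked directly.
# Multiplying the Beta(2, 2) prior by the likelihood of 20 heads and 20 tails and
# normalizing on the grid should reproduce stats.beta(22, 22).
prior = stats.beta(2, 2).pdf(x_coin)
likelihood = x_coin**20 * (1 - x_coin)**20  # binomial kernel for 20 heads, 20 tails
posterior = prior * likelihood
posterior /= np.trapz(posterior, x_coin)  # normalize numerically on the grid
print "max abs difference:", np.abs(posterior - stats.beta(22, 22).pdf(x_coin)).max()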
from scipy import stats

fig = ppl.plt.figure(figsize=(14, 6))
ax1 = fig.add_subplot(121, title='What we want', ylim=(0, .5),
                      xlabel=r'$\theta$', ylabel=r'$P(\theta)$')
x_theta = np.linspace(-4, 4, 100)
ppl.plot(ax1, x_theta, stats.norm.pdf(x_theta), linewidth=4.)
ax2 = fig.add_subplot(122, title='What we get', xlim=(-4, 4), ylim=(0, 1800),
                      xlabel=r'$\theta$', ylabel=r'\# of samples')
ax2.hist(np.random.randn(10000), bins=20);

Image('wantget.png')

size = 200
true_intercept = 1
true_slope = 2

x = np.linspace(0, 1, size)
# y = a + b*x
true_regression_line = true_intercept + true_slope * x
# add noise
y = true_regression_line + np.random.normal(scale=.5, size=size)

data = dict(x=x, y=y)

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, xlabel='Value of gold', ylabel='Value of gold miners',
                     title='Synthetic data and underlying model')
ppl.scatter(ax, x, y, label='sampled data')
ppl.plot(ax, x, true_regression_line, label='true regression line', linewidth=4.)
ax.legend(loc=2);
fig.savefig('synth_data.png')

Image('synth_data.png')

import pymc as pm

with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
    # Define priors
    alpha = pm.Normal('alpha', mu=0, sd=20)
    beta = pm.Normal('beta', mu=0, sd=20)
    sigma = pm.Uniform('sigma', lower=0, upper=20)

    # Define linear regression
    y_est = alpha + beta * x

    # Define likelihood
    likelihood = pm.Normal('y', mu=y_est, sd=sigma, observed=y)

    # Inference!
    start = pm.find_MAP()  # Find starting value by optimization
    step = pm.NUTS(scaling=start)  # Instantiate MCMC sampling algorithm
    trace = pm.sample(2000, step, start=start, progressbar=False)  # draw 2000 posterior samples using NUTS sampling

with pm.Model() as model:
    # Specify glm and pass in data. The resulting linear model, its likelihood
    # and all its parameters are automatically added to our model.
    pm.glm.glm('y ~ x', data)
    step = pm.NUTS()  # Instantiate MCMC sampling algorithm
    trace = pm.sample(2000, step, progressbar=False)  # draw 2000 posterior samples using NUTS sampling

fig = pm.traceplot(trace, lines={'alpha': 1, 'beta': 2, 'sigma': .5});

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, xlabel='Value of gold', ylabel='Value of gold miners',
                     title='Posterior predictive regression lines')
ppl.scatter(ax, x, y, label='data')
from pymc import glm
glm.plot_posterior_predictive(trace, samples=100,
                              label='posterior predictive regression lines')
ppl.plot(ax, x, true_regression_line, label='true regression line', linewidth=5.)
ax.legend(loc=0);
fig.savefig('ppc1.png')

Image('ppc1.png')

# Add outliers
x_out = np.append(x, [.1, .15, .2, .25, .25])
y_out = np.append(y, [8, 6, 9, 7, 9])
data_out = dict(x=x_out, y=y_out)

fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, xlabel='Value of gold', ylabel='Value of gold miners',
                     title='Posterior predictive regression lines')
ppl.scatter(ax, x_out, y_out, label='data')

with pm.Model() as model:
    glm.glm('y ~ x', data_out)
    trace = pm.sample(2000, pm.NUTS(), progressbar=False)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, xlabel='Value of gold', ylabel='Value of gold miners',
                     title='Posterior predictive regression lines')
ppl.scatter(ax, x_out, y_out, label='data')
glm.plot_posterior_predictive(trace, samples=100,
                              label='posterior predictive regression lines')
ppl.plot(ax, x, true_regression_line, label='true regression line', linewidth=5.)
plt.legend(loc=0);
fig.savefig('ppc2.png')

Image('ppc2.png')
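# Added sketch (not in the original): the pull the five outliers exert on the fit
# can also be seen with a plain least-squares estimate (np.polyfit of degree 1 is
# ordinary least squares), which the normal-likelihood GLM above effectively
# reproduces under its weak priors.
slope_clean, intercept_clean = np.polyfit(x, y, 1)
slope_out, intercept_out = np.polyfit(x_out, y_out, 1)
print "OLS slope without outliers: %.2f (true slope: 2)" % slope_clean
print "OLS slope with outliers:    %.2f" % slope_out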
fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111)

normal_dist = pm.Normal.dist(mu=0, sd=1)
t_dist = pm.T.dist(mu=0, lam=1, nu=1)
x_eval = np.linspace(-8, 8, 300)
ppl.plot(ax, x_eval, pm.theano.tensor.exp(normal_dist.logp(x_eval)).eval(),
         label='Normal', linewidth=2.)
ppl.plot(ax, x_eval, pm.theano.tensor.exp(t_dist.logp(x_eval)).eval(),
         label='Student T', linewidth=2.)
plt.xlabel('x')
plt.ylabel('Probability density')
plt.legend();
fig.savefig('t-dist.png')

Image('t-dist.png')

with pm.Model() as model_robust:
    family = pm.glm.families.T()
    pm.glm.glm('y ~ x', data_out, family=family)
    trace_robust = pm.sample(2000, pm.NUTS(), progressbar=False)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, xlabel='Value of gold', ylabel='Value of gold miners',
                     title='Posterior predictive regression lines')
ppl.scatter(ax, x_out, y_out)
glm.plot_posterior_predictive(trace_robust, samples=100,
                              label='posterior predictive regression lines')
ppl.plot(ax, x, true_regression_line, label='true regression line', linewidth=5.)
plt.legend();
fig.savefig('ppc3.png')

Image('ppc3.png')

import zipline
import pytz
from datetime import datetime

fig = plt.figure(figsize=(8, 4))
prices = zipline.data.load_from_yahoo(stocks=['GLD', 'GDX'],
                                      end=datetime(2013, 8, 1, 0, 0, 0, 0, pytz.utc)).dropna()[:1000]
prices.plot();

fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111, xlabel='Price GDX in \$', ylabel='Price GLD in \$')
colors = np.linspace(0.1, 1, len(prices))
mymap = plt.get_cmap("winter")
sc = ax.scatter(prices.GDX, prices.GLD, c=colors, cmap=mymap, lw=0)
cb = plt.colorbar(sc)
cb.ax.set_yticklabels([str(p.date()) for p in prices[::len(prices)//10].index]);
fig.savefig('price_corr.png')

Image('price_corr.png')

with pm.Model() as model_reg:
    family = pm.glm.families.Normal()
    pm.glm.glm('GLD ~ GDX', prices, family=family)
    trace_reg = pm.sample(2000, pm.NUTS(), progressbar=False)

fig = plt.figure(figsize=(9, 6))
ax = fig.add_subplot(111, xlabel='Price GDX in \$', ylabel='Price GLD in \$',
                     title='Posterior predictive regression lines')
sc = ax.scatter(prices.GDX, prices.GLD, c=colors, cmap=mymap, lw=0)
glm.plot_posterior_predictive(trace_reg, samples=100,
                              label='posterior predictive regression lines',
                              lm=lambda x, sample: sample['Intercept'] + sample['GDX'] * x,
                              eval=np.linspace(prices.GDX.min(), prices.GDX.max(), 100))
cb = plt.colorbar(sc)
cb.ax.set_yticklabels([str(p.date()) for p in prices[::len(prices)//10].index]);
ax.legend(loc=0);
fig.savefig('ppc4.png')

Image('ppc4.png')
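# Added sketch, not part of the original analysis: a crude frequentist analogue of
# the random-walk model below is a rolling-window OLS slope, Cov(GLD, GDX) / Var(GDX).
# This assumes the old pandas pd.rolling_cov/pd.rolling_var API of this era; the
# 50-day window length is an arbitrary choice that the Bayesian model avoids.
import pandas as pd
window = 50
rolling_beta = pd.rolling_cov(prices.GLD, prices.GDX, window) / pd.rolling_var(prices.GDX, window)
rolling_beta.plot(title='Rolling %d-day regression slope of GLD on GDX' % window);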
from pymc.distributions.timeseries import *
from theano.tensor import repeat

model_randomwalk = pm.Model()
with model_randomwalk:
    # std of random walk, best sampled in log space.
    sigma_alpha, log_sigma_alpha = model_randomwalk.TransformedVar(
        'sigma_alpha',
        pm.Exponential.dist(1. / .02, testval=.1),
        pm.logtransform)
    sigma_beta, log_sigma_beta = model_randomwalk.TransformedVar(
        'sigma_beta',
        pm.Exponential.dist(1. / .02, testval=.1),
        pm.logtransform)

# To make the model simpler, we will apply the same coefficients
# to 50 data points at a time
subsample_alpha = 50
subsample_beta = 50

with model_randomwalk:
    alpha = GaussianRandomWalk('alpha', sigma_alpha**-2,
                               shape=len(prices) / subsample_alpha)
    beta = GaussianRandomWalk('beta', sigma_beta**-2,
                              shape=len(prices) / subsample_beta)

    # Make coefficients have the same length as prices
    alpha_r = repeat(alpha, subsample_alpha)
    beta_r = repeat(beta, subsample_beta)

with model_randomwalk:
    # Define regression
    regression = alpha_r + beta_r * prices.GDX.values

    # Assume prices are normally distributed; the mean comes from the regression.
    sd = pm.Uniform('sd', 0, 20)
    likelihood = pm.Normal('y', mu=regression, sd=sd,
                           observed=prices.GLD.values)

from scipy import optimize

with model_randomwalk:
    # First optimize random walk
    start = pm.find_MAP(vars=[alpha, beta], fmin=optimize.fmin_l_bfgs_b)

    # Sample
    step = pm.NUTS(scaling=start)
    trace_rw = pm.sample(2000, step, start=start, progressbar=False)

fig = plt.figure(figsize=(8, 6))
ax = plt.subplot(111, xlabel='time', ylabel='alpha',
                 title='Change of alpha over time')
ppl.plot(ax, trace_rw[-1000:]['alpha'].T, 'r', alpha=.05);
ax.set_xticklabels([str(p.date()) for p in prices[::len(prices)//5].index]);
fig.savefig('rwalk_alpha.png')

Image('rwalk_alpha.png')

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xlabel='time', ylabel='beta',
                     title='Change of beta over time')
ppl.plot(ax, trace_rw[-1000:]['beta'].T, 'b', alpha=.05);
ax.set_xticklabels([str(p.date()) for p in prices[::len(prices)//5].index]);
fig.savefig('rwalk_beta.png')

Image('rwalk_beta.png')

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, xlabel='Price GDX in \$', ylabel='Price GLD in \$',
                     title='Posterior predictive regression lines')

colors = np.linspace(0.1, 1, len(prices))
colors_sc = np.linspace(0.1, 1, len(trace_rw[-500::10]['alpha'].T))
mymap = plt.get_cmap('winter')
mymap_sc = plt.get_cmap('winter')

xi = np.linspace(prices.GDX.min(), prices.GDX.max(), 50)
# note: the loop variables alpha and beta shadow the model variables of the same name
for i, (alpha, beta) in enumerate(zip(trace_rw[-500::10]['alpha'].T,
                                      trace_rw[-500::10]['beta'].T)):
    for a, b in zip(alpha, beta):
        ax.plot(xi, a + b * xi, alpha=.05, lw=1, c=mymap_sc(colors_sc[i]))

sc = ax.scatter(prices.GDX, prices.GLD, label='data', cmap=mymap, c=colors)
cb = plt.colorbar(sc)
cb.ax.set_yticklabels([str(p.date()) for p in prices[::len(prices)//10].index]);
fig.savefig('ppc5.png')

Image('ppc5.png')
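# Added sketch: one way to summarize the rolling regression is the posterior mean
# of beta in each 50-day segment, i.e. the estimated time-varying hedge ratio of
# GLD against GDX.
beta_mean = trace_rw[-1000:]['beta'].mean(axis=0)
for i, b in enumerate(beta_mean):
    print "segment %2d (days %3d-%3d): beta = %.2f" % (i, i * 50, (i + 1) * 50 - 1, b)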