In [1]:

!date
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('paper')
sns.set_style('darkgrid')

Sun Jun  7 15:42:45 PDT 2015

Mixture Model in PyMC3¶

In [2]:

import pymc3 as pm, theano.tensor as tt

In [3]:

# simulate data from a known mixture distribution
np.random.seed(12345) # set random seed for reproducibility

k = 3
ndata = 500
spread = 3
centers = np.array([-spread, 0, spread])

# simulate data from mixture distribution
v = np.random.randint(0, k, ndata)
data = centers[v] + np.random.randn(ndata)

plt.hist(data)

Out[3]:

(array([ 33.,  61.,  53.,  54.,  90.,  54.,  53.,  72.,  28.,   2.]),
 array([-5.04042311, -3.9210433 , -2.80166348, -1.68228367, -0.56290385,
         0.55647596,  1.67585577,  2.79523559,  3.9146154 ,  5.03399522,
         6.15337503]),
 <a list of 10 Patch objects>)

In [4]:

# setup model
model = pm.Model()
with model:
    # cluster sizes
    a = pm.constant(np.array([1., 1., 1.]))
    p = pm.Dirichlet('p', a=a, shape=k)
    # ensure all clusters have some points
    p_min_potential = pm.Potential('p_min_potential', tt.switch(tt.min(p) < .1, -np.inf, 0))


    # cluster centers
    means = pm.Normal('means', mu=[0, 0, 0], sd=15, shape=k)
    # break symmetry
    order_means_potential = pm.Potential('order_means_potential',
                                         tt.switch(means[1]-means[0] < 0, -np.inf, 0)
                                         + tt.switch(means[2]-means[1] < 0, -np.inf, 0))
    
    # measurement error
    sd = pm.Uniform('sd', lower=0, upper=20)

    # latent cluster of each observation
    category = pm.Categorical('category',
                           p=p,
                           shape=ndata)

    # likelihood for each observed value
    points = pm.Normal('obs',
                     mu=means[category],
                     sd=sd,
                     observed=data)

Cannot compute test value: input 0 (<TensorType(float32, matrix)>) of Op Dot22(<TensorType(float32, matrix)>, <TensorType(float32, matrix)>) missing default value

In [5]:

# make some step-methods particularly suited to the category stochastic

class RandomScanDiscreteMetropolis(pm.step_methods.arraystep.ArrayStep):
    def __init__(self, var, model=None, values=[0,1]):
        model = pm.modelcontext(model)
        self.values = values
        super(RandomScanDiscreteMetropolis, self).__init__([var], [model.fastlogp])
        
    def astep(self, q0, logp):
        i = np.random.choice(len(q0))

        q = np.copy(q0)
        q[i] = np.random.choice(self.values)
        
        q_new = pm.step_methods.arraystep.metrop_select(logp(q) - logp(q0), q, q0)
        
        return q_new


class SequentialScanDiscreteMetropolis(pm.step_methods.arraystep.ArrayStep):
    def __init__(self, var, model=None, values=[0,1]):
        model = pm.modelcontext(model)
        self.values = values
        self.i = 0
        super(SequentialScanDiscreteMetropolis, self).__init__([var], [model.fastlogp])
        
    def astep(self, q0, logp):

        q = np.copy(q0)
        q[self.i] = np.random.choice(self.values)
        self.i = (self.i + 1) % len(q)
        
        q_new = pm.step_methods.arraystep.metrop_select(logp(q) - logp(q0), q, q0)
        
        return q_new

In [6]:

# fit model
with model:
    step1 = pm.Metropolis(vars=[p])
    step2 = pm.Metropolis(vars=[sd, means])
    step3 = SequentialScanDiscreteMetropolis(var=category, values=[0,1,2])
    tr = pm.sample(10000, step=[step1, step2] + [step3]*ndata)

 [-----------------100%-----------------] 10000 of 10000 complete in 1327.3 sec

In [7]:

# take a look at traceplot for some model parameters
# (with some burn-in and thinning)
pm.plots.traceplot(tr[5000::5], ['p', 'sd', 'means']);

In [8]:

# I prefer autocorrelation plots for serious confirmation of MCMC convergence
pm.autocorrplot(tr[5000::5], ['sd'])

Out[8]:

(<matplotlib.figure.Figure at 0x7f897a686150>,
 array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f8961e70250>]], dtype=object))

In [9]:

# this currently does not work for vector-valued stochastics :(
# pm.autocorrplot(tr[5000::5], ['sd'])

In [14]:

# the trace of the categorical stochastics is hard to see on the traceplot
# but this is what a single coordinate looks like
i=0
plt.plot(tr['category'][5000::5,i], drawstyle='steps-mid')
plt.axis(ymin=-.1, ymax=2.1)

Out[14]:

(0.0, 1000.0, -0.1, 2.1)

In [24]:

def cluster_posterior(i=0):
    print 'true cluster:', v[i]
    print '  data value:', np.round(data[i],2)
    plt.hist(tr['category'][5000::5,i], bins=[-.5,.5,1.5,2.5,], rwidth=.9)
    plt.axis(xmin=-.5, xmax=2.5)
    plt.xticks([0,1,2])
cluster_posterior(i)

true cluster: 2
  data value: 1.29

In [25]:

from IPython.html import widgets
widgets.interact(cluster_posterior, i=[0,ndata])

true cluster: 2
  data value: 1.48

In [17]:

In [ ]: