#!/usr/bin/env python
# coding: utf-8

# This notebook is an interactive version of the [companion webpage](http://edwardlib.org/iclr2017) for the article, Deep Probabilistic Programming [(Tran et al., 2017)](https://arxiv.org/abs/1701.03757). See Edward's [API](http://edwardlib.org/api/) for details on how to interact with data, models, inference, and criticism.
#
# The code snippets assume the following versions.
# ```bash
# pip install edward==1.3.1
# pip install tensorflow==1.1.0  # alternatively, tensorflow-gpu==1.1.0
# pip install keras==2.0.0
# ```

# ## Section 3. Compositional Representations for Probabilistic Models
#
# __Figure 1__. Beta-Bernoulli program.

# In[ ]:

import tensorflow as tf
from edward.models import Bernoulli, Beta

theta = Beta(1.0, 1.0)
x = Bernoulli(tf.ones(50) * theta)


# For an example of it in use, see
# [`examples/beta_bernoulli.py`](https://github.com/blei-lab/edward/blob/master/examples/beta_bernoulli.py)
# in the Github repository.
#
# __Figure 2__. Variational auto-encoder for a data set of 28 x 28 pixel images
# (Kingma & Welling, 2014; Rezende, Mohamed, & Wierstra, 2014).

# In[ ]:

import tensorflow as tf
from edward.models import Bernoulli, Normal
from keras.layers import Dense

N = 55000  # number of data points
d = 50  # latent dimension

# Probabilistic model
z = Normal(loc=tf.zeros([N, d]), scale=tf.ones([N, d]))
h = Dense(256, activation='relu')(z)
x = Bernoulli(logits=Dense(28 * 28, activation=None)(h))

# Variational model
qx = tf.placeholder(tf.float32, [N, 28 * 28])
qh = Dense(256, activation='relu')(qx)
qz = Normal(loc=Dense(d, activation=None)(qh),
            scale=Dense(d, activation='softplus')(qh))


# For an example of it in use, see
# [`examples/vae.py`](https://github.com/blei-lab/edward/blob/master/examples/vae.py)
# in the Github repository.
#
# __Figure 3__. Bayesian recurrent neural network (Radford M Neal, 2012).
# The program has an unspecified number of time steps; it uses a
# symbolic for loop (`tf.scan`).

# In[ ]:

import edward as ed
import tensorflow as tf
from edward.models import Normal

H = 50  # number of hidden units
D = 10  # number of features


def rnn_cell(hprev, xt):
  return tf.tanh(ed.dot(hprev, Wh) + ed.dot(xt, Wx) + bh)


Wh = Normal(loc=tf.zeros([H, H]), scale=tf.ones([H, H]))
Wx = Normal(loc=tf.zeros([D, H]), scale=tf.ones([D, H]))
Wy = Normal(loc=tf.zeros([H, 1]), scale=tf.ones([H, 1]))
bh = Normal(loc=tf.zeros(H), scale=tf.ones(H))
by = Normal(loc=tf.zeros(1), scale=tf.ones(1))

x = tf.placeholder(tf.float32, [None, D])
h = tf.scan(rnn_cell, x, initializer=tf.zeros(H))
y = Normal(loc=tf.matmul(h, Wy) + by, scale=1.0)


# ## Section 4. Compositional Representations for Inference
#
# __Figure 5__. Hierarchical model (Gelman & Hill, 2006).
# It is a mixture of Gaussians over
# $D$-dimensional data $\{x_n\}\in\mathbb{R}^{N\times D}$. There are
# $K$ latent cluster means $\beta\in\mathbb{R}^{K\times D}$.

# In[ ]:

import tensorflow as tf
from edward.models import Categorical, Normal

N = 10000  # number of data points
D = 2  # data dimension
K = 5  # number of clusters

beta = Normal(loc=tf.zeros([K, D]), scale=tf.ones([K, D]))
z = Categorical(logits=tf.zeros([N, K]))
x = Normal(loc=tf.gather(beta, z), scale=tf.ones([N, D]))


# It is used below in Figure 6 (left/right) and Figure * (variational EM).
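# As a quick sanity check (not part of the original figures), one joint sample
# can be drawn from this generative program and its shapes compared against the
# notation above: `beta` is $K\times D$, `z` is a length-$N$ vector of cluster
# assignments, and `x` is $N\times D$. Fetching the three value tensors in a
# single `run` call propagates one consistent sample of `beta` and `z` into `x`.

# In[ ]:

import tensorflow as tf

# Draw one joint sample from the prior program in Figure 5.
sess = tf.Session()
beta_sample, z_sample, x_sample = sess.run([beta.value(), z.value(), x.value()])
print(beta_sample.shape)  # (5, 2)
print(z_sample.shape)     # (10000,)
print(x_sample.shape)     # (10000, 2)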
#
# __Figure 6__ __(left)__. Variational inference
# (Jordan, Ghahramani, Jaakkola, & Saul, 1999).
# It performs inference on the model defined in Figure 5.

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Categorical, Normal

x_train = np.zeros([N, D])

qbeta = Normal(loc=tf.Variable(tf.zeros([K, D])),
               scale=tf.exp(tf.Variable(tf.zeros([K, D]))))
qz = Categorical(logits=tf.Variable(tf.zeros([N, K])))

inference = ed.VariationalInference({beta: qbeta, z: qz}, data={x: x_train})


# __Figure 6__ __(right)__. Monte Carlo (Robert & Casella, 1999).
# It performs inference on the model defined in Figure 5.

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Empirical

x_train = np.zeros([N, D])

T = 10000  # number of samples

qbeta = Empirical(params=tf.Variable(tf.zeros([T, K, D])))
qz = Empirical(params=tf.Variable(tf.zeros([T, N])))

inference = ed.MonteCarlo({beta: qbeta, z: qz}, data={x: x_train})


# __Figure 7__. Generative adversarial network
# (Goodfellow et al., 2014).

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Normal
from keras.layers import Dense

N = 55000  # number of data points
d = 50  # latent dimension


def generative_network(eps):
  h = Dense(256, activation='relu')(eps)
  return Dense(28 * 28, activation=None)(h)


def discriminative_network(x):
  h = Dense(28 * 28, activation='relu')(x)
  return Dense(1, activation=None)(h)


# DATA
x_train = np.zeros([N, 28 * 28])

# MODEL
eps = Normal(loc=tf.zeros([N, d]), scale=tf.ones([N, d]))
x = generative_network(eps)

# INFERENCE
inference = ed.GANInference(data={x: x_train},
                            discriminator=discriminative_network)


# For an example of it in use, see the
# [generative adversarial networks](http://edwardlib.org/tutorials/gan) tutorial.
#
# __Figure *__. Variational EM (Radford M. Neal & Hinton, 1993).
# It performs inference on the model defined in Figure 5.

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Categorical, PointMass

# DATA
x_train = np.zeros([N, D])

# INFERENCE
qbeta = PointMass(params=tf.Variable(tf.zeros([K, D])))
qz = Categorical(logits=tf.Variable(tf.zeros([N, K])))

inference_e = ed.VariationalInference({z: qz}, data={x: x_train, beta: qbeta})
inference_m = ed.MAP({beta: qbeta}, data={x: x_train, z: qz})

inference_e.initialize()
inference_m.initialize()

tf.global_variables_initializer().run()

for _ in range(10000):
  inference_e.update()
  inference_m.update()


# For more details, see the
# [inference compositionality](http://edwardlib.org/api/inference-compositionality) webpage.
# See
# [`examples/factor_analysis.py`](https://github.com/blei-lab/edward/blob/master/examples/factor_analysis.py)
# in the Github repository for a version that performs Monte Carlo EM for
# logistic factor analysis; it leverages Hamiltonian Monte Carlo in the E-step
# to perform maximum marginal a posteriori estimation.
#
# __Figure *__. Data subsampling.
# In[ ]:

import edward as ed
import tensorflow as tf
from edward.models import Categorical, Normal

N = 10000  # number of data points
M = 128  # batch size during training
D = 2  # data dimension
K = 5  # number of clusters

# DATA
x_batch = tf.placeholder(tf.float32, [M, D])

# MODEL
beta = Normal(loc=tf.zeros([K, D]), scale=tf.ones([K, D]))
z = Categorical(logits=tf.zeros([M, K]))
x = Normal(loc=tf.gather(beta, z), scale=tf.ones([M, D]))

# INFERENCE
qbeta = Normal(loc=tf.Variable(tf.zeros([K, D])),
               scale=tf.nn.softplus(tf.Variable(tf.zeros([K, D]))))
qz = Categorical(logits=tf.Variable(tf.zeros([M, K])))

inference = ed.VariationalInference({beta: qbeta, z: qz}, data={x: x_batch})
inference.initialize(scale={x: float(N) / M, z: float(N) / M})


# For more details, see the
# [data subsampling](http://edwardlib.org/api/inference-data-subsampling) webpage.

# ## Section 5. Experiments
#
# __Figure 9__. Bayesian logistic regression with Hamiltonian Monte Carlo.

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Bernoulli, Empirical, Normal

N = 581012  # number of data points
D = 54  # number of features
T = 100  # number of empirical samples

# DATA
x_data = np.zeros([N, D])
y_data = np.zeros([N])

# MODEL
x = tf.Variable(x_data, trainable=False)
beta = Normal(loc=tf.zeros(D), scale=tf.ones(D))
y = Bernoulli(logits=ed.dot(x, beta))

# INFERENCE
qbeta = Empirical(params=tf.Variable(tf.zeros([T, D])))
inference = ed.HMC({beta: qbeta}, data={y: y_data})
inference.run(step_size=0.5 / N, n_steps=10)


# For an example of it in use, see
# [`examples/bayesian_logistic_regression.py`](https://github.com/blei-lab/edward/blob/master/examples/bayesian_logistic_regression.py)
# in the Github repository.

# ## Appendix A. Model Examples
#
# __Figure 10__. Bayesian neural network for classification (Denker, Schwartz, Wittner, & Solla, 1987).

# In[ ]:

import tensorflow as tf
from edward.models import Bernoulli, Normal

N = 1000  # number of data points
D = 528  # number of features
H = 256  # hidden layer size

W_0 = Normal(loc=tf.zeros([D, H]), scale=tf.ones([D, H]))
W_1 = Normal(loc=tf.zeros([H, 1]), scale=tf.ones([H, 1]))
b_0 = Normal(loc=tf.zeros(H), scale=tf.ones(H))
b_1 = Normal(loc=tf.zeros(1), scale=tf.ones(1))

x = tf.placeholder(tf.float32, [N, D])
y = Bernoulli(logits=tf.matmul(tf.nn.tanh(tf.matmul(x, W_0) + b_0), W_1) + b_1)


# For an example of it in use, see
# [`examples/getting_started_example.py`](https://github.com/blei-lab/edward/blob/master/examples/getting_started_example.py)
# in the Github repository.
#
# __Figure 11__. Latent Dirichlet allocation (D. M. Blei, Ng, & Jordan, 2003).

# In[ ]:

import tensorflow as tf
from edward.models import Categorical, Dirichlet

D = 4  # number of documents
N = [11502, 213, 1523, 1351]  # words per doc
K = 10  # number of topics
V = 100000  # vocabulary size

theta = Dirichlet(tf.zeros([D, K]) + 0.1)
phi = Dirichlet(tf.zeros([K, V]) + 0.05)
z = [[0] * N[d] for d in range(D)]  # per-document lists of word-level variables
w = [[0] * N[d] for d in range(D)]
for d in range(D):
  for n in range(N[d]):
    z[d][n] = Categorical(probs=theta[d, :])
    w[d][n] = Categorical(probs=phi[z[d][n], :])


# __Figure 12__. Gaussian matrix factorization
# (Salakhutdinov & Mnih, 2011).

# In[ ]:

import tensorflow as tf
from edward.models import Normal

N = 10
M = 10
K = 5  # latent dimension

U = Normal(loc=tf.zeros([M, K]), scale=tf.ones([M, K]))
V = Normal(loc=tf.zeros([N, K]), scale=tf.ones([N, K]))
Y = Normal(loc=tf.matmul(U, V, transpose_b=True), scale=tf.ones([N, M]))
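# The model examples in this appendix compose with the inference examples from
# Section 4. As a hedged illustration (not part of the original figures), the
# matrix factorization above could be fit with `ed.KLqp`, using mean-field
# normal approximations for `U` and `V`; the observed matrix `Y_train` below is
# a synthetic stand-in.

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Normal

# Synthetic stand-in data with the same shape as Y.
Y_train = np.random.randn(N, M).astype(np.float32)

# Mean-field normal approximations over the latent factors U and V.
qU = Normal(loc=tf.Variable(tf.zeros([M, K])),
            scale=tf.nn.softplus(tf.Variable(tf.zeros([M, K]))))
qV = Normal(loc=tf.Variable(tf.zeros([N, K])),
            scale=tf.nn.softplus(tf.Variable(tf.zeros([N, K]))))

inference = ed.KLqp({U: qU, V: qV}, data={Y: Y_train})
inference.run(n_iter=500)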
#
# __Figure 13__. Dirichlet process mixture model (Antoniak, 1974).

# In[ ]:

import tensorflow as tf
from edward.models import DirichletProcess, Normal

N = 1000  # number of data points
D = 5  # data dimensionality

dp = DirichletProcess(alpha=1.0, base=Normal(loc=tf.zeros(D), scale=tf.ones(D)))
mu = dp.sample(N)
x = Normal(loc=mu, scale=tf.ones([N, D]))


# To see the essential component defining the `DirichletProcess`, see
# [`examples/pp_dirichlet_process.py`](https://github.com/blei-lab/edward/blob/master/examples/pp_dirichlet_process.py)
# in the Github repository. Its source implementation can be found at
# [`edward/models/dirichlet_process.py`](https://github.com/blei-lab/edward/blob/master/edward/models/dirichlet_process.py).

# ## Appendix B. Inference Examples
#
# __Figure *__. Stochastic variational inference (M. D. Hoffman, Blei, Wang, & Paisley, 2013).
# For more details, see the
# [data subsampling](http://edwardlib.org/api/inference-data-subsampling) webpage.
# A minimal sketch of the subsampling training loop is included at the end of
# this notebook.

# ## Appendix C. Complete Examples
#
# __Figure 15__. Variational auto-encoder
# (Kingma & Welling, 2014; Rezende et al., 2014).
# See the script
# [`examples/vae.py`](https://github.com/blei-lab/edward/blob/master/examples/vae.py)
# in the Github repository.
#
# __Figure 16__. Exponential family embedding (Rudolph, Ruiz, Mandt, & Blei, 2016).
# A Github repository with comprehensive features is available at
# [mariru/exponential_family_embeddings](https://github.com/mariru/exponential_family_embeddings).
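# The stochastic variational inference figure in Appendix B is only referenced
# by pointer; the following is a hedged, self-contained sketch (not from the
# original page) of how the data subsampling pattern from Section 4 becomes a
# full training loop by feeding a random minibatch at each update. `x_full` is
# synthetic stand-in data, and `ed.KLqp` stands in for the abstract
# `ed.VariationalInference` class used in the figures.

# In[ ]:

import edward as ed
import numpy as np
import tensorflow as tf
from edward.models import Categorical, Normal

N = 10000  # total number of data points
M = 128  # minibatch size
D = 2  # data dimension
K = 5  # number of clusters

# DATA (synthetic stand-in for an observed data set)
x_full = np.random.randn(N, D).astype(np.float32)

# MODEL over a minibatch, as in the data subsampling figure
x_ph = tf.placeholder(tf.float32, [M, D])
beta = Normal(loc=tf.zeros([K, D]), scale=tf.ones([K, D]))
z = Categorical(logits=tf.zeros([M, K]))
x = Normal(loc=tf.gather(beta, z), scale=tf.ones([M, D]))

# INFERENCE, with each minibatch term scaled by N / M
qbeta = Normal(loc=tf.Variable(tf.zeros([K, D])),
               scale=tf.nn.softplus(tf.Variable(tf.zeros([K, D]))))
qz = Categorical(logits=tf.Variable(tf.zeros([M, K])))

inference = ed.KLqp({beta: qbeta, z: qz}, data={x: x_ph})
inference.initialize(scale={x: float(N) / M, z: float(N) / M})

tf.global_variables_initializer().run()
for _ in range(1000):
  idx = np.random.choice(N, M, replace=False)  # draw a random minibatch
  inference.update(feed_dict={x_ph: x_full[idx]})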