#!/usr/bin/env python
# coding: utf-8

# # Unsupervised Learning
#
# In unsupervised learning, the task is to infer hidden structure from
# unlabeled data, consisting of training examples $\{x_n\}$.
#
# We demonstrate with an example in Edward. A webpage version is available at
# http://edwardlib.org/tutorials/unsupervised.

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import six
import tensorflow as tf

from edward.models import (
    Categorical, Dirichlet, Empirical, InverseGamma, Mixture,
    MultivariateNormalDiag, Normal, ParamMixture)

plt.style.use('ggplot')


# ## Data
#
# Use a simulated data set of 2-dimensional data points
# $\mathbf{x}_n\in\mathbb{R}^2$.

# In[2]:


def build_toy_dataset(N):
  pi = np.array([0.4, 0.6])
  mus = [[1, 1], [-1, -1]]
  stds = [[0.1, 0.1], [0.1, 0.1]]
  x = np.zeros((N, 2), dtype=np.float32)
  for n in range(N):
    k = np.argmax(np.random.multinomial(1, pi))
    # np.diag(stds[k]) is passed as the covariance matrix.
    x[n, :] = np.random.multivariate_normal(mus[k], np.diag(stds[k]))

  return x


N = 500  # number of data points
K = 2  # number of components
D = 2  # dimensionality of data
ed.set_seed(42)

x_train = build_toy_dataset(N)


# We visualize the generated data points.

# In[3]:


plt.scatter(x_train[:, 0], x_train[:, 1])
plt.axis([-3, 3, -3, 3])
plt.title("Simulated dataset")
plt.show()


# ## Model
#
# A mixture model is typically used for clustering.
# It assigns a mixture component to each data point, and this mixture component
# determines the distribution that the data point is generated from. A
# mixture of Gaussians uses Gaussian distributions to generate this data
# (Bishop, 2006).
#
# For a set of $N$ data points,
# the likelihood of each observation $\mathbf{x}_n$ is
#
# \begin{align*}
#   p(\mathbf{x}_n \mid \pi, \mu, \sigma)
#   &=
#   \sum_{k=1}^K \pi_k \, \text{Normal}(\mathbf{x}_n \mid \mu_k, \sigma_k).
# \end{align*}
#
# The latent variable $\pi$ is a $K$-dimensional probability vector
# which mixes individual Gaussian distributions, each
# characterized by a mean $\mu_k$ and standard deviation $\sigma_k$.
#
# Define the prior on $\pi\in[0,1]^K$, where $\sum_{k=1}^K\pi_k=1$, to be
#
# \begin{align*}
#   p(\pi)
#   &=
#   \text{Dirichlet}(\pi \mid \alpha \mathbf{1}_{K})
# \end{align*}
#
# for fixed $\alpha=1$. Define the prior on each component mean
# $\mathbf{\mu}_k\in\mathbb{R}^D$ to be
#
# \begin{align*}
#   p(\mathbf{\mu}_k)
#   &=
#   \text{Normal}(\mathbf{\mu}_k \mid \mathbf{0}, \mathbf{I}).
# \end{align*}
#
# Define the prior on each component variance $\mathbf{\sigma}_k^2\in\mathbb{R}^D$ to be
#
# \begin{align*}
#   p(\mathbf{\sigma}_k^2)
#   &=
#   \text{InverseGamma}(\mathbf{\sigma}_k^2 \mid a, b).
# \end{align*}
#
# We build two versions of the model in Edward: one jointly with the
# mixture assignments $z_n\in\{0,\ldots,K-1\}$ as latent variables,
# and another with them summed out.
#
# The joint version includes an explicit latent variable for the mixture
# assignments. We implement this with the `ParamMixture` random
# variable; it takes as input the mixing probabilities, the components'
# parameters, and the distribution of the components. It is the
# distribution of the mixture conditional on mixture assignments. (Note
# we can also write this separately by first building a `Categorical`
# random variable for `z` and then building `x`; `ParamMixture` avoids
# the need for `tf.gather`, which makes it slightly more efficient. A
# sketch of that alternative appears after the next cell.)

# In[4]:


pi = Dirichlet(tf.ones(K))
mu = Normal(tf.zeros(D), tf.ones(D), sample_shape=K)
sigmasq = InverseGamma(tf.ones(D), tf.ones(D), sample_shape=K)
x = ParamMixture(pi, {'loc': mu, 'scale_diag': tf.sqrt(sigmasq)},
                 MultivariateNormalDiag,
                 sample_shape=N)
z = x.cat
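
# For reference, here is a minimal sketch of that alternative construction:
# it draws the assignments explicitly with a `Categorical` random variable
# and indexes the component parameters with `tf.gather`. The names `z_alt`
# and `x_alt` are illustrative only, and the cell is left unexecuted so the
# joint version above remains the model used in this analysis.

# In[ ]:


"""
z_alt = Categorical(probs=pi, sample_shape=N)
x_alt = MultivariateNormalDiag(loc=tf.gather(mu, z_alt),
                               scale_diag=tf.gather(tf.sqrt(sigmasq), z_alt))
"""
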
# The collapsed version marginalizes out the mixture assignments. We
# implement this with the `Mixture` random variable; it takes as
# input a Categorical distribution and a list of individual distribution
# components. It is the distribution of the mixture summing out the
# mixture assignments. Note that Gibbs sampling does not work with
# `Mixture` random variables, so the collapsed version requires an
# alternative inference algorithm.

# In[5]:


"""
pi = Dirichlet(tf.ones(K))
mu = Normal(tf.zeros(D), tf.ones(D), sample_shape=K)
sigmasq = InverseGamma(tf.ones(D), tf.ones(D), sample_shape=K)
cat = Categorical(probs=pi, sample_shape=N)
components = [
    MultivariateNormalDiag(mu[k], tf.sqrt(sigmasq[k]), sample_shape=N)
    for k in range(K)]
x = Mixture(cat=cat, components=components)
"""


# We will use the joint version in this analysis.

# ## Inference
#
# Each distribution in the model is written with conjugate priors, so we
# can use Gibbs sampling. It performs Markov chain Monte Carlo by
# iterating over draws from the complete conditional of each latent
# variable, i.e., its distribution conditional on the most recently drawn
# values of all the other variables.
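#
# For example, by Dirichlet-categorical conjugacy, the complete
# conditional of the mixing vector given the assignments
# $\mathbf{z}=(z_1,\ldots,z_N)$ is another Dirichlet,
#
# \begin{align*}
#   p(\pi \mid \mathbf{z})
#   &=
#   \text{Dirichlet}\big(\pi \mid \alpha + n_1, \ldots, \alpha + n_K\big),
#   \qquad
#   n_k = \sum_{n=1}^N \mathbb{1}[z_n = k],
# \end{align*}
#
# and the complete conditional of each assignment $z_n$ is categorical with
# probabilities proportional to
# $\pi_k \, \text{Normal}(\mathbf{x}_n \mid \mu_k, \sigma_k)$.
#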
# First we set up Empirical random variables, which will approximate the
# posteriors using the collection of samples.

# In[6]:


T = 500  # number of MCMC samples
qpi = Empirical(tf.get_variable(
    "qpi/params", [T, K],
    initializer=tf.constant_initializer(1.0 / K)))
qmu = Empirical(tf.get_variable(
    "qmu/params", [T, K, D],
    initializer=tf.zeros_initializer()))
qsigmasq = Empirical(tf.get_variable(
    "qsigmasq/params", [T, K, D],
    initializer=tf.ones_initializer()))
qz = Empirical(tf.get_variable(
    "qz/params", [T, N],
    initializer=tf.zeros_initializer(),
    dtype=tf.int32))


# Run Gibbs sampling. We write the training loop explicitly so that we can
# track the cluster means as the sampler progresses.

# In[7]:


inference = ed.Gibbs({pi: qpi, mu: qmu, sigmasq: qsigmasq, z: qz},
                     data={x: x_train})
inference.initialize()

sess = ed.get_session()
tf.global_variables_initializer().run()

t_ph = tf.placeholder(tf.int32, [])
running_cluster_means = tf.reduce_mean(qmu.params[:t_ph], 0)

for _ in range(inference.n_iter):
  info_dict = inference.update()
  inference.print_progress(info_dict)
  t = info_dict['t']
  if t % inference.n_print == 0:
    print("\nInferred cluster means:")
    print(sess.run(running_cluster_means, {t_ph: t - 1}))


# ## Criticism
#
# We visualize the predicted memberships of each data point. We pick
# the cluster assignment which produces the highest posterior predictive
# density for each data point.
#
# To do this, we first draw samples from the posterior and calculate an
# $N\times K$ matrix of log-likelihoods, one entry for each data point
# $\mathbf{x}_n$ and cluster assignment $k$, averaged over 100 posterior
# samples.

# In[8]:


# Calculate likelihood for each data point and cluster assignment,
# averaged over many posterior samples. ``x_post`` has shape (N, 100, K, D).
mu_sample = qmu.sample(100)
sigmasq_sample = qsigmasq.sample(100)
x_post = Normal(loc=tf.ones([N, 1, 1, 1]) * mu_sample,
                scale=tf.ones([N, 1, 1, 1]) * tf.sqrt(sigmasq_sample))
x_broadcasted = tf.tile(tf.reshape(x_train, [N, 1, 1, D]), [1, 100, K, 1])

# Sum the per-dimension log-densities over the data dimension D, then
# average over posterior samples. ``log_liks`` ends up with shape (N, K).
log_liks = x_post.log_prob(x_broadcasted)
log_liks = tf.reduce_sum(log_liks, 3)
log_liks = tf.reduce_mean(log_liks, 1)


# We then take the $\arg\max$ along the columns (cluster assignments).

# In[9]:


# Choose the cluster with the highest likelihood for each data point.
clusters = tf.argmax(log_liks, 1).eval()


# Plot the data points, colored by their predicted membership.

# In[10]:


plt.scatter(x_train[:, 0], x_train[:, 1], c=clusters, cmap=cm.bwr)
plt.axis([-3, 3, -3, 3])
plt.title("Predicted cluster assignments")
plt.show()


# The model has correctly clustered the data.

# ## Remarks: The log-sum-exp trick
#
# For a collapsed mixture model, implementing the log density can be tricky.
# In general, the log density is
#
# \begin{align*}
#   \log p(\pi) +
#   \Big[ \sum_{k=1}^K \log p(\mathbf{\mu}_k) + \log
#   p(\mathbf{\sigma}_k^2) \Big] +
#   \sum_{n=1}^N \log p(\mathbf{x}_n \mid \pi, \mu, \sigma),
# \end{align*}
#
# where the likelihood is
#
# \begin{align*}
#   \sum_{n=1}^N \log p(\mathbf{x}_n \mid \pi, \mu, \sigma)
#   &=
#   \sum_{n=1}^N \log \sum_{k=1}^K \pi_k \, \text{Normal}(\mathbf{x}_n \mid
#   \mu_k, \sigma_k).
# \end{align*}
#
# To prevent numerical instability, we'd like to work on the log-scale,
#
# \begin{align*}
#   \sum_{n=1}^N \log p(\mathbf{x}_n \mid \pi, \mu, \sigma)
#   &=
#   \sum_{n=1}^N \log \sum_{k=1}^K \exp\Big(
#   \log \pi_k + \log \text{Normal}(\mathbf{x}_n \mid \mu_k, \sigma_k)\Big).
# \end{align*}
#
# This expression involves a log-sum-exp operation, which is numerically
# unstable when computed naively: exponentiating log values of large
# magnitude can overflow or underflow in floating point. Therefore we use
# the log-sum-exp trick. It is based on the identity
#
# \begin{align*}
#   \mathbf{x}_{\mathrm{max}}
#   &=
#   \max_i \mathbf{x}_i,
#   \\
#   \log \sum_i \exp(\mathbf{x}_i)
#   &=
#   \log \Big(\exp(\mathbf{x}_{\mathrm{max}}) \sum_i \exp(\mathbf{x}_i -
#   \mathbf{x}_{\mathrm{max}})\Big)
#   \\
#   &=
#   \mathbf{x}_{\mathrm{max}} + \log \sum_i \exp(\mathbf{x}_i -
#   \mathbf{x}_{\mathrm{max}}).
# \end{align*}
#
# Subtracting the maximum value before exponentiating keeps the largest
# term at $\exp(0)=1$, which leads to numerically stable output. The
# `Mixture` random variable implements this trick when calculating its
# log-density.
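#
# For example, here is a minimal NumPy sketch of the trick (the function
# name `log_sum_exp` and the toy values below are illustrative only;
# TensorFlow provides the same computation as `tf.reduce_logsumexp`).

# In[ ]:


def log_sum_exp(v):
  """Numerically stable log(sum(exp(v))) for a 1-D array of log values."""
  v_max = np.max(v)
  return v_max + np.log(np.sum(np.exp(v - v_max)))


# With very negative log-scale terms, such as log pi_k + log Normal(x_n | .)
# for a far-away data point, naive exponentiation underflows to zero:
log_terms = np.array([-1000.0, -1001.0, -1002.0])
# np.log(np.sum(np.exp(log_terms)))  # -inf due to underflow
# log_sum_exp(log_terms)             # approximately -999.59, the true value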