Autoencoders Illustrated on MNIST

Herman Kamper, Stellenbosch University, 2018.

Overview

Different types of autoencoders are applied to MNIST. The aim of this notebook is to give succinct implementations of several different autoencoder models and to visualise the intermediate representations they learn. The models here are not tuned; the only purpose is to illustrate them.

The notebook was developed and tested using Python 2.7 and TensorFlow 1.3. The code relies on utility functions in the separate training.py and plotting.py modules.

The following models are considered:

- Autoencoder (AE)
- Variational autoencoder (VAE)

Preamble

In [1]:
%matplotlib inline
from __future__ import division
from __future__ import print_function
In [2]:
from os import path
from sklearn import manifold, preprocessing
import matplotlib.pyplot as plt
import numpy as np
import PIL.Image as Image
import sys
import tensorflow as tf
In [3]:
from training import train_fixed_epochs
import plotting
In [4]:
output_dir = "/tmp/data-kamperh/"

Data

In [5]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets(output_dir, one_hot=True)
Extracting /tmp/data-kamperh/train-images-idx3-ubyte.gz
Extracting /tmp/data-kamperh/train-labels-idx1-ubyte.gz
Extracting /tmp/data-kamperh/t10k-images-idx3-ubyte.gz
Extracting /tmp/data-kamperh/t10k-labels-idx1-ubyte.gz
In [6]:
plt.imshow(np.reshape(mnist.train.images[1, :], (28, 28)), cmap="gray")
plt.axis("off");

Utility functions

In [7]:
# Neural network functions

TF_DTYPE = tf.float32

def build_linear(x, n_output):
    n_input = x.get_shape().as_list()[-1]
    W = tf.get_variable(
        "W", [n_input, n_output], dtype=TF_DTYPE,
        initializer=tf.contrib.layers.xavier_initializer()
        )
    b = tf.get_variable(
        "b", [n_output], dtype=TF_DTYPE,
        initializer=tf.random_normal_initializer()
        )
    return tf.matmul(x, W) + b
In [8]:
def plot_labelled_2d_data(X, labels):
    classes = set(labels)
    for label in sorted(classes):
        indices = np.where(np.array(labels) == label)[0]
        plt.scatter(X[indices, 0], X[indices, 1], label=label)

Autoencoder

$\renewcommand{\vec}[1]{\boldsymbol{\mathbf{#1}}}$A standard autoencoder is trained with the reconstruction loss:

$$\ell(\vec{x}^{(i)}; \vec{\theta}) = \left|\left|\vec{f}(\vec{x}^{(i)}) - \vec{x}^{(i)}\right|\right|^2$$

where $\vec{f}(\vec{x}^{(i)})$ is the output of the neural network for the single training example $\vec{x}^{(i)}$.
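As a small illustration (a standalone NumPy sketch, separate from the TensorFlow model below), the loss for a single hypothetical example is just the squared Euclidean distance between the input and its reconstruction:

from __future__ import print_function
import numpy as np

np.random.seed(0)

# Hypothetical single training example and its reconstruction, both flattened
# 28x28 images (in the notebook these would come from MNIST and the network)
x_example = np.random.rand(28*28)
f_x = np.random.rand(28*28)

# Per-example reconstruction loss: squared Euclidean distance
loss_example = np.sum((f_x - x_example)**2)
print(loss_example)

Note that the TensorFlow code below uses tf.losses.mean_squared_error, which averages rather than sums over the pixels and over the batch; this only rescales the objective.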

In [9]:
def build_autoencoder(x, enc_n_hiddens, n_z, dec_n_hiddens,
        activation=tf.nn.relu):
    """
    Build an autoencoder with the number of encoder and decoder units.
    
    The number of encoder and decoder units are given as lists. The middle
    (encoding/latent) layer has dimensionality `n_z`. This layer and the final
    layer are linear.
    """

    # Encoder
    for i_layer, n_hidden in enumerate(enc_n_hiddens):
        with tf.variable_scope("ae_enc_{}".format(i_layer)):
            x = build_linear(x, n_hidden)
            x = activation(x)

    # Latent variable
    with tf.variable_scope("ae_latent"):
        x = build_linear(x, n_z)
        z = x

    # Decoder
    for i_layer, n_hidden in enumerate(dec_n_hiddens):
        with tf.variable_scope("ae_dec_{}".format(i_layer)):
            x = build_linear(x, n_hidden)
            if i_layer != len(dec_n_hiddens) - 1:
                x = activation(x)
    y = x
    
    return {"z": z, "y": y}
In [10]:
tf.reset_default_graph()

# Training parameters
learning_rate = 0.001
n_epochs = 25
batch_size = 100
enc_n_hiddens = [500, 500]
n_z = 20
dec_n_hiddens = [500, 500, 28*28]

# Model
x = tf.placeholder(TF_DTYPE, [None, 28*28])
ae = build_autoencoder(x, enc_n_hiddens, n_z, dec_n_hiddens)
z = ae["z"]
y = ae["y"]
model_fn = path.join(output_dir, "ae.ckpt")

# Standard reconstruction loss
loss = tf.losses.mean_squared_error(x, y)

# Log loss
# y = tf.sigmoid(y)
# loss = tf.reduce_mean(
#     -tf.reduce_sum(x*tf.log(1e-10 + y) + (1 - x)*tf.log(1e-10 + 1 - y), 1)
#     )

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
In [11]:
# Train model

class MNISTTrainFeedIterator(object):
    def __iter__(self):
        n_batches = int(mnist.train.num_examples/batch_size)
        for i_batch in xrange(n_batches):
            batch_x, batch_y = mnist.train.next_batch(batch_size)
            yield [batch_x]


class MNISTTestFeedIterator(object):
    def __iter__(self):
        yield [mnist.validation.images]


record_dict = train_fixed_epochs(
    n_epochs, optimizer, loss, MNISTTrainFeedIterator(),
    [x], loss, MNISTTestFeedIterator(),
    save_model_fn=model_fn
    )
2018-06-05 17:43:29.254033
Epoch 0:
0.942 sec, train loss: 0.0461435, val loss: 0.0238925
Epoch 1:
0.782 sec, train loss: 0.0221112, val loss: 0.0203215
Epoch 2:
0.818 sec, train loss: 0.0195515, val loss: 0.0185138
Epoch 3:
0.830 sec, train loss: 0.0179799, val loss: 0.0171347
Epoch 4:
0.776 sec, train loss: 0.0168287, val loss: 0.0162615
Epoch 5:
0.789 sec, train loss: 0.0159381, val loss: 0.015446
Epoch 6:
0.774 sec, train loss: 0.0152335, val loss: 0.0147576
Epoch 7:
0.777 sec, train loss: 0.0146781, val loss: 0.0143978
Epoch 8:
0.785 sec, train loss: 0.0141636, val loss: 0.0137209
Epoch 9:
0.779 sec, train loss: 0.0137735, val loss: 0.0135287
Epoch 10:
0.845 sec, train loss: 0.0134107, val loss: 0.0131588
Epoch 11:
0.781 sec, train loss: 0.0131231, val loss: 0.0127732
Epoch 12:
0.777 sec, train loss: 0.0128385, val loss: 0.0126966
Epoch 13:
0.777 sec, train loss: 0.0125904, val loss: 0.0124435
Epoch 14:
0.790 sec, train loss: 0.0123627, val loss: 0.0121227
Epoch 15:
0.787 sec, train loss: 0.0121528, val loss: 0.0120981
Epoch 16:
0.768 sec, train loss: 0.0119713, val loss: 0.0118165
Epoch 17:
0.775 sec, train loss: 0.0118037, val loss: 0.011724
Epoch 18:
0.809 sec, train loss: 0.0116383, val loss: 0.0114484
Epoch 19:
0.777 sec, train loss: 0.0115092, val loss: 0.0114377
Epoch 20:
0.790 sec, train loss: 0.0113696, val loss: 0.0112739
Epoch 21:
0.787 sec, train loss: 0.0112492, val loss: 0.011138
Epoch 22:
0.778 sec, train loss: 0.0111494, val loss: 0.0110619
Epoch 23:
0.797 sec, train loss: 0.0110464, val loss: 0.0110747
Epoch 24:
0.780 sec, train loss: 0.0109506, val loss: 0.0110263
Writing: /tmp/data-kamperh/ae.ckpt
Training time: 0.331 min
2018-06-05 17:43:49.525392
In [12]:
# Plot weights from first layer

# Obtain weight matrix
with tf.variable_scope("ae_enc_0", reuse=True):
    W_0 = tf.get_variable("W", dtype=TF_DTYPE)
saver = tf.train.Saver()
with tf.Session() as session:
    saver.restore(session, model_fn)
    W_0 = W_0.eval()

# Plot weights
W_0 = np.transpose(W_0)
image = Image.fromarray(
    plotting.tile_images(W_0, image_shape=(28, 28), tile_shape=(11, 11))
    )
plt.figure(figsize=(10, 10))
plt.imshow(image, cmap="gray");
INFO:tensorflow:Restoring parameters from /tmp/data-kamperh/ae.ckpt
In [13]:
# Embed test data
n_test = 1000
x_np = mnist.test.images[:n_test, :]
test_labels = mnist.test.labels[:n_test]
test_labels = np.argmax(test_labels, axis=1)  # convert from one-hot
saver = tf.train.Saver()
with tf.Session() as session:
    saver.restore(session, model_fn)
    z_np = session.run([z], feed_dict={x: x_np})[0]
    y_np = session.run([y], feed_dict={x: x_np})[0]
INFO:tensorflow:Restoring parameters from /tmp/data-kamperh/ae.ckpt
In [14]:
# Plot reconstruction
plt.figure(figsize=(8, 12))
for i in range(5):
    plt.subplot(5, 2, 2*i + 1)
    plt.title("Input")
    plt.imshow(x_np[i].reshape(28, 28), vmin=0, vmax=1, cmap="gray")
    plt.axis("off")
    plt.subplot(5, 2, 2*i + 2)
    plt.title("Reconstruction")
    plt.imshow(y_np[i].reshape(28, 28), vmin=0, vmax=1, cmap="gray")
    plt.axis("off")
In [15]:
# Plot t-SNE embeddings of the latent variables
tsne = manifold.TSNE(
    n_components=2, #perplexity=20, init="random",
    random_state=0
    )
z_tsne = tsne.fit_transform(z_np)
plt.figure(figsize=(10, 10))
plot_labelled_2d_data(z_tsne, test_labels)
plt.legend();
In [16]:
# Plot t-SNE embeddings of the original data
tsne = manifold.TSNE(
    n_components=2,
    random_state=0
    )
x_tsne = tsne.fit_transform(x_np)
plt.figure(figsize=(10, 10))
plot_labelled_2d_data(x_tsne, test_labels)
plt.legend();
In [17]:
# Plot the embeddings themselves

# Get random indices
n_plot = 100
indices = np.arange(len(test_labels))
np.random.seed(1)
np.random.shuffle(indices)
indices = indices[:n_plot]

# Plot embeddings directly
indices_sorted = np.argsort(test_labels[indices], axis=0)
z_sorted = z_np[indices][indices_sorted, :]
labels_sorted = test_labels[indices][indices_sorted]
plt.figure(figsize=(10, 10))
plt.imshow(z_sorted)
ax = plt.gca()
for i, label in enumerate(labels_sorted):
    if labels_sorted[i] != labels_sorted[i - 1]:
        plt.axhline(y = i - 0.5, color="r")
        ax.text(0, i, str(label), verticalalignment="top", fontsize=15)
plt.axis('off');

# Plot mean and variance normalised embeddings
plt.figure(figsize=(10, 10))
plt.imshow(preprocessing.scale(z_sorted))
ax = plt.gca()
for i, label in enumerate(labels_sorted):
    if labels_sorted[i] != labels_sorted[i - 1]:
        plt.axhline(y = i - 0.5, color="r")
        ax.text(0, i, str(label), verticalalignment="top", fontsize=15)
plt.axis('off');
/r2d2/tools/py2.7_tf1.0/local/lib/python2.7/site-packages/sklearn/preprocessing/data.py:164: UserWarning: Numerical issues were encountered when centering the data and might not be solved. Dataset may contain too large values. You may need to prescale your features.
  warnings.warn("Numerical issues were encountered "
/r2d2/tools/py2.7_tf1.0/local/lib/python2.7/site-packages/sklearn/preprocessing/data.py:181: UserWarning: Numerical issues were encountered when scaling the data and might not be solved. The standard deviation of the data is probably very close to 0. 
  warnings.warn("Numerical issues were encountered "

Variational autoencoder

Resources: https://arxiv.org/abs/1312.6114; https://arxiv.org/abs/1606.05908; https://jmetzen.github.io/2015-11-27/vae.html

The variational autoencoder (VAE) is trained by minimising the negative of the evidence lower bound (ELBO), i.e. maximising the ELBO:

$$\ell(\vec{x}^{(i)}; \vec{\theta}, \vec{\phi}) = -J(\vec{x}^{(i)}; \vec{\theta}, \vec{\phi})$$

with the ELBO given by

$$J(\vec{x}^{(i)}; \vec{\theta}, \vec{\phi}) = \mathbb{E}_{q_{\phi}(\vec{z}|\vec{x}^{(i)})} \left[ \log p_{\theta}(\vec{x}^{(i)}|\vec{z}) \right] - D_{\textrm{KL}} \left( q_{\phi}(\vec{z}|\vec{x}^{(i)}) || p(\vec{z}) \right)$$

The ELBO is a lower bound on the log marginal likelihood, since $\log p_\theta(\vec{x}^{(i)})$ is equal to the ELBO plus the (non-negative) KL divergence between $q_{\phi}(\vec{z}|\vec{x}^{(i)})$ and the true posterior $p_{\theta}(\vec{z}|\vec{x}^{(i)})$:

$$ \log p_\theta(\vec{x}^{(i)}) \geq \mathbb{E}_{q_{\phi}(\vec{z}|\vec{x}^{(i)})} \left[ \log p_{\theta}(\vec{x}^{(i)}|\vec{z}) \right] - D_{\textrm{KL}} \left( q_{\phi}(\vec{z}|\vec{x}^{(i)}) || p(\vec{z}) \right)$$

The first term is normally approximated with a single Monte Carlo sample:

$$\mathbb{E}_{q_{\phi}(\vec{z}|\vec{x}^{(i)})} \left[ \log p_{\theta}(\vec{x}^{(i)}|\vec{z}) \right] \approx \log p_{\theta}(\vec{x}^{(i)}|\vec{z}^{(l)}) \textrm{ with } \vec{z}^{(l)} \sim q_{\phi}(\vec{z}|\vec{x}^{(i)})$$
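Such a sample can be drawn with the reparametrisation trick (the same trick used in `build_vae` below). Here is a minimal NumPy sketch, with `z_mean_np` and `z_log_sigma_sq_np` standing in for hypothetical encoder outputs for a single example:

from __future__ import print_function
import numpy as np

np.random.seed(0)
n_z = 20

# Hypothetical diagonal-Gaussian parameters from the encoder for one example
z_mean_np = np.zeros(n_z)
z_log_sigma_sq_np = np.zeros(n_z)  # log(sigma^2) = 0, i.e. unit variance

# Reparametrisation trick: z = mu + sigma*eps with eps ~ N(0, I)
eps = np.random.randn(n_z)
z_sample = z_mean_np + np.sqrt(np.exp(z_log_sigma_sq_np))*eps
print(z_sample[:5])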

Assuming a spherical Gaussian distribution for the decoder $p_{\theta}(\vec{x}^{(i)}|\vec{z})$, with covariance $\sigma^2\vec{I}$, this reduces to

$$\log p_{\theta}(\vec{x}^{(i)}|\vec{z}^{(l)}) = c - \frac{1}{2\sigma^2} \left|\left| \vec{f} (\vec{z}^{(l)}) - \vec{x}^{(i)} \right|\right|^2 $$

with $c$ a constant (from the normalisation) and $\vec{f} (\vec{z}^{(l)})$ the output of the network given the (sampled) latent variable $\vec{z}^{(l)}$.

The second term in the ELBO has a closed-form solution for some distributions. Assuming a standard Gaussian prior $p(\vec{z}) = \mathcal{N}(\vec{z}; \vec{0}, \vec{I})$ and a diagonal-covariance Gaussian distribution for the encoder $q_{\phi}(\vec{z}|\vec{x}^{(i)}) = \mathcal{N} \left( \vec{z};\vec{\mu}_\phi(\vec{x}^{(i)}), \vec{\sigma}^2_\phi(\vec{x}^{(i)})\vec{I} \right)$, the KL divergence reduces to

$$ D_{\textrm{KL}} \left( q_{\phi}(\vec{z}|\vec{x}^{(i)}) || p(\vec{z}) \right) = -\frac{1}{2} \sum_{j = 1}^D \left( 1 + \log \sigma_j^2 - \mu_j^2 -\sigma_j^2 \right) $$

where I have dropped the subscripts on $\vec{\mu}_\phi$ and $\vec{\sigma}^2_\phi$ (which indicate dependence on the encoder network parameters $\vec{\phi}$), and $D$ is the dimensionality of the latent variable $\vec{z}$.
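
As a sanity check (again a standalone sketch, not used by the model code), the closed-form expression can be compared to a Monte Carlo estimate of the same KL divergence for hypothetical values of $\vec{\mu}$ and $\vec{\sigma}^2$:

from __future__ import print_function
import numpy as np

np.random.seed(0)
D = 20

# Hypothetical encoder outputs for one example
mu = 0.5*np.ones(D)
log_sigma_sq = np.log(0.25)*np.ones(D)  # i.e. sigma^2 = 0.25
sigma_sq = np.exp(log_sigma_sq)

# Closed-form KL divergence between q = N(mu, diag(sigma^2)) and p = N(0, I)
kl_closed = -0.5*np.sum(1 + log_sigma_sq - mu**2 - sigma_sq)

# Monte Carlo estimate of E_q[log q(z) - log p(z)]
n_samples = 100000
z = mu + np.sqrt(sigma_sq)*np.random.randn(n_samples, D)
log_q = -0.5*np.sum((z - mu)**2/sigma_sq + np.log(2*np.pi*sigma_sq), axis=1)
log_p = -0.5*np.sum(z**2 + np.log(2*np.pi), axis=1)
kl_mc = np.mean(log_q - log_p)

print(kl_closed, kl_mc)  # the two values should be close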

In [18]:
def build_vae(x, enc_n_hiddens, n_z, dec_n_hiddens, activation=tf.nn.relu):
    """
    Build a VAE with the number of encoder and decoder units.
    
    Parameters
    ----------
    The number of encoder and decoder units are given as lists. The middle
    (encoding/latent) layer has dimensionality `n_z`. The final layer is
    linear.

    Return
    ------
    A dictionary with the mean `z_mean`, and log variance squared
    `z_log_sigma_sq` of the latent variable; the latent variable `z` itself
    (the output of the encoder); and the final output `y` of the network (the
    output of the decoder).
    """
    
    # Encoder
    for i_layer, n_hidden in enumerate(enc_n_hiddens):
        with tf.variable_scope("vae_enc_{}".format(i_layer)):
            x = build_linear(x, n_hidden)
            x = activation(x)
    
    # Latent variable
    with tf.variable_scope("vae_latent_mean"):
        z_mean = build_linear(x, n_z)
    with tf.variable_scope("vae_latent_log_sigma_sq"):
        z_log_sigma_sq = build_linear(x, n_z)
    with tf.variable_scope("vae_latent"):
        eps = tf.random_normal((tf.shape(x)[0], n_z), 0, 1, dtype=TF_DTYPE)
        
        # Reparametrisation trick
        z = z_mean + tf.multiply(tf.sqrt(tf.exp(z_log_sigma_sq)), eps)

    # Decoder
    x = z
    for i_layer, n_hidden in enumerate(dec_n_hiddens):
        with tf.variable_scope("vae_dec_{}".format(i_layer)):
            x = build_linear(x, n_hidden)
            if i_layer != len(dec_n_hiddens) - 1:
                x = activation(x)
    y = x
    
    return {"z_mean": z_mean, "z_log_sigma_sq": z_log_sigma_sq, "z": z, "y": y}


def vae_loss_gaussian(x, y, sigma_sq, z_mean, z_log_sigma_sq):
    """
    Use p(x|z) = Normal(x; f(z), sigma^2 I), with y = f(z) the decoder output.
    """
    
    # Gaussian reconstruction loss
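    # Note that mean_squared_error averages over both the batch and the
    # pixels, while the regularisation term below sums over the latent
    # dimensions; relative to the derivation above, this constant scaling
    # difference is absorbed into the sigma_sq hyperparameter.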
    reconstruction_loss = 1./(2*sigma_sq) * tf.losses.mean_squared_error(x, y)
    
    # Regularisation loss
    regularisation_loss = -0.5*tf.reduce_sum(
        1 + z_log_sigma_sq - tf.square(z_mean) - tf.exp(z_log_sigma_sq), 1
        )
    
    return reconstruction_loss + tf.reduce_mean(regularisation_loss)


def vae_loss_bernoulli(x, y, z_mean, z_log_sigma_sq):
    """
    Use a Bernoulli distribution for p(x|z), with y = f(z) the mean.
    """
    
    # Bernoulli reconstruction loss
    reconstruction_loss = -tf.reduce_sum(
        x*tf.log(1e-10 + y) + (1 - x)*tf.log(1e-10 + 1 - y), 1
        )
    
    # Regularisation loss
    regularisation_loss = -0.5*tf.reduce_sum(
        1 + z_log_sigma_sq - tf.square(z_mean) - tf.exp(z_log_sigma_sq), 1
        )
    
    return tf.reduce_mean(reconstruction_loss + regularisation_loss)
In [48]:
tf.reset_default_graph()

# Training parameters
learning_rate = 0.001
n_epochs = 25
batch_size = 100
enc_n_hiddens = [500, 500]
n_z = 20  # n_z = 2
dec_n_hiddens = [500, 500, 28*28]

# Model
x = tf.placeholder(TF_DTYPE, [None, 28*28])
vae = build_vae(x, enc_n_hiddens, n_z, dec_n_hiddens)
z_mean = vae["z_mean"]
z_log_sigma_sq = vae["z_log_sigma_sq"]
z = vae["z"]
y = vae["y"]
model_fn = path.join(output_dir, "vae.ckpt")

# Training tensors
# y = tf.nn.sigmoid(y)  # use with the Bernoulli loss below
# loss = vae_loss_bernoulli(x, y, z_mean, z_log_sigma_sq)
sigma_sq = 0.001  # smaller values: care more about reconstruction
loss = vae_loss_gaussian(x, y, sigma_sq, z_mean, z_log_sigma_sq)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)
In [49]:
record_dict = train_fixed_epochs(
    n_epochs, optimizer, loss, MNISTTrainFeedIterator(),
    [x], loss, MNISTTestFeedIterator(),
    save_model_fn=model_fn
    )
2018-06-05 17:54:22.289410
Epoch 0:
0.861 sec, train loss: 38.3302, val loss: 31.4858
Epoch 1:
0.883 sec, train loss: 30.0793, val loss: 28.4969
Epoch 2:
0.852 sec, train loss: 27.3993, val loss: 26.3748
Epoch 3:
0.829 sec, train loss: 26.1777, val loss: 25.5769
Epoch 4:
0.843 sec, train loss: 25.4593, val loss: 25.0363
Epoch 5:
0.853 sec, train loss: 24.967, val loss: 24.8891
Epoch 6:
0.842 sec, train loss: 24.5896, val loss: 24.4494
Epoch 7:
0.786 sec, train loss: 24.345, val loss: 23.9138
Epoch 8:
0.824 sec, train loss: 24.0964, val loss: 23.7052
Epoch 9:
0.813 sec, train loss: 23.9169, val loss: 23.5662
Epoch 10:
0.819 sec, train loss: 23.7663, val loss: 23.4442
Epoch 11:
0.793 sec, train loss: 23.607, val loss: 23.3452
Epoch 12:
0.852 sec, train loss: 23.4896, val loss: 23.1194
Epoch 13:
0.916 sec, train loss: 23.3847, val loss: 23.1825
Epoch 14:
0.858 sec, train loss: 23.2788, val loss: 23.076
Epoch 15:
0.851 sec, train loss: 23.2073, val loss: 22.948
Epoch 16:
0.838 sec, train loss: 23.1236, val loss: 22.816
Epoch 17:
0.795 sec, train loss: 23.0285, val loss: 22.7617
Epoch 18:
0.861 sec, train loss: 22.994, val loss: 22.6629
Epoch 19:
0.865 sec, train loss: 22.9248, val loss: 22.6039
Epoch 20:
0.968 sec, train loss: 22.8754, val loss: 22.6749
Epoch 21:
0.882 sec, train loss: 22.8197, val loss: 22.6748
Epoch 22:
0.835 sec, train loss: 22.7727, val loss: 22.6353
Epoch 23:
0.853 sec, train loss: 22.7546, val loss: 22.5936
Epoch 24:
0.829 sec, train loss: 22.6981, val loss: 22.4682
Writing: /tmp/data-kamperh/vae.ckpt
Training time: 0.353 min
2018-06-05 17:54:43.662675
In [50]:
# Plot weights from first layer

# Obtain weight matrix
with tf.variable_scope("vae_enc_0", reuse=True):
    W_0 = tf.get_variable("W", dtype=TF_DTYPE)
saver = tf.train.Saver()
with tf.Session() as session:
    saver.restore(session, model_fn)
    W_0 = W_0.eval()

# Plot weights
W_0 = np.transpose(W_0)
image = Image.fromarray(
    plotting.tile_images(W_0, image_shape=(28, 28), tile_shape=(11, 11))
    )
plt.figure(figsize=(10, 10))
plt.imshow(image, cmap="gray");
INFO:tensorflow:Restoring parameters from /tmp/data-kamperh/vae.ckpt
In [51]:
# Embed test data
n_test = 1000
x_np = mnist.test.images[:n_test, :]
test_labels = mnist.test.labels[:n_test]
test_labels = np.argmax(test_labels, axis=1)  # convert from one-hot
saver = tf.train.Saver()
with tf.Session() as session:
    saver.restore(session, model_fn)
    z_np = session.run([z_mean], feed_dict={x: x_np})[0]
    y_np = session.run([y], feed_dict={x: x_np})[0]
INFO:tensorflow:Restoring parameters from /tmp/data-kamperh/vae.ckpt
In [52]:
# Plot reconstruction
plt.figure(figsize=(8, 12))
for i in range(5):
    plt.subplot(5, 2, 2*i + 1)
    plt.title("Input")
    plt.imshow(x_np[i].reshape(28, 28), vmin=0, vmax=1, cmap="gray")
    plt.axis("off")
    plt.subplot(5, 2, 2*i + 2)
    plt.title("Reconstruction")
    plt.imshow(y_np[i].reshape(28, 28), vmin=0, vmax=1, cmap="gray")
    plt.axis("off")
In [53]:
# Plot t-SNE embeddings of the latent variables
tsne = manifold.TSNE(
    n_components=2, #perplexity=20, init="random",
    random_state=0
    )
z_tsne = tsne.fit_transform(z_np)
plt.figure(figsize=(10, 10))
plot_labelled_2d_data(z_tsne, test_labels)
plt.legend();
In [54]:
# Plot the embeddings themselves

# Get random indices
n_plot = 100
indices = np.arange(len(test_labels))
np.random.seed(1)
np.random.shuffle(indices)
indices = indices[:n_plot]

# Plot embeddings directly
indices_sorted = np.argsort(test_labels[indices], axis=0)
z_sorted = z_np[indices][indices_sorted, :]
labels_sorted = test_labels[indices][indices_sorted]
plt.figure(figsize=(10, 10))
plt.imshow(z_sorted)
ax = plt.gca()
for i, label in enumerate(labels_sorted):
    if labels_sorted[i] != labels_sorted[i - 1]:
        plt.axhline(y = i - 0.5, color="r")
        ax.text(0, i, str(label), verticalalignment="top", fontsize=15)
plt.axis('off');

# Plot mean and variance normalised embeddings
plt.figure(figsize=(10, 10))
plt.imshow(preprocessing.scale(z_sorted))
ax = plt.gca()
for i, label in enumerate(labels_sorted):
    if labels_sorted[i] != labels_sorted[i - 1]:
        plt.axhline(y = i - 0.5, color="r")
        ax.text(0, i, str(label), verticalalignment="top", fontsize=15)
plt.axis('off');