#!/usr/bin/env python
# coding: utf-8

# # Exercise 12.2 - Solution
# ## Activation maximization
# In this task, we use activation maximization to visualize the patterns to which features of a CNN trained on MNIST are sensitive. This gives us a deeper understanding of the working principle of CNNs.

# In[1]:

import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt

KTF = keras.backend
layers = keras.layers


# ### Download and preprocess data

# In[2]:

(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

x_train = x_train.astype(np.float32)[..., np.newaxis] / 255.
x_test = x_test.astype(np.float32)[..., np.newaxis] / 255.

y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)


# ### Set up a convolutional neural network with at least 4 CNN layers.

# In[3]:

model = keras.models.Sequential([
    layers.Conv2D(16, (3, 3), activation='relu', padding='same', input_shape=(28, 28, 1), name='conv2d_1'),
    layers.Conv2D(32, (3, 3), activation='relu', padding='same', name='conv2d_2'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same', name='conv2d_3'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu', padding='same', name='conv2d_4'),
    layers.GlobalMaxPooling2D(),
    layers.Dropout(0.5),
    layers.Dense(10),
    layers.Activation('softmax', name='softmax_layer')])

model.summary()


# #### Compile and train the model

# In[4]:

model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    metrics=['accuracy'])

results = model.fit(x_train, y_train,
                    batch_size=100,
                    epochs=3,
                    verbose=1,
                    validation_split=0.1)


# ### Implementation of activation maximization
# Select a layer you want to visualize and perform activation maximization.

# In[5]:

gradient_updates = 50
step_size = 1.

def normalize(x):
    '''Normalize gradients via l2 norm'''
    return x / (KTF.sqrt(KTF.mean(KTF.square(x))) + KTF.epsilon())


# In[6]:

visualized_feature = []

layer_dict = dict([(layer.name, layer) for layer in model.layers])
layer_name = "conv2d_3"
layer_output = layer_dict[layer_name].output
sub_model = keras.models.Model([model.inputs], [layer_output])

for filter_index in range(layer_output.shape[-1]):
    print('Processing feature map %d' % (filter_index + 1))

    # activation maximization: start from a random image and perform gradient ascent
    input_img = KTF.variable(np.random.uniform(0, 1, (1, 28, 28, 1)))

    for i in range(gradient_updates):
        with tf.GradientTape() as gtape:
            activation = sub_model(input_img)
            loss = KTF.mean(activation[..., filter_index])  # objective: mean activation of the selected feature map

        grads = gtape.gradient(loss, input_img)
        grads = normalize(grads)
        input_img.assign_add(step_size * grads)

    visualized_feature.append(input_img.numpy())  # cast to numpy array


# #### Plot images to visualize to which patterns the respective feature maps are sensitive.

# In[7]:

def deprocess_image(x):
    # reprocess visualization to the format of "MNIST images"
    x -= x.mean()
    x /= (x.std() + KTF.epsilon())
    # x *= 0.1
    x += 0.5
    x *= 255
    x = np.clip(x, 0, 255).astype('uint8')
    return x


# In[8]:

plt.figure(figsize=(10, 10))

for i, feature_ in enumerate(visualized_feature):
    feature_image = deprocess_image(feature_)
    ax = plt.subplot(8, 8, 1 + i)
    plt.imshow(feature_image.squeeze())
    ax.axis('off')
    plt.title("feature %s" % i)

plt.tight_layout()
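
# #### Optional: maximize a class score instead of a feature map
# The same gradient-ascent loop can be pointed at a single output neuron rather than a convolutional feature map. The cell below is a minimal sketch that is not part of the original exercise: it assumes the `model` defined above and reuses `normalize`, `gradient_updates`, `step_size` and `deprocess_image`, and it maximizes the pre-softmax score of one (arbitrarily chosen) digit class.

# In[ ]:

# Hypothetical extension (not from the exercise): find an input that maximizes one class score.
logits_model = keras.models.Model([model.inputs], [model.layers[-2].output])  # output of Dense(10), before softmax

class_index = 3  # digit class to maximize (illustrative choice)
input_img = KTF.variable(np.random.uniform(0, 1, (1, 28, 28, 1)))

for i in range(gradient_updates):
    with tf.GradientTape() as gtape:
        logits = logits_model(input_img)
        loss = KTF.mean(logits[:, class_index])  # objective: score of the selected class
    grads = gtape.gradient(loss, input_img)
    grads = normalize(grads)
    input_img.assign_add(step_size * grads)

plt.figure(figsize=(3, 3))
plt.imshow(deprocess_image(input_img.numpy()).squeeze())
plt.axis('off')
plt.title("class %d" % class_index)
plt.tight_layout()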