#!/usr/bin/env python
# coding: utf-8

# # Neural Style Transfer

# ## Summary
#
# - Introduction
# - Principle
# - Example: apply style transfer to an image

# ## Introduction

# ### Neural style transfer in a nutshell
#
# - Reproduce an image with a new artistic style provided by another image.
# - Blend a *content* image and a *style reference* image into a stylized output image.
# - First described in [A Neural Algorithm of Artistic Style](https://arxiv.org/abs/1508.06576) by Gatys et al. (2015). Many refinements and variations since.

# ### Example
#
# [![Prisma style transfer example](images/style_transfer_prisma.png)](https://harishnarayanan.org/writing/artistic-style-transfer/)

# ## Principle

# ### Loss definition
#
# The resulting image $\pmb{x}^*$ minimizes a weighted sum of three losses: the content loss $\mathcal{L}_{content}(\pmb{c}, \pmb{x})$, the style loss $\mathcal{L}_{style}(\pmb{s}, \pmb{x})$ and the total variation loss $\mathcal{L}_{total\_variation}(\pmb{x})$.
#
# $$\pmb{x}^* = \underset{\pmb{x}}{\mathrm{argmin}}(\alpha\mathcal{L}_{content}(\pmb{c}, \pmb{x}) + \beta\mathcal{L}_{style}(\pmb{s}, \pmb{x}) + \gamma\mathcal{L}_{total\_variation}(\pmb{x}))$$
#
# - $\pmb{x}$: generated image.
# - $\pmb{c}$: content image.
# - $\pmb{s}$: reference style image.
# - $\alpha$, $\beta$ and $\gamma$: weights.

# ### The content loss
#
# - Content = high-level structure of an image.
# - Can be captured by an upper layer of a convolutional neural network.
# - Content loss for a layer = distance between the feature maps of the content and generated images.

# ### The style loss
#
# - Style = low-level features of an image (textures, colors, visual patterns).
# - Can be captured by using correlations across the different feature maps (filter responses) of a convnet.
# - Feature correlations are computed via a Gram matrix (outer product of the feature maps for a given layer); see the formula at the end of this section.
# - Style loss for a layer = distance between the Gram matrices of the feature maps for the style and generated images.

# ### The total variation loss
#
# - Sum of the absolute differences for neighboring pixel values in an image. Measures how much noise is in the image.
# - Encourages local spatial continuity in the generated image (denoising).
# - Acts as a regularization loss.

# ### Gradient descent
#
# - Objective: minimize the total loss.
# - Optimizer: [L-BFGS](http://aria42.com/blog/2014/12/understanding-lbfgs) (original choice made by Gatys et al.) or Adam.
#
# ![Animation of style transfer](images/style_transfer_animated.gif)
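# To make the style representation concrete: for a given layer, write the flattened feature maps as a matrix $\pmb{F}$ with $N$ rows (one per filter) and $M$ columns (one per spatial position). The Gram matrix, in the notation of Gatys et al., is
#
# $$G_{ij} = \sum_{k} F_{ik} F_{jk}$$
#
# Each entry $G_{ij}$ is the dot product between the responses of filters $i$ and $j$: spatial layout is summed away and only which features activate together remains, which is why this matrix captures style rather than content.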
# ## Example: apply style transfer to an image
#
# (Heavily inspired by [this Keras example](https://keras.io/examples/generative/neural_style_transfer/))

# ### Environment setup

# In[1]:

import platform

print(f"Python version: {platform.python_version()}")
# Compare version numbers as integers (string tuples compare lexicographically: ("3", "10") < ("3", "6"))
assert tuple(int(v) for v in platform.python_version_tuple()[:2]) >= (3, 6)

from IPython.display import Image, display
import numpy as np

# In[2]:

import tensorflow as tf

print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {tf.keras.__version__}")
print('GPU found :)' if tf.config.list_physical_devices("GPU") else 'No GPU :(')

from tensorflow.keras import Model
from tensorflow.keras.applications import vgg19
from tensorflow.keras.utils import get_file
from tensorflow.keras.preprocessing.image import load_img, save_img, img_to_array
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# ### Image loading

# In[3]:

# Download content and style images
base_image_path = get_file("paris.jpg", "https://i.imgur.com/F28w3Ac.jpg")
style_reference_image_path = get_file(
    "starry_night.jpg", "https://i.imgur.com/9ooB60I.jpg"
)

width, height = load_img(base_image_path).size
print(f"Input image dimensions: {width, height}")

# In[4]:

# Show content image
display(Image(base_image_path))

# In[5]:

# Show style reference image
display(Image(style_reference_image_path))

# ### Utility functions

# In[6]:

def preprocess_image(image_path, height, width):
    """Open, resize and format a picture into appropriate tensors"""

    img = load_img(image_path, target_size=(height, width))
    img = img_to_array(img)
    img = np.expand_dims(img, axis=0)
    # Convert image from RGB to BGR and zero-center each color channel w.r.t. the ImageNet dataset
    img = vgg19.preprocess_input(img)
    return tf.convert_to_tensor(img)


def deprocess_image(x, height, width):
    """Convert a tensor into a valid image"""

    x = x.reshape((height, width, 3))

    # Remove zero-center by mean pixel applied by vgg19.preprocess_input()
    # The following values are the mean pixel values of each color channel for the ImageNet dataset
    x[:, :, 0] += 103.939
    x[:, :, 1] += 116.779
    x[:, :, 2] += 123.68

    # 'BGR'->'RGB'
    x = x[:, :, ::-1]

    x = np.clip(x, 0, 255).astype("uint8")
    return x
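# These two helpers should round-trip: preprocessing an image and deprocessing the result yields a valid `uint8` image again. Below is a quick sanity check added for illustration (the dimensions and variable names are arbitrary, not part of the original example).

# In[ ]:

# Sanity check (illustration only): preprocess then deprocess an image
check = preprocess_image(base_image_path, 200, 300)
restored = deprocess_image(check.numpy(), 200, 300)
# Expected output: (1, 200, 300, 3) (200, 300, 3) uint8
print(check.shape, restored.shape, restored.dtype)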
# ### Loss functions

# In[7]:

# The Gram matrix of a 3D tensor (correlations between the feature maps of a convolutional layer)
def gram_matrix(x):
    # Transpose the feature maps tensor so that the 3rd dimension (filters) becomes the 1st
    x = tf.transpose(x, (2, 0, 1))
    # Reshape feature maps tensor into a matrix. First dimension is the number of filters/maps
    features = tf.reshape(x, (tf.shape(x)[0], -1))
    # Compute the outer product of feature vectors with themselves
    gram = tf.matmul(features, tf.transpose(features))
    return gram


# The style loss is designed to maintain the style of the reference image in the generated image.
# It is based on the Gram matrices (which capture style) of feature maps from the style reference image
# and from the generated image
def style_loss(style, combination, height, width):
    S = gram_matrix(style)
    C = gram_matrix(combination)
    channels = 3
    size = height * width
    # Compute distance between Gram matrices of style and generated images
    return tf.reduce_sum(tf.square(S - C)) / (4.0 * (channels ** 2) * (size ** 2))

# In[8]:

# The content loss is designed to maintain the "content" of the base image in the generated image
def content_loss(base, combination):
    return tf.reduce_sum(tf.square(combination - base))


# The total variation loss is designed to keep the generated image locally coherent
def total_variation_loss(x, height, width):
    a = tf.square(
        x[:, : height - 1, : width - 1, :] - x[:, 1:, : width - 1, :]
    )
    b = tf.square(
        x[:, : height - 1, : width - 1, :] - x[:, : height - 1, 1:, :]
    )
    return tf.reduce_sum(tf.pow(a + b, 1.25))

# ### Model definition
#
# We use a [VGG](https://arxiv.org/abs/1409.1556) model pretrained on the ImageNet dataset.

# In[9]:

# Using the convolutional base of VGG19, loaded with pre-trained ImageNet weights
vgg = vgg19.VGG19(weights="imagenet", include_top=False)

# Get the symbolic outputs of each "key" layer (each layer has a unique name)
outputs_dict = dict([(layer.name, layer.output) for layer in vgg.layers])

# Set up a model that returns the activation values for every layer in VGG19 (as a dict)
feature_extractor = Model(inputs=vgg.inputs, outputs=outputs_dict)

# In[10]:

vgg.summary()

# ### Loss computation

# In[11]:

# List of layers to use for the style loss
style_layer_names = [
    "block1_conv1",
    "block2_conv1",
    "block3_conv1",
    "block4_conv1",
    "block5_conv1",
]

# The layer to use for the content loss
content_layer_name = "block5_conv2"

# Weights of the different loss components
total_variation_weight = 1e-6
style_weight = 1e-6
content_weight = 2.5e-8


def compute_loss(combination_image, base_image, style_reference_image, height, width):
    # Stack the three images along the batch axis: index 0 = content, 1 = style, 2 = generated
    input_tensor = tf.concat(
        [base_image, style_reference_image, combination_image], axis=0
    )
    features = feature_extractor(input_tensor)

    # Initialize the loss
    loss = tf.zeros(shape=())

    # Add content loss
    layer_features = features[content_layer_name]
    base_image_features = layer_features[0, :, :, :]
    combination_features = layer_features[2, :, :, :]
    loss = loss + content_weight * content_loss(
        base_image_features, combination_features
    )

    # Add style loss
    for layer_name in style_layer_names:
        layer_features = features[layer_name]
        style_reference_features = layer_features[1, :, :, :]
        combination_features = layer_features[2, :, :, :]
        sl = style_loss(style_reference_features, combination_features, height, width)
        loss += (style_weight / len(style_layer_names)) * sl

    # Add total variation loss
    loss += total_variation_weight * total_variation_loss(
        combination_image, height, width
    )
    return loss

# In[12]:

@tf.function
def compute_loss_and_grads(
    combination_image, base_image, style_reference_image, height, width
):
    # Record operations to compute the gradient of the loss w.r.t. the generated image
    with tf.GradientTape() as tape:
        loss = compute_loss(
            combination_image, base_image, style_reference_image, height, width
        )
    grads = tape.gradient(loss, combination_image)
    return loss, grads
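# When tuning the three weights, it helps to see the magnitude of each unweighted term separately. The helper below is a hypothetical addition (`loss_components()` is not part of the original example); it only reuses the functions and layer names defined above.

# In[ ]:

def loss_components(combination_image, base_image, style_reference_image, height, width):
    """Return the three unweighted loss terms separately (hypothetical monitoring helper)"""

    input_tensor = tf.concat(
        [base_image, style_reference_image, combination_image], axis=0
    )
    features = feature_extractor(input_tensor)

    # Index 0 = content image, 1 = style image, 2 = generated image
    layer_features = features[content_layer_name]
    c_loss = content_loss(layer_features[0], layer_features[2])
    s_loss = tf.add_n(
        [
            style_loss(features[name][1], features[name][2], height, width)
            for name in style_layer_names
        ]
    ) / len(style_layer_names)
    tv_loss = total_variation_loss(combination_image, height, width)
    return c_loss, s_loss, tv_loss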
# ### Training loop

# In[13]:

# Generated image height
gen_height = 400

# Compute generated width so that input and generated images have the same scale
gen_width = int(width * gen_height / height)

print(f"Generated image dimensions: {gen_width, gen_height}")

# In[14]:

# SGD with an exponentially decayed learning rate
optimizer = SGD(
    ExponentialDecay(
        initial_learning_rate=100.0, decay_steps=100, decay_rate=0.96
    )
)

base_image = preprocess_image(base_image_path, gen_height, gen_width)
style_reference_image = preprocess_image(
    style_reference_image_path, gen_height, gen_width
)
# The generated image is initialized with the content image and updated during training
combination_image = tf.Variable(preprocess_image(base_image_path, gen_height, gen_width))

# Training loop
n_epochs = 4000
for epoch in range(1, n_epochs + 1):
    loss, grads = compute_loss_and_grads(
        combination_image, base_image, style_reference_image, gen_height, gen_width
    )
    # Gradient descent step: update the generated image to reduce the total loss
    optimizer.apply_gradients([(grads, combination_image)])
    if epoch % 100 == 0:
        print(f"Epoch [{epoch}/{n_epochs}], loss: {loss.numpy():.2f}")

# Save final image
final_img = deprocess_image(combination_image.numpy(), gen_height, gen_width)
result_image_path = f"generated_image_epoch_{n_epochs}.png"
save_img(result_image_path, final_img)

# ### Generated image display

# In[15]:

# Show final generated image
display(Image(result_image_path))

# In[ ]:
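# A common variation (not in the original loop) is to keep optimizing while saving intermediate snapshots, to watch the style emerge. A minimal sketch reusing the objects above; `extra_epochs` and the snapshot file names are hypothetical.

# In[ ]:

# Sketch: continue training and save an intermediate image every 500 epochs
extra_epochs = 1000
for epoch in range(1, extra_epochs + 1):
    loss, grads = compute_loss_and_grads(
        combination_image, base_image, style_reference_image, gen_height, gen_width
    )
    optimizer.apply_gradients([(grads, combination_image)])
    if epoch % 500 == 0:
        img = deprocess_image(combination_image.numpy(), gen_height, gen_width)
        # Hypothetical file names for the intermediate snapshots
        save_img(f"generated_image_extra_epoch_{epoch}.png", img)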