#!/usr/bin/env python
# coding: utf-8

# In[2]:


# %load /Users/facai/Study/book_notes/preconfig.py
get_ipython().run_line_magic('matplotlib', 'inline')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)
#sns.set(font='SimHei')
plt.rcParams['axes.grid'] = False

import numpy as np

#from IPython.display import SVG

def show_image(filename, figsize=None, res_dir=True):
    if figsize:
        plt.figure(figsize=figsize)

    if res_dir:
        filename = './res/{}'.format(filename)

    plt.imshow(plt.imread(filename))


# Chapter 9 Convolutional Networks
# ================================
#
# Convolution is a specialized kind of linear operation.

# ### 9.1 The Convolution Operation
#
# \begin{align}
# s(t) &= \int x(a) w(t-a) \mathrm{d}a \\
#      &= (x \ast w)(t)
# \end{align}
#
# where $x$ is the input, $w$ is the kernel, and the output $s(t)$ is referred to as the feature map.
#
# Since convolution is commutative,
#
# \begin{align}
# S(i, j) = (I \ast K)(i, j) &= \sum_m \sum_n I(m, n) K(i - m, j - n) \\
#         = (K \ast I)(i, j) &= \sum_m \sum_n I(i - m, j - n) K(m, n)
# \end{align}
#
# Many machine learning libraries implement cross-correlation (no kernel flipping) but call it convolution.
#
# Discrete convolution can be viewed as multiplication by a matrix.

# In[5]:


show_image("fig9_1.png", figsize=(8, 8))


# ### 9.2 Motivation
#
# Convolution leverages three important ideas:
#
# + sparse interactions: fewer parameters
# + parameter sharing: tied weights
# + equivariant representations:
#   a function $f(x)$ is equivariant to a function $g$ if $f(g(x)) = g(f(x))$.

# In[9]:


show_image("fig9_5.png", figsize=(8, 5))


# ### 9.3 Pooling
#
# A pooling function replaces the output of the net at a certain location with a summary statistic of the nearby outputs.
#
# Popular pooling functions:
#
# + max
# + average
# + L2 norm
# + weighted average
#
# Pooling encodes a strong prior: the function the layer learns must be invariant to small translations.

# In[10]:


show_image("fig9_7.png", figsize=(10, 8))


# In[11]:


show_image("fig9_9.png", figsize=(10, 8))


# ### 9.4 Convolution and Pooling as an Infinitely Strong Prior
#
# A prior is weak or strong depending on how concentrated its probability density is.
#
# We can imagine a convolutional net as being similar to a fully connected net, but with an infinitely strong prior over its weights: the weights for one hidden unit must be identical to the weights of its neighbor, but shifted in space.
#
# When this assumption does not hold, convolution and pooling can cause underfitting.

# ### 9.5 Variants of the Basic Convolution Function
#
# Inputs are 4-D tensors: (batch_size, height, width, channels).
#
# Three zero-padding schemes (input width $m$, kernel width $k$):
#
# + valid convolution: output width $m - k + 1 = m - (k - 1)$, no padding
# + same convolution: output width $m$, zero-padded just enough that the kernel center visits every input position
# + full convolution: output width $m + k - 1$, zero-padded so the kernel visits every overlap with the input, including the corners
#
# See the [MATLAB conv2 documentation](https://cn.mathworks.com/help/matlab/ref/conv2.html) and the implementation below for details.
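# Before the 2-D implementation below, a quick 1-D sanity check of the three zero-padding schemes. This is a minimal sketch using `np.convolve` (which, unlike most deep learning libraries, flips the kernel, i.e. computes a true convolution); the array values here are arbitrary.

# In[ ]:


# output lengths for input length m and kernel length k:
#   valid: m - k + 1,  same: m,  full: m + k - 1
x = np.array([1., 2., 3., 4., 5.])   # input, m = 5
w = np.array([1., 0., -1.])          # kernel, k = 3

for mode in ('valid', 'same', 'full'):
    out = np.convolve(x, w, mode=mode)
    print("{:>5}: size {} -> {}".format(mode, out.size, out))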
# In[14]:


show_image("matlab_conv_2d.png", figsize=(12, 8))


# In[5]:


A = np.random.rand(3, 3)
A


# In[6]:


B = np.random.rand(4, 4)
B


# In[54]:


def gen_kernel_fn(kernel):
    def kernel_fn(a, x_start, y_start):
        x_size, y_size = kernel.shape
        a_slice = a[x_start:x_start+x_size, y_start:y_start+y_size]
        # element-wise product, i.e. cross-correlation (the kernel is not flipped)
        return (a_slice * kernel).sum()
    return kernel_fn


def calc_conv2d_res_size(a, kernel):
    # valid output size: m - k + 1 along each axis
    res_x_size = a.shape[0] - kernel.shape[0] + 1
    res_y_size = a.shape[1] - kernel.shape[1] + 1
    return res_x_size, res_y_size


def conv2d(a, kernel):
    kernel_fn = gen_kernel_fn(kernel)

    res_x_size, res_y_size = calc_conv2d_res_size(a, kernel)
    res = np.zeros((res_x_size, res_y_size))

    for x in range(res_x_size):
        for y in range(res_y_size):
            res[x, y] = kernel_fn(a, x, y)

    return res


# In[71]:


# valid convolution
conv2d(B, A)


# In[68]:


def calc_2d_pad_width(target_size, real_size):
    # pad symmetrically so that real_size + 2 * pad == target_size
    pad_x_width = (target_size[0] - real_size[0]) // 2
    pad_y_width = (target_size[1] - real_size[1]) // 2
    return np.array([[pad_x_width] * 2,
                     [pad_y_width] * 2], dtype=int)


# In[81]:


def zero_pad_and_conv2d(a, kernel, target_size):
    res_size = calc_conv2d_res_size(a, kernel)
    pad_width = calc_2d_pad_width(target_size, res_size)
    a_pad = np.pad(a, pad_width, 'constant', constant_values=0)
    return conv2d(a_pad, kernel)


# In[82]:


# same convolution: output has the same size as the input B
same_conv_size = B.shape
zero_pad_and_conv2d(B, A, same_conv_size)


# In[84]:


# full convolution: output size is m + k - 1 along each axis
full_conv_size = [x1 + x2 - 1 for (x1, x2) in zip(B.shape, A.shape)]
print("full convolution size: {}".format(full_conv_size))
zero_pad_and_conv2d(B, A, full_conv_size)


# ##### Comparison of local connections, convolution, and full connections.
#
# From full weight sharing to none: convolutional layer $\to$ tiled convolution $\to$ locally connected layer.

# In[15]:


show_image("fig9_16.png", figsize=(10, 8))


# Computing gradients: see Goodfellow (2010) for a full derivation of the equations in the fully general multi-dimensional, multi-example case.

# ### 9.6 Structured Outputs
#
# Convolutional networks can output a high-dimensional, structured object, for example a label for every pixel in an image.
#
#
# ### 9.7 Data Types
#
# Convolutional networks can process inputs with varying spatial extents, as long as they are the same kind of observation.
#
#
# ### 9.8 Efficient Convolution Algorithms
#
# + Fourier transform: convolution becomes point-wise multiplication in the frequency domain (see the sketch in the final cell below).
# + Sometimes a d-dimensional kernel is *separable*, i.e. it can be expressed as an outer product of d vectors.
#
#
# ### 9.9 Random or Unsupervised Features
#
# Today, most convolutional networks are trained in a purely supervised fashion.
#
#
# ### 9.10 The Neuroscientific Basis for Convolutional Networks
#
#
# ### 9.11 Convolutional Networks and the History of Deep Learning

# In[ ]:
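# As a closing check, a minimal sketch of the Fourier-transform idea from section 9.8, reusing `A`, `B`, and `zero_pad_and_conv2d` from above: zero-pad both arrays to the full-convolution size, multiply their 2-D FFTs point-wise, and compare against the direct computation. Since `conv2d` above does not flip the kernel, the kernel is flipped here so both sides compute the same (true) convolution.

# In[ ]:


full_size = tuple(x1 + x2 - 1 for (x1, x2) in zip(B.shape, A.shape))

# point-wise multiplication in the frequency domain (np.fft.fft2 zero-pads to shape `s`)
fft_conv = np.fft.ifft2(np.fft.fft2(B, s=full_size) * np.fft.fft2(A, s=full_size)).real

# direct full convolution: flip the kernel because conv2d implements cross-correlation
direct_conv = zero_pad_and_conv2d(B, A[::-1, ::-1], full_size)

np.allclose(fft_conv, direct_conv)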