#!/usr/bin/env python
# coding: utf-8

# # Theano 实例：卷积神经网络

# In[1]:


import theano
import theano.tensor as T
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
import numpy as np
from load import mnist

srng = RandomStreams()


# 从前一节导入有用的函数：

# In[2]:


def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    return theano.shared(floatX(np.random.randn(*shape) * 0.01))

def rectify(X):
    return T.maximum(X, 0.)

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates


# 与前一节不同，我们使用卷积神经网络来实现这次的模型，为此，我们需要导入 2 维的卷积和池化函数：

# In[3]:


from theano.tensor.nnet.conv import conv2d
from theano.tensor.signal.downsample import max_pool_2d


# `conv2d` 函数接受两个输入：
# 
# - 对应输入的 `4D` 张量，其形状如下：
#     
#   `[mini-batch size, number of feature maps at layer m-1, image height, image width]`
#     
#     
# - 对应参数矩阵的 `4D` 张量，其形状如下：
#   
#   `[number of feature maps at layer m, number of feature maps at layer m-1, filter height, filter width]`
# 
# 为了对图像使用卷积，我们需要将图像转化为原始的 `28 × 28` 大小，同时添加一维表示图像的通道数（黑白图像为 1）：

# In[4]:


trX, teX, trY, teY = mnist(onehot=True)

trX = trX.reshape(-1, 1, 28, 28)
teX = teX.reshape(-1, 1, 28, 28)


# 注意，对于 `reshape` 方法，传入的参数是 `-1` 表示该维的维度将根据其他参数自动计算。
# 
# 模型首先进行三层卷积加池化操作，然后在第三层的输出中加一个全连结层，最后在第四层加上一个 `softmax` 层：

# In[5]:


def model(X, w, w2, w3, w4, p_drop_conv, p_drop_hidden):
    
    # X:  128 * 1 * 28 * 28
    # w:  32 * 1 * 3 * 3
    # full mode
    # l1a: 128 * 32 * (28 + 3 - 1) * (28 + 3 - 1)
    l1a = rectify(conv2d(X, w, border_mode='full'))
    # l1a: 128 * 32 * 30 * 30
    # ignore_border False
    # l1:  128 * 32 * (30 / 2) * (30 / 2)
    l1 = max_pool_2d(l1a, (2, 2), ignore_border=False)
    l1 = dropout(l1, p_drop_conv)

    # l1:  128 * 32 * 15 * 15
    # w2:  64 * 32 * 3 * 3
    # valid mode
    # l2a: 128 * 64 * (15 - 3 + 1) * (15 - 3 + 1)
    l2a = rectify(conv2d(l1, w2))    
    # l2a: 128 * 64 * 13 * 13
    # l2:  128 * 64 * (13 / 2 + 1) * (13 / 2 + 1)
    l2 = max_pool_2d(l2a, (2, 2), ignore_border=False)
    l2 = dropout(l2, p_drop_conv)

    # l2:  128 * 64 * 7 * 7
    # w3:  128 * 64 * 3 * 3
    # l3a: 128 * 128 * (7 - 3 + 1) * (7 - 3 + 1)
    l3a = rectify(conv2d(l2, w3))
    # l3a: 128 * 128 * 5 * 5
    # l3b: 128 * 128 * (5 / 2 + 1) * (5 / 2 + 1)
    l3b = max_pool_2d(l3a, (2, 2), ignore_border=False)    
    # l3b: 128 * 128 * 3 * 3
    # l3:  128 * (128 * 3 * 3)
    l3 = T.flatten(l3b, outdim=2)
    l3 = dropout(l3, p_drop_conv)
    
    # l3: 128 * (128 * 3 * 3)
    # w4: (128 * 3 * 3) * 625
    # l4: 128 * 625
    l4 = rectify(T.dot(l3, w4))
    l4 = dropout(l4, p_drop_hidden)

    # l5:  128 * 625
    # w5:  625 * 10
    # pyx: 128 * 10
    pyx = softmax(T.dot(l4, w_o))
    return l1, l2, l3, l4, pyx


# 定义符号变量：

# In[6]:


X = T.ftensor4()
Y = T.fmatrix()

w = init_weights((32, 1, 3, 3))
w2 = init_weights((64, 32, 3, 3))
w3 = init_weights((128, 64, 3, 3))
w4 = init_weights((128 * 3 * 3, 625))
w_o = init_weights((625, 10))


# 使用带 `dropout` 的模型进行训练： 

# In[7]:


noise_l1, noise_l2, noise_l3, noise_l4, noise_py_x = model(X, w, w2, w3, w4, 0.2, 0.5)


# 使用不带 `dropout` 的模型进行预测：

# In[8]:


l1, l2, l3, l4, py_x = model(X, w, w2, w3, w4, 0., 0.)
y_x = T.argmax(py_x, axis=1)


# 定义损失函数和迭代规则：

# In[9]:


cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w, w2, w3, w4, w_o]
updates = RMSprop(cost, params, lr=0.001)


# 开始训练：

# In[10]:


train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(50):
    for start, end in zip(range(0, len(trX), 128), range(128, len(trX), 128)):
        cost = train(trX[start:end], trY[start:end])
    print "iter {:03d}, {:.3f}".format(i + 1, np.mean(np.argmax(teY, axis=1) == predict(teX)))