#!/usr/bin/env python
# coding: utf-8

# # Theano 实例：Softmax 回归

# ## MNIST 数据集的下载和导入

# [MNIST 数据集](http://yann.lecun.com/exdb/mnist/) 是一个手写数字组成的数据集，现在被当作一个机器学习算法评测的基准数据集。
# 
# 这是一个下载并解压数据的脚本：

# In[1]:


# Write the helper script `download_mnist.py` to disk through the IPython `file`
# cell magic. The script creates a local `mnist/` directory if needed, fetches
# each of the four MNIST `.gz` archives from http://yann.lecun.com/exdb/ unless
# the archive already exists, and gunzips each one unless the extracted file
# already exists.
# NOTE(review): the script calls `urllib.urlretrieve`, which is the Python 2
# location of that function; confirm the interpreter before running it.
get_ipython().run_cell_magic('file', 'download_mnist.py', "import os\nimport os.path\nimport urllib\nimport gzip\nimport shutil\n\nif not os.path.exists('mnist'):\n    os.mkdir('mnist')\n\ndef download_and_gzip(name):\n    if not os.path.exists(name + '.gz'):\n        urllib.urlretrieve('http://yann.lecun.com/exdb/' + name + '.gz', name + '.gz')\n    if not os.path.exists(name):\n        with gzip.open(name + '.gz', 'rb') as f_in, open(name, 'wb') as f_out:\n            shutil.copyfileobj(f_in, f_out)\n            \ndownload_and_gzip('mnist/train-images-idx3-ubyte')\ndownload_and_gzip('mnist/train-labels-idx1-ubyte')\ndownload_and_gzip('mnist/t10k-images-idx3-ubyte')\ndownload_and_gzip('mnist/t10k-labels-idx1-ubyte')\n")


# 可以运行这个脚本来下载和解压数据：

# In[2]:


# Execute the download script written above with the IPython `run` magic, so
# the four MNIST files end up extracted under `mnist/`.
get_ipython().run_line_magic('run', 'download_mnist.py')


# 使用如下的脚本来导入 MNIST 数据，源码地址：
# 
# https://github.com/Newmu/Theano-Tutorials/blob/master/load.py

# In[3]:


# Write the loader module `load.py` to disk through the IPython `file` cell
# magic. The module defines:
#   * one_hot(x, n): turns integer labels into a (len(x), n) matrix of zeros
#     with a 1 in each row at the label's column.
#   * mnist(ntrain, ntest, onehot): reads the four raw files under `mnist/` as
#     uint8, skips the first 16 bytes of each image file and the first 8 bytes
#     of each label file, reshapes images to (N, 28*28) floats divided by 255,
#     truncates to ntrain / ntest rows, and returns (trX, teX, trY, teY) with
#     labels one-hot encoded when `onehot` is true.
# NOTE(review): the module opens the data files without an explicit mode and
# never closes them; confirm this is acceptable on the target platform.
get_ipython().run_cell_magic('file', 'load.py', "import numpy as np\nimport os\n\ndatasets_dir = './'\n\ndef one_hot(x,n):\n\tif type(x) == list:\n\t\tx = np.array(x)\n\tx = x.flatten()\n\to_h = np.zeros((len(x),n))\n\to_h[np.arange(len(x)),x] = 1\n\treturn o_h\n\ndef mnist(ntrain=60000,ntest=10000,onehot=True):\n\tdata_dir = os.path.join(datasets_dir,'mnist/')\n\tfd = open(os.path.join(data_dir,'train-images-idx3-ubyte'))\n\tloaded = np.fromfile(file=fd,dtype=np.uint8)\n\ttrX = loaded[16:].reshape((60000,28*28)).astype(float)\n\n\tfd = open(os.path.join(data_dir,'train-labels-idx1-ubyte'))\n\tloaded = np.fromfile(file=fd,dtype=np.uint8)\n\ttrY = loaded[8:].reshape((60000))\n\n\tfd = open(os.path.join(data_dir,'t10k-images-idx3-ubyte'))\n\tloaded = np.fromfile(file=fd,dtype=np.uint8)\n\tteX = loaded[16:].reshape((10000,28*28)).astype(float)\n\n\tfd = open(os.path.join(data_dir,'t10k-labels-idx1-ubyte'))\n\tloaded = np.fromfile(file=fd,dtype=np.uint8)\n\tteY = loaded[8:].reshape((10000))\n\n\ttrX = trX/255.\n\tteX = teX/255.\n\n\ttrX = trX[:ntrain]\n\ttrY = trY[:ntrain]\n\n\tteX = teX[:ntest]\n\tteY = teY[:ntest]\n\n\tif onehot:\n\t\ttrY = one_hot(trY, 10)\n\t\tteY = one_hot(teY, 10)\n\telse:\n\t\ttrY = np.asarray(trY)\n\t\tteY = np.asarray(teY)\n\n\treturn trX,teX,trY,teY\n")


# ## softmax 回归

# `Softmax` 回归相当于 `Logistic` 回归的一个一般化，`Logistic` 回归处理的是两类问题，`Softmax` 回归处理的是 `N` 类问题。
# 
# `Logistic` 回归输出的是标签为 1 的概率（标签为 0 的概率也就知道了），对应地，对 N 类问题 `Softmax` 输出的是每个类对应的概率。
# 
# 具体的内容，可以参考 `UFLDL` 教程：
# 
# http://ufldl.stanford.edu/wiki/index.php/Softmax%E5%9B%9E%E5%BD%92

# In[4]:


import theano
from theano import tensor as T
import numpy as np
from load import mnist


# 我们来看它具体的实现。
# 
# 这两个函数一个是将数据转化为 `GPU` 计算的类型，另一个是初始化权重：

# In[5]:


def floatX(X):
    """Return X as a NumPy array whose dtype is Theano's configured float type.

    The target dtype is read from ``theano.config.floatX`` at call time.
    """
    target_dtype = theano.config.floatX
    return np.asarray(X, dtype=target_dtype)

def init_weights(shape):
    """Create a Theano shared variable of the given shape with small random values.

    Values are drawn from a standard normal distribution, scaled by 0.01, and
    converted with ``floatX`` before being wrapped in ``theano.shared``.
    """
    initial_values = np.random.randn(*shape) * 0.01
    return theano.shared(floatX(initial_values))


# `Softmax` 的模型在 `theano` 中已经实现好了：

# In[6]:


# Sanity check of Theano's built-in softmax: compile it as a function of one
# symbolic matrix and apply it to a small random 3 x 4 input.
A = T.matrix()

B = T.nnet.softmax(A)

test_softmax = theano.function([A], B)

a = floatX(np.random.rand(3, 4))

b = test_softmax(a)

# Call-form print with a single argument produces the same output under the
# Python 2 print statement and the Python 3 print function.
print(b.shape)

# Row sums
print(b.sum(1))


# `softmax` 函数会按照行对矩阵进行 `Softmax` 归一化。

# 所以我们的模型为：

# In[7]:


def model(X, w):
    """Softmax regression: apply softmax to the product of X and w.

    Returns the symbolic result of ``T.nnet.softmax`` on ``T.dot(X, w)``.
    """
    scores = T.dot(X, w)
    return T.nnet.softmax(scores)


# 导入数据：

# In[8]:


# Load MNIST through load.mnist with its default sizes (60000 training and
# 10000 test rows). Images come back as rows of 28*28 floats scaled by 1/255,
# and labels are one-hot encoded over 10 classes because onehot=True.
trX, teX, trY, teY = mnist(onehot=True)


# 定义变量，并初始化权重：

# In[9]:


# Symbolic input matrices: X receives a batch of images, Y the matching
# one-hot targets.
X, Y = T.fmatrix(), T.fmatrix()

# Weight matrix with one row per input pixel (784) and one column per class (10).
w = init_weights(shape=(784, 10))


# 定义模型输出和预测：

# In[10]:


# py_x is the symbolic softmax output of the model for the input batch X.
py_x = model(X, w)
# The predicted class for each row is the column index of its largest output.
y_pred = T.argmax(py_x, axis=1)


# 损失函数为多类的交叉熵，这个在 `theano` 中也被定义好了：

# In[11]:


# Loss: categorical cross-entropy between the model output and the targets Y,
# averaged with T.mean.
cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
# Symbolic gradient of the loss with respect to the weights.
gradient = T.grad(cost=cost, wrt=w)
# Plain gradient-descent step with a fixed step size of 0.05, expressed as a
# [shared_variable, new_value] pair for theano.function's `updates` argument.
update = [[w, w - gradient * 0.05]]


# 编译 `train` 和 `predict` 函数：

# In[12]:


# train(X, Y): returns the loss for the batch and applies the weight update
# defined in `update` as a side effect of each call.
# allow_input_downcast=True is set so NumPy inputs of a wider dtype are accepted
# for the fmatrix inputs (load.py produces its arrays with astype(float)).
train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
# predict(X): returns the predicted class index for every row of X.
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)


# 迭代 100 次，测试集正确率为 0.925：

# In[13]:


# Train for 100 epochs of mini-batch gradient descent and print the test-set
# accuracy after every epoch.
batch_size = 128
# The one-hot test labels never change, so decode them to class indices once
# instead of on every epoch.
te_labels = np.argmax(teY, axis=1)

for i in range(100):
    # Iterate over every batch start. Slicing clamps at the end of the array,
    # so the final, shorter batch is trained on too. The previous zip of two
    # ranges stopped one batch early and never used the trailing samples.
    for start in range(0, len(trX), batch_size):
        end = start + batch_size
        # NOTE: this rebinds the module-level name `cost` from the symbolic
        # expression to the numeric loss of the last batch; `train` is already
        # compiled, so it is unaffected.
        cost = train(trX[start:end], trY[start:end])
    accuracy = np.mean(te_labels == predict(teX))
    # A single pre-formatted argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("{0:03d} {1}".format(i, accuracy))