### 随机梯度下降¶

$$L ( W ) = \frac { 1 } { N } \sum _ { i = 1 } ^ { N } L _ { i } \left( x _ { i } , y _ { i } , W \right)$$

$$\nabla _ { W } L ( W ) = \frac { 1 } { N } \sum _ { i = 1 } ^ { N } \nabla _ { W } L _ { i } \left( x _ { i } , y _ { i } , W \right)$$

SGD 示例:

In [2]:
class Dataloader(object):
    """Minimal mini-batch loader: iterating yields (data, labels) batches.

    Indexes `data`/`labels` along axis 0; when `shuffle` is True the sample
    order is re-randomized at the start of every iteration pass.
    (Restored: indentation was lost in the notebook export.)
    """

    def __init__(self, data, labels, batch_size, shuffle=True):
        self.data = data                # array-like, samples along axis 0
        self.batch_size = batch_size
        self.shuffle = shuffle          # reshuffle order on each __iter__
        self.labels = labels

    def __getitem__(self, index):
        # Single-sample access: (data[index], labels[index]).
        return self.data[index], self.labels[index]

    def __iter__(self):
        datasize = self.data.shape[0]
        data_seq = np.arange(datasize)
        if self.shuffle:
            np.random.shuffle(data_seq)
        # Batch boundaries [0, bs, 2*bs, ..., datasize]; last batch may be short.
        interval_list = np.append(np.arange(0, datasize, self.batch_size), datasize)
        for index in range(interval_list.shape[0] - 1):
            s = data_seq[interval_list[index]:interval_list[index + 1]]
            yield self.data[s], self.labels[s]

    def __len__(self):
        # Number of samples (not number of batches).
        return self.data.shape[0]


### 线性层实现¶

在《Softmax、K-L 散度、交叉熵和 Cross Entropy Loss 的推导和实现》中推导了交叉熵损失函数；这里在此基础上实现一个线性分类器。

$$\frac{\partial y}{\partial W}=x^T$$
$$\frac{\partial y}{\partial b}=1$$

In [1]:
import numpy as np

class Linear(object):
    """Fully connected layer computing y = x @ W + b.

    (Restored: indentation was lost in the notebook export.)
    """

    def __init__(self, D_in, D_out):
        # Small random init (std ~0.01) keeps the initial logits near zero.
        self.weight = np.random.randn(D_in, D_out).astype(np.float32) * 0.01
        self.bias = np.zeros((1, D_out), dtype=np.float32)

    def forward(self, input):
        # Cache the input: a backward pass needs it to form dL/dW = x^T @ dL/dy
        # (see the derivative formulas above).
        self.data = input
        return np.dot(self.data, self.weight) + self.bias

# Update parameters
# NOTE(review): no backward()/parameter-update method is implemented here,
# and the training loop below never applies gradients to weight/bias —
# the update step appears to be missing from this export; confirm.

In [ ]:
from utils import read_mnist
from nn import CrossEntropyLossLayer, lr_scheduler

# Read and normalize the data — without normalization the loss becomes NaN.
# NOTE(review): this cell never assigns `data`/`labels`; a read_mnist(...)
# call was presumably lost in the export — confirm against the notebook.
# One-hot encode the labels (10 digit classes).
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
encoder.fit(np.arange(10).reshape((-1, 1)))

loss_layer = CrossEntropyLossLayer()
lr = 0.1
D, C = 784, 10          # input dimension (28*28 pixels), number of classes
np.random.seed(1)       # fix the randomly generated initial weights
best_acc = -float('inf')
max_iter = 900
step_size = 400         # decay interval for the learning-rate scheduler
scheduler = lr_scheduler(lr, step_size)
loss_list = []

batch_size = 120

linear_classifer = Linear(D, C)

from tqdm import tqdm_notebook
for epoch in tqdm_notebook(range(max_iter)):
    # --- evaluation ---
    correct = 0
    test_pred = linear_classifer.forward(data)
    pred_labels = np.argmax(test_pred, axis=1)
    real_labels = np.argmax(labels, axis=1)
    correct += np.sum(pred_labels == real_labels)
    # BUGFIX: `acc` was referenced but never computed (NameError in the
    # original cell); accuracy = fraction of correctly classified samples.
    acc = correct / data.shape[0]
    if acc > best_acc:
        best_acc = acc
    # --- training ---
    total_loss = 0
    train_pred = linear_classifer.forward(data)
    loss = loss_layer.forward(train_pred, labels)
    total_loss += loss
    loss_layer.backward()
    # NOTE(review): the gradient from loss_layer.backward() is never applied
    # to linear_classifer.weight/bias — the SGD update step seems missing.
    loss_list.append(total_loss)
    scheduler.step()

In [18]:
best_acc

Out[18]:
0.967

In [16]:
import matplotlib.pyplot as plt

# Plot the per-epoch training loss recorded during the loop above.
epochs = np.arange(max_iter)
plt.plot(epochs, loss_list)
plt.show()