这是一次初步尝试:使用 Python 的 nolearn 包来完成 Kaggle 上的一个手写识别项目。参考网址:
# 加载必要的包
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from nolearn.dbn import DBN
import numpy as np
import pandas as pd
# Read the training file.
data = pd.read_csv('train.csv')
# Boolean mask selecting every column except the target column 'label'.
select_x = data.columns != 'label'
# Separate the features (X) from the target (Y).
# `.loc` is used instead of the original `.ix` indexer: `.ix` was deprecated
# in pandas 0.20 and removed in pandas 1.0, while `.loc` accepts the same
# boolean column mask. The features are scaled by 1/255.
train = data.loc[:, select_x] / 255
label = data.label.values
# Split the data into a training part and a held-out validation part (33%).
(trainX, testX, trainY, testY) = train_test_split(train, label, test_size=0.33)
# Set the training parameters. The first layer size is taken from the
# number of feature columns in the training split; the remaining sizes
# (300 and 10) are fixed.
dbn = DBN(
    [trainX.shape[1], 300, 10],
    learn_rates=0.3,
    learn_rate_decays=0.9,
    epochs=10,
    verbose=1,
)
# Fit the network on the training split only; the held-out split is kept
# for the evaluation that follows.
dbn.fit(trainX, trainY)
[DBN] fitting X.shape=(28140, 784) [DBN] layers [784, 300, 10] [DBN] Fine-tune...
100%
Epoch 1:
100%
loss 0.319482779406 err 0.0959923120729 (0:00:05) Epoch 2:
100%
loss 0.183782190051 err 0.0526053530752 (0:00:04) Epoch 3:
100%
loss 0.123825213575 err 0.0371583143508 (0:00:05) Epoch 4:
100%
loss 0.0943740354514 err 0.0265162300683 (0:00:04) Epoch 5:
100%
loss 0.0645239451027 err 0.0194333712984 (0:00:04) Epoch 6:
100%
loss 0.0383955460442 err 0.0120301822323 (0:00:04) Epoch 7:
100%
loss 0.0307188150074 err 0.00904043280182 (0:00:05) Epoch 8:
100%
loss 0.0216088450423 err 0.00615746013667 (0:00:05) Epoch 9:
100%
loss 0.0164434705575 err 0.00491173120729 (0:00:05) Epoch 10: loss 0.009670517364 err 0.00231349658314 (0:00:05)
# Compare the predictions on the held-out split with the true labels.
preds = dbn.predict(testX)
# Parenthesised single-argument print: behaves the same as the original
# Python 2 print statement and is also valid Python 3 (where the bare
# `print x` form is a SyntaxError).
print(classification_report(testY, preds))
             precision    recall  f1-score   support
          0       0.98      0.99      0.98      1401
          1       0.99      0.99      0.99      1542
          2       0.97      0.98      0.97      1373
          3       0.98      0.95      0.97      1496
          4       0.98      0.98      0.98      1306
          5       0.97      0.97      0.97      1269
          6       0.98      0.98      0.98      1372
          7       0.98      0.98      0.98      1422
          8       0.96      0.97      0.97      1334
          9       0.97      0.96      0.96      1345
avg / total       0.98      0.98      0.98     13860
# Read the Kaggle competition test file, apply the same 1/255 scaling that
# was applied to the training features, and keep the underlying array.
test = (pd.read_csv('test.csv') / 255).values
# Likewise swap the training-feature DataFrame for its underlying array.
train = train.values
# Retrain from scratch on the entire training data (no held-out split),
# with more epochs and with progress output turned off.
dbn = DBN(
    [train.shape[1], 300, 10],
    learn_rates=0.3,
    learn_rate_decays=0.9,
    epochs=20,
    verbose=0,
)
dbn.fit(train, label)
# Predict on the competition test set and assemble the submission table.
preds = dbn.predict(test)
Label = pd.Series(preds)
# Image ids are 1-based: 1, 2, ..., len(Label).
ImageId = pd.Series(range(len(Label))) + 1
# `axis` is passed by keyword: positional arguments to `pd.concat` other
# than `objs` were deprecated in pandas 1.3 and are rejected by pandas 2.0.
sub = pd.concat([ImageId, Label], axis=1)
sub.columns = ['ImageId', 'Label']
# Preview the first rows (only displayed when run interactively).
sub.head()
|   | ImageId | Label |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 2 | 0 |
| 2 | 3 | 9 |
| 3 | 4 | 9 |
| 4 | 5 | 3 |
# Write the submission table to disk; the DataFrame index is left out so
# the file contains only the ImageId and Label columns.
sub.to_csv('sub.csv', index=False)