The first step is to download the handwritten image dataset.
%pylab inline
# Fetch the MNIST handwritten digit dataset
from sklearn.datasets import fetch_mldata
mnist = fetch_mldata('MNIST original', data_home="../data")
Populating the interactive namespace from numpy and matplotlib
Now let's explore the data.
# Display the number of samples and pixels per sample.
# (Python 2 print statements converted to the print() function so the
# cell runs on Python 3.)
print("(Number of samples, No. of pixels) = {}".format(mnist.data.shape))
# Display 9 digits selected at random in a 3x3 grid.
# NOTE: the loop body had lost its indentation in the original; restored here.
# subplot/randint/axis/title/imshow come from the %pylab inline namespace.
for c in range(1, 10):
    subplot(3, 3, c)
    # Pick a random sample index and reshape its flat 784-pixel row
    # back into a 28x28 image for display.
    i = randint(mnist.data.shape[0])
    im = mnist.data[i].reshape((28, 28))
    axis("off")
    title("Label = {}".format(mnist.target[i]))
    imshow(im, cmap='gray')
(Number of samples, No. of pixels) = (70000, 784)
# Split the data into training and test sets (5% held out for testing).
# sklearn.cross_validation was removed in scikit-learn 0.20;
# sklearn.model_selection is the replacement module with the same function.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    mnist.data, mnist.target, test_size=0.05, random_state=42)
# NOTE(review): the original comment claimed this is "the same as" plain
# slicing (data[:split] / data[split:]). It is NOT: train_test_split
# shuffles before splitting (seeded by random_state), whereas slicing
# would keep MNIST's sorted-by-digit order and give a badly skewed split.
# Create the Multinomial Naive Bayes classifier (suits non-negative
# count-like features such as 0-255 pixel intensities).
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
# Fit the model on the training data.
# (The original comment here said "Perform the predictions" — a
# copy-paste duplicate of the comment below; fixed.)
clf.fit(x_train, y_train)
# Perform the predictions on the held-out test data.
y_predicted = clf.predict(x_test)
# Calculate the accuracy of the prediction.
# (Python 2 print statements converted to the print() function so the
# cell runs on Python 3; runtime strings kept byte-identical.)
from sklearn.metrics import accuracy_score
print("Accuracy = {} %".format(accuracy_score(y_test, y_predicted) * 100))
# Per-class precision / recall / F1 for each of the ten digits.
from sklearn.metrics import classification_report
print("Classification Report \n {}".format(
    classification_report(y_test, y_predicted, labels=range(0, 10))))
Accuracy = 81.7142857143 % Classification Report precision recall f1-score support 0 0.91 0.89 0.90 320 1 0.85 0.95 0.90 387 2 0.89 0.84 0.87 352 3 0.78 0.81 0.80 356 4 0.83 0.73 0.78 329 5 0.85 0.67 0.75 342 6 0.90 0.91 0.91 377 7 0.94 0.82 0.87 360 8 0.63 0.72 0.67 337 9 0.66 0.80 0.72 340 avg / total 0.83 0.82 0.82 3500