The notMNIST dataset is a collection of 28x28 images of the letters 'A' through 'J', created by Yaroslav Bulatov. More information about the dataset can be found on his blog.
The dataset consists of two parts: a large set (used here for training and validation) and a small set (used here for testing). Both can be downloaded from the links (large, small) made available through the Udacity machine learning course.
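Before training, it helps to eyeball the data. Below is a minimal sketch for viewing a single letter from the pickle used throughout this post; it assumes matplotlib is installed, and the name sample_save is made up for illustration:
import matplotlib.pyplot as plt
from six.moves import cPickle as pickle
with open('notMNIST.pickle', 'rb') as f:
    sample_save = pickle.load(f)  # same pickle layout as the loading code below
plt.imshow(sample_save['train_dataset'][0], cmap='gray')  # one 28x28 letter
plt.show()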
import numpy as np
from six.moves import cPickle as pickle
from sklearn.linear_model import LogisticRegression
# Load the datasets and labels from the previously saved pickle.
pickle_file = 'notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help the gc free up memory
print 'Training set: ', train_dataset.shape, train_labels.shape
print 'Validation set: ', valid_dataset.shape, valid_labels.shape
print 'Test set: ', test_dataset.shape, test_labels.shape
# Reformat the image data into flat vectors
nsamples, nx, ny = train_dataset.shape
train_dataset = train_dataset.reshape((nsamples, nx*ny))
nsamples1, nx1, ny1 = test_dataset.shape
test_dataset = test_dataset.reshape((nsamples1, nx1*ny1))
# The validation set is left unflattened; this sklearn model does not use it.
print 'After reformatting the data'
print 'Training set: ', train_dataset.shape, train_labels.shape
print 'Validation set: ', valid_dataset.shape, valid_labels.shape
print 'Test set: ', test_dataset.shape, test_labels.shape
# Train a logistic regression model
logistic = LogisticRegression()  # use n_jobs to assign more CPUs if available
logistic.fit(train_dataset, train_labels)
# Calculate the accuracy on the test set
print 'Accuracy: ', logistic.score(test_dataset, test_labels)*100
Training set: (400000, 28, 28) (400000,)
Validation set: (100000, 28, 28) (100000,)
Test set: (18000, 28, 28) (18000,)
After reformatting the data
Training set: (400000, 784) (400000,)
Validation set: (100000, 28, 28) (100000,)
Test set: (18000, 784) (18000,)
Accuracy: 89.45
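The validation split goes unused by this model. If a validation score is wanted as well, flatten that split the same way first; a minimal sketch, with the variable names (nsamples2, nx2, ny2) simply following the pattern above:
nsamples2, nx2, ny2 = valid_dataset.shape
valid_dataset = valid_dataset.reshape((nsamples2, nx2*ny2))
print 'Validation accuracy: ', logistic.score(valid_dataset, valid_labels)*100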
import numpy as np
import random
import tensorflow as tf
from six.moves import cPickle as pickle
# Load the datasets and labels from the previously saved pickle.
pickle_file = '/scratch/piyadasa/deep_learning_udacity/ass1/notMNIST.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    train_dataset = save['train_dataset']
    train_labels = save['train_labels']
    valid_dataset = save['valid_dataset']
    valid_labels = save['valid_labels']
    test_dataset = save['test_dataset']
    test_labels = save['test_labels']
    del save  # hint to help the gc free up memory
print 'Training set: ', train_dataset.shape, train_labels.shape
print 'Validation set: ', valid_dataset.shape, valid_labels.shape
print 'Test set: ', test_dataset.shape, test_labels.shape
# Reformat the image data into flat vectors and the labels into one-hot encodings
image_size = 28
num_labels = 10
def reformat(dataset, labels):
    dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
    # Map 0 to [1.0, 0.0, 0.0 ...], 1 to [0.0, 1.0, 0.0 ...], etc. (one-hot)
    labels = (np.arange(num_labels) == labels[:, None]).astype(np.float32)
    return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print 'After reformatting the data'
print 'Training set: ', train_dataset.shape, train_labels.shape
print 'Validation set: ', valid_dataset.shape, valid_labels.shape
print 'Test set: ', test_dataset.shape, test_labels.shape
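The label mapping inside reformat relies on numpy broadcasting: comparing the row vector np.arange(num_labels) against a column of labels produces exactly one True per row, i.e. a one-hot matrix. A tiny illustration with arbitrarily chosen labels:
example_labels = np.array([0, 2, 9])
print (np.arange(num_labels) == example_labels[:, None]).astype(np.float32)
# [[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
#  [ 0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
#  [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]]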
# We train with stochastic gradient descent (SGD), feeding a small batch of training data at each step
batch_size = 2048
graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    # Variables.
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax(tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)
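For intuition, here is a minimal numpy sketch of what the fused loss op above computes: a softmax over each row of logits, then cross-entropy against the one-hot labels, averaged over the batch. The function name is made up for illustration, and the TensorFlow op is more numerically stable:
def softmax_xent_sketch(logits, labels):
    shifted = np.exp(logits - logits.max(axis=1, keepdims=True))  # subtract row max for stability
    probs = shifted / shifted.sum(axis=1, keepdims=True)          # row-wise softmax
    return np.mean(-np.sum(labels * np.log(probs), axis=1))       # mean cross-entropy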
# Define the accuracy function: the percentage of rows where the predicted class
# (argmax of the softmax output) matches the true class (argmax of the one-hot label)
def accuracy(predictions, labels):
    return 100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1)) / predictions.shape[0]
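A quick sanity check of the helper on toy one-hot data, where the first row is predicted correctly and the second is not:
toy_preds = np.array([[0.8, 0.1, 0.1], [0.2, 0.7, 0.1]])
toy_labels = np.array([[1.0, 0.0, 0.0], [0.0, 0.0, 1.0]])
print accuracy(toy_preds, toy_labels)  # prints 50.0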
# Run the graph
num_steps = 10001
with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()  # tf.global_variables_initializer() on newer TF 1.x
    for step in range(num_steps):
        # Pick a random sample of the training data for this minibatch
        sample = random.sample(range(len(train_labels)), batch_size)
        batch_data = train_dataset[sample]
        batch_labels = train_labels[sample]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset: batch_data, tf_train_labels: batch_labels}
        _, l, predictions = session.run([optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 1000 == 0):
            print "Minibatch loss at step %d: %f" % (step, l)
            print "Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels)
            print "Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels)
    print "Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels)
Training set: (400000, 28, 28) (400000,)
Validation set: (100000, 28, 28) (100000,)
Test set: (18000, 28, 28) (18000,)
After reformatting the data
Training set: (400000, 784) (400000, 10)
Validation set: (100000, 784) (100000, 10)
Test set: (18000, 784) (18000, 10)
Minibatch loss at step 0: 17.024046
Minibatch accuracy: 7.4%
Validation accuracy: 10.4%
Minibatch loss at step 1000: 1.163157
Minibatch accuracy: 78.1%
Validation accuracy: 77.0%
Minibatch loss at step 2000: 0.998228
Minibatch accuracy: 77.8%
Validation accuracy: 78.3%
Minibatch loss at step 3000: 0.843042
Minibatch accuracy: 79.6%
Validation accuracy: 79.3%
Minibatch loss at step 4000: 0.767427
Minibatch accuracy: 81.1%
Validation accuracy: 80.0%
Minibatch loss at step 5000: 0.756067
Minibatch accuracy: 80.1%
Validation accuracy: 80.5%
Minibatch loss at step 6000: 0.725641
Minibatch accuracy: 81.8%
Validation accuracy: 81.0%
Minibatch loss at step 7000: 0.681358
Minibatch accuracy: 82.2%
Validation accuracy: 81.3%
Minibatch loss at step 8000: 0.672609
Minibatch accuracy: 82.6%
Validation accuracy: 81.6%
Minibatch loss at step 9000: 0.652013
Minibatch accuracy: 82.1%
Validation accuracy: 81.8%
Minibatch loss at step 10000: 0.643500
Minibatch accuracy: 82.6%
Validation accuracy: 82.0%
Test accuracy: 89.0%