Digit Recognizer using Multiclass Classification

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

Preparing data

Download data from https://www.kaggle.com/c/digit-recognizer/data

In [2]:
# Problem dimensions
n_input = 28 * 28  # one feature per pixel of a flattened 28x28 image (784)
n_classes = 10     # one class per digit, 0-9

# Number of leading training rows set aside for validation
validation_size = 2000
In [39]:
# Load the Kaggle digit-recognizer CSVs; the paths are relative to the
# notebook's working directory.
train = pd.read_csv('../input/digit-recognizer/train.csv')
test  = pd.read_csv('../input/digit-recognizer/test.csv')
In [40]:
# Sanity check: (rows, columns) of each loaded frame.
print(train.shape)
print(test.shape)
(42000, 785)
(28000, 784)
In [5]:
# Preview the first training rows: a `label` column followed by the pixel columns.
train.head()
Out[5]:
label pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 1 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 4 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 785 columns

In [6]:
# Preview the first test rows: pixel columns only, no `label` column.
test.head()
Out[6]:
pixel0 pixel1 pixel2 pixel3 pixel4 pixel5 pixel6 pixel7 pixel8 pixel9 ... pixel774 pixel775 pixel776 pixel777 pixel778 pixel779 pixel780 pixel781 pixel782 pixel783
0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 784 columns

Split the training data into labels and pixels.

In [7]:
# Column 0 of `train` is the digit label; every remaining column is a pixel.
# `.iloc` (purely positional) replaces `.ix`, which was deprecated and later
# removed from pandas; it selects the same columns here.
features = train.iloc[:, 1:].values.astype('float32')
labels = pd.get_dummies(train.iloc[:, 0]).astype('float32')  # one-hot encoding
In [8]:
# Confirm the split: one pixel row and one one-hot label row per training sample.
print(features.shape)
print(labels.shape)
(42000, 784)
(42000, 10)
In [9]:
# split data into training & validation
valid_features = features[:validation_size]
valid_labels = labels[:validation_size]

train_features = features[validation_size:]
train_labels = labels[validation_size:]
In [10]:
# Shapes of the training and validation partitions (features, then labels).
print(train_features.shape)
print(train_labels.shape)
print(valid_features.shape)
print(valid_labels.shape)
(40000, 784)
(40000, 10)
(2000, 784)
(2000, 10)
In [11]:
# Test pixels as a float32 array, the same dtype the feature placeholder is declared with.
test_features = test.values.astype(np.float32)
In [12]:
# Sanity check: one row of pixel features per test image.
print(test_features.shape)
(28000, 784)

Make a TensorFlow Graph

In [13]:
# Features and Labels
# Features and Labels
# NOTE: these placeholders rebind the names `features` and `labels`, which the
# earlier data-preparation cells used for the training pixel array and the
# one-hot label frame. Cells that slice those arrays must run before this one;
# re-running them afterwards would operate on the placeholders instead.
features = tf.placeholder(tf.float32, [None, n_input])
labels = tf.placeholder(tf.float32, [None, n_classes])

# Weights & bias
# A single linear layer: one weight column and one bias term per class, both
# drawn from tf.random_normal. No seed is set in the visible cells, so results
# will differ between runs.
weights = tf.Variable(tf.random_normal([n_input, n_classes]))
bias = tf.Variable(tf.random_normal([n_classes]))

# Logits - xW + b
logits = tf.add(tf.matmul(features, weights), bias)

# Define loss and optimizer
# The learning rate is a placeholder, so its value is supplied via feed_dict
# on every optimizer step.
learning_rate = tf.placeholder(tf.float32)
# Mean softmax cross-entropy across the rows that are fed in.
# NOTE(review): the first recorded epoch cost is ~2.8e3; presumably this comes
# from feeding unscaled pixel values into randomly initialised weights -
# confirm, and consider scaling the inputs.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Calculate accuracy
# `predict` is the index of the largest logit in each row; accuracy is the
# fraction of rows where that index equals the index of the largest label entry.
predict = tf.argmax(logits, 1)
correct_prediction = tf.equal(predict, tf.argmax(labels, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Op that initialises all variables; it has to be run in a session before training.
init = tf.global_variables_initializer()

Training

Define helper functions.

In [14]:
def print_epoch_stats(epoch_i, sess, last_features, last_labels):
    """
    Report progress at the end of an epoch.

    Evaluates `cost` on the supplied batch and `accuracy` on the validation
    split, then prints both on a single line.

    :param epoch_i: Epoch index shown in the printed line
    :param sess: Session used to evaluate the graph tensors
    :param last_features: Feature batch the cost is evaluated on
    :param last_labels: Label batch the cost is evaluated on
    """
    batch_feed = {features: last_features, labels: last_labels}
    valid_feed = {features: valid_features, labels: valid_labels}

    current_cost = sess.run(cost, feed_dict=batch_feed)
    valid_accuracy = sess.run(accuracy, feed_dict=valid_feed)

    template = 'Epoch: {:<4} - Cost: {:<8.3} Valid Accuracy: {:<5.3}'
    print(template.format(epoch_i, current_cost, valid_accuracy))
In [15]:
import math
def batches(batch_size, features, labels):
    """
    Slice features and labels into consecutive, aligned mini-batches.

    The final batch is shorter when the sample count is not a multiple of
    ``batch_size``; empty inputs give an empty list.

    :param batch_size: Maximum number of samples per batch
    :param features: Sliceable sequence of features
    :param labels: Sliceable sequence of labels, same length as ``features``
    :return: List of ``[feature_slice, label_slice]`` pairs in input order
    """
    assert len(features) == len(labels)

    total = len(features)
    return [
        [features[lo:lo + batch_size], labels[lo:lo + batch_size]]
        for lo in range(0, total, batch_size)
    ]

HyperParameters

In [16]:
# Training hyperparameters
batch_size = 128    # samples per optimizer step
epochs = 100        # full passes over the training batches
learn_rate = 1e-4   # value fed to the `learning_rate` placeholder
In [17]:
# Slice the training split into mini-batches once; the training loop iterates
# over this same list, in the same order, on every epoch.
train_batches = batches(batch_size, train_features, train_labels)
In [37]:
with tf.Session() as sess:
    # Initialise weights and bias before the first training step.
    sess.run(init)

    # Training cycle
    for epoch_i in range(epochs):

        # Loop over all batches
        for batch_features, batch_labels in train_batches:
            train_feed_dict = {
                features: batch_features,
                labels: batch_labels,
                learning_rate: learn_rate}
            # One optimizer step on this mini-batch.
            sess.run(optimizer, feed_dict=train_feed_dict)

        # Print cost and validation accuracy of an epoch
        # batch_features/batch_labels still hold the final mini-batch of the
        # epoch here, so the printed cost is for that batch only, not an
        # average over the whole epoch.
        print_epoch_stats(epoch_i, sess, batch_features, batch_labels)

    # Predict classes for the test set inside the session, where the trained
    # variable values live. Only `features` is fed because `predict` is
    # computed from the logits alone.
    predictions = sess.run(
                        predict, 
                        feed_dict={features: test_features})
Epoch: 0    - Cost: 2.76e+03 Valid Accuracy: 0.114
Epoch: 1    - Cost: 2.43e+03 Valid Accuracy: 0.143
Epoch: 2    - Cost: 2.14e+03 Valid Accuracy: 0.174
Epoch: 3    - Cost: 1.87e+03 Valid Accuracy: 0.212
Epoch: 4    - Cost: 1.64e+03 Valid Accuracy: 0.25 
Epoch: 5    - Cost: 1.46e+03 Valid Accuracy: 0.294
Epoch: 6    - Cost: 1.3e+03  Valid Accuracy: 0.331
Epoch: 7    - Cost: 1.17e+03 Valid Accuracy: 0.373
Epoch: 8    - Cost: 1.05e+03 Valid Accuracy: 0.411
Epoch: 9    - Cost: 9.48e+02 Valid Accuracy: 0.439
Epoch: 10   - Cost: 8.55e+02 Valid Accuracy: 0.463
Epoch: 11   - Cost: 7.71e+02 Valid Accuracy: 0.491
Epoch: 12   - Cost: 6.96e+02 Valid Accuracy: 0.516
Epoch: 13   - Cost: 6.27e+02 Valid Accuracy: 0.537
Epoch: 14   - Cost: 5.68e+02 Valid Accuracy: 0.558
Epoch: 15   - Cost: 5.18e+02 Valid Accuracy: 0.582
Epoch: 16   - Cost: 4.72e+02 Valid Accuracy: 0.604
Epoch: 17   - Cost: 4.36e+02 Valid Accuracy: 0.623
Epoch: 18   - Cost: 4.06e+02 Valid Accuracy: 0.64 
Epoch: 19   - Cost: 3.83e+02 Valid Accuracy: 0.658
Epoch: 20   - Cost: 3.62e+02 Valid Accuracy: 0.668
Epoch: 21   - Cost: 3.42e+02 Valid Accuracy: 0.679
Epoch: 22   - Cost: 3.24e+02 Valid Accuracy: 0.689
Epoch: 23   - Cost: 3.11e+02 Valid Accuracy: 0.695
Epoch: 24   - Cost: 3.01e+02 Valid Accuracy: 0.706
Epoch: 25   - Cost: 2.91e+02 Valid Accuracy: 0.714
Epoch: 26   - Cost: 2.81e+02 Valid Accuracy: 0.724
Epoch: 27   - Cost: 2.72e+02 Valid Accuracy: 0.73 
Epoch: 28   - Cost: 2.63e+02 Valid Accuracy: 0.738
Epoch: 29   - Cost: 2.56e+02 Valid Accuracy: 0.743
Epoch: 30   - Cost: 2.5e+02  Valid Accuracy: 0.751
Epoch: 31   - Cost: 2.43e+02 Valid Accuracy: 0.756
Epoch: 32   - Cost: 2.38e+02 Valid Accuracy: 0.759
Epoch: 33   - Cost: 2.33e+02 Valid Accuracy: 0.763
Epoch: 34   - Cost: 2.28e+02 Valid Accuracy: 0.768
Epoch: 35   - Cost: 2.23e+02 Valid Accuracy: 0.772
Epoch: 36   - Cost: 2.18e+02 Valid Accuracy: 0.777
Epoch: 37   - Cost: 2.13e+02 Valid Accuracy: 0.781
Epoch: 38   - Cost: 2.09e+02 Valid Accuracy: 0.784
Epoch: 39   - Cost: 2.06e+02 Valid Accuracy: 0.789
Epoch: 40   - Cost: 2.02e+02 Valid Accuracy: 0.793
Epoch: 41   - Cost: 1.98e+02 Valid Accuracy: 0.798
Epoch: 42   - Cost: 1.95e+02 Valid Accuracy: 0.8  
Epoch: 43   - Cost: 1.91e+02 Valid Accuracy: 0.803
Epoch: 44   - Cost: 1.88e+02 Valid Accuracy: 0.808
Epoch: 45   - Cost: 1.85e+02 Valid Accuracy: 0.812
Epoch: 46   - Cost: 1.82e+02 Valid Accuracy: 0.814
Epoch: 47   - Cost: 1.79e+02 Valid Accuracy: 0.817
Epoch: 48   - Cost: 1.76e+02 Valid Accuracy: 0.819
Epoch: 49   - Cost: 1.73e+02 Valid Accuracy: 0.82 
Epoch: 50   - Cost: 1.71e+02 Valid Accuracy: 0.821
Epoch: 51   - Cost: 1.67e+02 Valid Accuracy: 0.821
Epoch: 52   - Cost: 1.64e+02 Valid Accuracy: 0.823
Epoch: 53   - Cost: 1.61e+02 Valid Accuracy: 0.826
Epoch: 54   - Cost: 1.58e+02 Valid Accuracy: 0.828
Epoch: 55   - Cost: 1.55e+02 Valid Accuracy: 0.829
Epoch: 56   - Cost: 1.52e+02 Valid Accuracy: 0.828
Epoch: 57   - Cost: 1.49e+02 Valid Accuracy: 0.831
Epoch: 58   - Cost: 1.46e+02 Valid Accuracy: 0.831
Epoch: 59   - Cost: 1.44e+02 Valid Accuracy: 0.833
Epoch: 60   - Cost: 1.41e+02 Valid Accuracy: 0.834
Epoch: 61   - Cost: 1.38e+02 Valid Accuracy: 0.835
Epoch: 62   - Cost: 1.36e+02 Valid Accuracy: 0.836
Epoch: 63   - Cost: 1.34e+02 Valid Accuracy: 0.837
Epoch: 64   - Cost: 1.31e+02 Valid Accuracy: 0.84 
Epoch: 65   - Cost: 1.29e+02 Valid Accuracy: 0.84 
Epoch: 66   - Cost: 1.27e+02 Valid Accuracy: 0.84 
Epoch: 67   - Cost: 1.25e+02 Valid Accuracy: 0.841
Epoch: 68   - Cost: 1.23e+02 Valid Accuracy: 0.84 
Epoch: 69   - Cost: 1.21e+02 Valid Accuracy: 0.841
Epoch: 70   - Cost: 1.19e+02 Valid Accuracy: 0.841
Epoch: 71   - Cost: 1.17e+02 Valid Accuracy: 0.842
Epoch: 72   - Cost: 1.15e+02 Valid Accuracy: 0.843
Epoch: 73   - Cost: 1.12e+02 Valid Accuracy: 0.845
Epoch: 74   - Cost: 1.1e+02  Valid Accuracy: 0.845
Epoch: 75   - Cost: 1.08e+02 Valid Accuracy: 0.845
Epoch: 76   - Cost: 1.06e+02 Valid Accuracy: 0.845
Epoch: 77   - Cost: 1.04e+02 Valid Accuracy: 0.846
Epoch: 78   - Cost: 1.02e+02 Valid Accuracy: 0.847
Epoch: 79   - Cost: 99.9     Valid Accuracy: 0.848
Epoch: 80   - Cost: 97.9     Valid Accuracy: 0.848
Epoch: 81   - Cost: 95.9     Valid Accuracy: 0.848
Epoch: 82   - Cost: 93.9     Valid Accuracy: 0.849
Epoch: 83   - Cost: 92.0     Valid Accuracy: 0.849
Epoch: 84   - Cost: 90.3     Valid Accuracy: 0.849
Epoch: 85   - Cost: 88.6     Valid Accuracy: 0.849
Epoch: 86   - Cost: 87.0     Valid Accuracy: 0.849
Epoch: 87   - Cost: 85.3     Valid Accuracy: 0.85 
Epoch: 88   - Cost: 83.6     Valid Accuracy: 0.85 
Epoch: 89   - Cost: 82.1     Valid Accuracy: 0.851
Epoch: 90   - Cost: 80.7     Valid Accuracy: 0.851
Epoch: 91   - Cost: 79.6     Valid Accuracy: 0.853
Epoch: 92   - Cost: 78.4     Valid Accuracy: 0.854
Epoch: 93   - Cost: 77.3     Valid Accuracy: 0.854
Epoch: 94   - Cost: 76.1     Valid Accuracy: 0.854
Epoch: 95   - Cost: 75.3     Valid Accuracy: 0.854
Epoch: 96   - Cost: 74.4     Valid Accuracy: 0.855
Epoch: 97   - Cost: 73.5     Valid Accuracy: 0.856
Epoch: 98   - Cost: 72.6     Valid Accuracy: 0.855
Epoch: 99   - Cost: 72.0     Valid Accuracy: 0.855

Write to file

In [38]:
# Build the submission table: ImageId counts from 1, one row per test prediction.
submissions = pd.DataFrame({"ImageId": list(range(1, len(predictions)+1)),
                             "Label": predictions})
# Write the CSV with a header row and without the DataFrame index column.
submissions.to_csv("output.csv", index=False, header=True)