Setup data directories¶

In [1]:

% cd /home/ubuntu/courses/deeplearning1/nbs

/home/ubuntu/courses/deeplearning1/nbs

In [2]:

%mkdir data

In [3]:

% cd data

/home/ubuntu/courses/deeplearning1/nbs/data

In [4]:

! pip install -U kaggle-cli

Requirement already up-to-date: kaggle-cli in /home/ubuntu/anaconda2/lib/python2.7/site-packages
Requirement already up-to-date: progressbar2<3.35,>=3.34.3 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from kaggle-cli)
Requirement already up-to-date: MechanicalSoup<0.9,>=0.7.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from kaggle-cli)
Requirement already up-to-date: cliff<2.9,>=2.8.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from kaggle-cli)
Requirement already up-to-date: configparser in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from kaggle-cli)
Requirement already up-to-date: cssselect<1.1,>=1.0.1 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from kaggle-cli)
Requirement already up-to-date: lxml<4.1,>=4.0.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from kaggle-cli)
Requirement already up-to-date: python-utils>=2.1.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from progressbar2<3.35,>=3.34.3->kaggle-cli)
Requirement already up-to-date: beautifulsoup4 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: requests>=2.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: six>=1.4 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: PrettyTable<0.8,>=0.7.1 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: pbr!=2.1.0,>=2.0.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: cmd2>=0.6.7 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: unicodecsv>=0.8.0; python_version < "3.0" in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: PyYAML>=3.10.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: pyparsing>=2.1.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: stevedore>=1.20.0 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cliff<2.9,>=2.8.0->kaggle-cli)
Requirement already up-to-date: urllib3<1.23,>=1.21.1 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from requests>=2.0->MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: idna<2.7,>=2.5 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from requests>=2.0->MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: chardet<3.1.0,>=3.0.2 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from requests>=2.0->MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: certifi>=2017.4.17 in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from requests>=2.0->MechanicalSoup<0.9,>=0.7.0->kaggle-cli)
Requirement already up-to-date: pyperclip in /home/ubuntu/anaconda2/lib/python2.7/site-packages (from cmd2>=0.6.7->cliff<2.9,>=2.8.0->kaggle-cli)
You are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

In [5]:

# make sure you accept the rules of the competition first
! kg download -u username -p password -c dogs-vs-cats-redux-kernels-edition

downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/test.zip

test.zip 100% |######################################| Time: 0:00:10  26.6 MiB/s

downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/train.zip

train.zip 100% |#####################################| Time: 0:00:19  27.9 MiB/s

downloading https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/download/sample_submission.csv

sample_submission.csv 100% |#########################| Time: 0:00:00 479.1 KiB/s

In [6]:

!sudo apt-get update
! sudo apt install unzip

Hit:1 http://us-east-1.ec2.archive.ubuntu.com/ubuntu xenial InRelease
Get:2 http://us-east-1.ec2.archive.ubuntu.com/ubuntu xenial-updates InRelease [102 kB]
Get:3 http://us-east-1.ec2.archive.ubuntu.com/ubuntu xenial-backports InRelease [102 kB]
Get:4 http://security.ubuntu.com/ubuntu xenial-security InRelease [102 kB]
Get:5 http://us-east-1.ec2.archive.ubuntu.com/ubuntu xenial-updates/main amd64 Packages [644 kB]
Ign:6 http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64  InRelease
Hit:7 http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64  Release
Get:8 http://us-east-1.ec2.archive.ubuntu.com/ubuntu xenial-updates/universe amd64 Packages [541 kB]
Fetched 1,491 kB in 0s (4,087 kB/s)
Reading package lists... Done
Reading package lists... Done
Building dependency tree       
Reading state information... Done
unzip is already the newest version (6.0-20ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 262 not upgraded.

In [7]:

! unzip -q test.zip && unzip -q train.zip

In [8]:

! mkdir -p sample/test sample/train sample/valid

In [9]:

! mkdir -p sample/results

In [10]:

! mkdir results

In [11]:

! mkdir -p valid

In [12]:

import shutil, os, glob
import random

In [13]:

for _ in range(1000):
    random_file = random.choice(os.listdir("train/"));
    shutil.copy("train/" + random_file, "sample/train/");

In [14]:

for _ in range(500):
    random_file = random.choice(os.listdir("test/"));
    shutil.copy("test/" + random_file, "sample/test/");

In [15]:

for _ in range(3000):
    random_file = random.choice(os.listdir("train/"));
    shutil.move("train/" + random_file, "valid/");

In [16]:

for _ in range(1000):
    random_file = random.choice(os.listdir("valid/"));
    shutil.copy("valid/" + random_file, "sample/valid/");

In [17]:

! mkdir -p train/dogs train/cats

In [18]:

! mv train/cat.*.jpg train/cats

In [19]:

! mv train/dog.*.jpg train/dogs

In [20]:

! mkdir -p valid/dogs valid/cats

In [21]:

! mv valid/cat.*.jpg valid/cats

In [22]:

! mv valid/dog.*.jpg valid/dogs

In [23]:

%cd sample/train

/home/ubuntu/courses/deeplearning1/nbs/data/sample/train

In [24]:

! mkdir cats dogs

In [25]:

! mv dog.*.jpg dogs/

In [26]:

! mv cat.*.jpg cats/

In [27]:

% cd /home/ubuntu/courses/deeplearning1/nbs/data/sample/valid

/home/ubuntu/courses/deeplearning1/nbs/data/sample/valid

In [28]:

! mkdir cats dogs

In [29]:

! mv dog.*.jpg dogs/

In [30]:

! mv cat.*.jpg cats/

In [31]:

% cd /home/ubuntu/courses/deeplearning1/nbs/data/

/home/ubuntu/courses/deeplearning1/nbs/data

In [32]:

% cd test/

/home/ubuntu/courses/deeplearning1/nbs/data/test

In [33]:

! mkdir unknown

In [34]:

! mv *.jpg unknown/

In [35]:

% cd /home/ubuntu/courses/deeplearning1/nbs/data/sample/test

/home/ubuntu/courses/deeplearning1/nbs/data/sample/test

In [36]:

! mkdir unknown

In [37]:

! mv *.jpg unknown/

Run Model¶

In [39]:

DATA_HOME_DIR = "/home/ubuntu/courses/deeplearning1/nbs/data"

In [40]:

%cd /home/ubuntu/courses/deeplearning1/nbs/

/home/ubuntu/courses/deeplearning1/nbs

In [41]:

from utils import *
from vgg16 import Vgg16

%matplotlib inline

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
/home/ubuntu/anaconda2/lib/python2.7/site-packages/theano/sandbox/cuda/__init__.py:600: UserWarning: Your cuDNN version is more recent than the one Theano officially supports. If you see any problems, try updating Theano or downgrading cuDNN to version 5.
  warnings.warn(warn)
Using Theano backend.

In [42]:

%cd $DATA_HOME_DIR

#Set path to sample/ path if desired
path = DATA_HOME_DIR + '/sample/'
test_path = path + '/test/' #We use all the test data
results_path=DATA_HOME_DIR + '/results/'
train_path=path + '/train/'
valid_path=path + '/valid/'

/home/ubuntu/courses/deeplearning1/nbs/data

In [43]:

vgg = Vgg16()

In [44]:

batch_size=4
no_of_epochs=1

In [45]:

batches = vgg.get_batches(train_path, batch_size=batch_size)

Found 974 images belonging to 2 classes.

In [46]:

imgs,labels = next(batches)
plots(imgs, titles=labels)
vgg.predict(imgs, True)

Out[46]:

(array([ 0.1868,  0.0339,  0.6409,  0.5215], dtype=float32),
 array([223, 159, 281, 205]),
 [u'schipperke', u'Rhodesian_ridgeback', u'tabby', u'flat-coated_retriever'])

In [49]:

#Finetune the model
batch_size = 64
no_of_epochs = 1
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)

vgg.model.optimizer.lr = 0.01

Found 974 images belonging to 2 classes.
Found 855 images belonging to 2 classes.

In [50]:

#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
latest_weights_filename = None
for epoch in range(no_of_epochs):
    print "Running epoch: %d" % epoch
    vgg.fit(batches, val_batches, nb_epoch=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    vgg.model.save_weights(results_path+latest_weights_filename)
print "Completed %s fit operations" % no_of_epochs

Running epoch: 0
Epoch 1/1
974/974 [==============================] - 58s - loss: 0.5535 - acc: 0.9189 - val_loss: 0.3400 - val_acc: 0.9684
Completed 1 fit operations

In [51]:

batch_size = 4
val_batches = vgg.get_batches(valid_path, batch_size=batch_size)

Found 855 images belonging to 2 classes.

In [52]:

imgs,labels = next(val_batches)
plots(imgs, titles=labels)
vgg.predict(imgs, True)

Out[52]:

(array([ 1.,  1.,  1.,  1.], dtype=float32),
 array([0, 1, 1, 1]),
 ['cats', 'dogs', 'dogs', 'dogs'])

In [65]:

#You can verify the column ordering by viewing some images
val_batches, probs = vgg.test(valid_path, batch_size)
from PIL import Image
print probs[1]
print val_batches.classes[1]
print val_batches.filenames[1]
Image.open(valid_path + val_batches.filenames[1])

Found 3000 images belonging to 2 classes.
[ 1.  0.]
0
cats/cat.2126.jpg

Out[65]:

In [66]:

filenames = val_batches.filenames
expected_labels = val_batches.classes #0 or 1

#Round our predictions to 0/1 to generate labels
our_predictions = probs[:,0]
our_labels = np.round(1-our_predictions)

In [67]:

from keras.preprocessing import image

#Helper function to plot images by index in the validation set 
#Plots is a helper function in utils.py
def plots_idx(idx, titles=None):
    for i in idx:
        print filenames[i];
    plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)
    
#Number of images to view for each visualization task
n_view = 4

In [68]:

#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print "Found %d correct labels" % len(correct)
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

Found 2950 correct labels
cats/cat.3887.jpg
cats/cat.611.jpg
dogs/dog.10859.jpg
cats/cat.5595.jpg

In [69]:

#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print "Found %d incorrect labels" % len(incorrect)
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

Found 50 incorrect labels
cats/cat.4821.jpg
dogs/dog.595.jpg
cats/cat.7599.jpg
cats/cat.2938.jpg

In [70]:

#3a. The images we most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print "Found %d confident correct cats labels" % len(correct_cats)
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

Found 1499 confident correct cats labels
cats/cat.6884.jpg
cats/cat.7752.jpg
cats/cat.8724.jpg
cats/cat.557.jpg

In [71]:

#3b. The images we most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print "Found %d confident correct dogs labels" % len(correct_dogs)
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])

Found 1451 confident correct dogs labels
dogs/dog.8854.jpg
dogs/dog.6413.jpg
dogs/dog.9854.jpg
dogs/dog.7749.jpg

In [60]:

#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print "Found %d incorrect cats" % len(incorrect_cats)
if len(incorrect_cats):
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

Found 6 incorrect cats
dogs/dog.1622.jpg
dogs/dog.5251.jpg
dogs/dog.6694.jpg
dogs/dog.5642.jpg

In [61]:

#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print "Found %d incorrect dogs" % len(incorrect_dogs)
if len(incorrect_dogs):
    most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])

Found 21 incorrect dogs
cats/cat.7920.jpg
cats/cat.1267.jpg
cats/cat.7194.jpg
cats/cat.376.jpg

In [133]:

#5. The most uncertain labels (ie those with probability closest to 0.5).
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])

Run over all training and test data¶

In [63]:

vgg.model.load_weights(results_path+'final.h5')

In [64]:

%cd $DATA_HOME_DIR

#Set path to sample/ path if desired
path = DATA_HOME_DIR 
test_path = path + '/test/' #We use all the test data
train_path=path + '/train/'
valid_path=path + '/valid/'

/home/ubuntu/courses/deeplearning1/nbs/data

Go to top and run