#!/usr/bin/env python
# coding: utf-8

# # Dogs vs Cats Redux

# In this tutorial, you will learn how to generate and submit predictions to a Kaggle competition
#
# [Dogs vs. Cats Redux: Kernels Edition](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition)
#
#
# To start, you will need to download and unzip the competition data from Kaggle and ensure your directory structure looks like this:
# ```
# utils/
#     vgg16_3.py
#     utils3.py
# lesson1/
#     redux.ipynb
#     data/
#         redux/
#             train/
#                 cat.437.jpg
#                 dog.9924.jpg
#                 cat.1029.jpg
#                 dog.4374.jpg
#             test/
#                 231.jpg
#                 325.jpg
#                 1235.jpg
#                 9923.jpg
# ```
#
# You can download the data files from the competition page [here](https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/data) or you can download them from the command line using the [Kaggle CLI](https://github.com/floydwch/kaggle-cli).
#
# You should launch your notebook inside the lesson1 directory:
# ```
# cd lesson1
# jupyter notebook
# ```

# In[1]:

#Verify we are in the lesson1 directory
get_ipython().run_line_magic('pwd', '')

# In[2]:

#Create references to important directories we will use over and over
import os, sys
current_dir = os.getcwd()
LESSON_HOME_DIR = current_dir
DATA_HOME_DIR = current_dir + '/data/redux'

# In[3]:

#Allow relative imports to directories above lesson1/
sys.path.insert(1, os.path.join(sys.path[0], '..'))

#Import modules
from utils3 import *
from vgg16_3 import Vgg16

#Instantiate plotting tool
#In Jupyter notebooks, you will need to run this command before doing any plotting
get_ipython().run_line_magic('matplotlib', 'inline')

# ## Action Plan
# 1. Create validation and sample sets
# 2. Rearrange image files into their respective directories
# 3. Finetune and train the model
# 4. Generate predictions
# 5. Validate predictions
# 6. Submit predictions to Kaggle

# ## Create validation set and sample

# In[66]:

#Create directories
get_ipython().run_line_magic('cd', '$DATA_HOME_DIR')
get_ipython().run_line_magic('mkdir', 'valid')
get_ipython().run_line_magic('mkdir', 'results')
get_ipython().run_line_magic('mkdir', '-p sample/train')
get_ipython().run_line_magic('mkdir', '-p sample/test')
get_ipython().run_line_magic('mkdir', '-p sample/valid')
get_ipython().run_line_magic('mkdir', '-p sample/results')
get_ipython().run_line_magic('mkdir', '-p test/unknown')

# In[67]:

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/train')

# In[68]:

#Move 2000 random training images into the validation set
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(2000):
    os.rename(shuf[i], DATA_HOME_DIR + '/valid/' + shuf[i])

# In[69]:

from shutil import copyfile

# In[70]:

#Copy 200 random training images into the sample training set
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(200):
    copyfile(shuf[i], DATA_HOME_DIR + '/sample/train/' + shuf[i])

# In[71]:

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/valid')

# In[72]:

#Copy 50 random validation images into the sample validation set
g = glob('*.jpg')
shuf = np.random.permutation(g)
for i in range(50):
    copyfile(shuf[i], DATA_HOME_DIR + '/sample/valid/' + shuf[i])

# ## Rearrange image files into their respective directories

# In[73]:

#Divide cat/dog images into separate directories
get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/sample/train')
get_ipython().run_line_magic('mkdir', 'cats')
get_ipython().run_line_magic('mkdir', 'dogs')
get_ipython().run_line_magic('mv', 'cat.*.jpg cats/')
get_ipython().run_line_magic('mv', 'dog.*.jpg dogs/')

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/sample/valid')
get_ipython().run_line_magic('mkdir', 'cats')
get_ipython().run_line_magic('mkdir', 'dogs')
get_ipython().run_line_magic('mv', 'cat.*.jpg cats/')
get_ipython().run_line_magic('mv', 'dog.*.jpg dogs/')

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/valid')
get_ipython().run_line_magic('mkdir', 'cats')
get_ipython().run_line_magic('mkdir', 'dogs')
get_ipython().run_line_magic('mv', 'cat.*.jpg cats/')
get_ipython().run_line_magic('mv', 'dog.*.jpg dogs/')

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/train')
get_ipython().run_line_magic('mkdir', 'cats')
get_ipython().run_line_magic('mkdir', 'dogs')
get_ipython().run_line_magic('mv', 'cat.*.jpg cats/')
get_ipython().run_line_magic('mv', 'dog.*.jpg dogs/')

# In[74]:

# Create single 'unknown' class for test set
get_ipython().run_line_magic('cd', '$DATA_HOME_DIR/test')
get_ipython().run_line_magic('mv', '*.jpg unknown/')
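# Before moving on, it's worth sanity-checking the split. This quick helper is an illustrative addition (not part of the original notebook); with the full dataset you should see roughly 23,000 images under train/, 2,000 under valid/, and 12,500 under test/unknown/, plus the small sample copies.

# In[ ]:

import os

def count_images(path):
    #Walk the directory tree and report how many jpgs live in each subdirectory
    for root, dirs, files in os.walk(path):
        jpgs = [f for f in files if f.endswith('.jpg')]
        if jpgs:
            print(root, len(jpgs))

count_images(DATA_HOME_DIR)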
# ## Finetuning and Training

# In[4]:

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR')

#Point path at sample/ instead if you want to work with the sample set
path = DATA_HOME_DIR + '/' #'/sample/'
test_path = DATA_HOME_DIR + '/test/' #We use all the test data
results_path = DATA_HOME_DIR + '/results/'
train_path = path + 'train/'
valid_path = path + 'valid/'

# In[5]:

#Instantiate the Vgg16 helper class
vgg = Vgg16()

# In[6]:

#Set constants. You can experiment with no_of_epochs to improve the model
batch_size = 64
no_of_epochs = 3

# In[7]:

#Finetune the model
batches = vgg.get_batches(train_path, batch_size=batch_size)
val_batches = vgg.get_batches(valid_path, batch_size=batch_size*2)
vgg.finetune(batches)

#Set the learning rate on the optimizer; this setting is used by all subsequent fit() calls
vgg.model.optimizer.lr = 0.01

# In[13]:

#Notice we are passing in the validation dataset to the fit() method
#For each epoch we test our model against the validation set
latest_weights_filename = None
for epoch in range(no_of_epochs):
    print("Running epoch: {}".format(epoch))
    vgg.fit(batches, val_batches, epochs=1)
    latest_weights_filename = 'ft%d.h5' % epoch
    vgg.model.save_weights(results_path + latest_weights_filename)
print("Completed {} fit operations".format(no_of_epochs))

# ## Generate Predictions

# Let's use our new model to make predictions on the test dataset

# In[ ]:

batches, preds = vgg.test(test_path, batch_size=batch_size*2)

# In[81]:

#For every image, vgg.test() generates two probabilities
#based on how we've ordered the cats/dogs directories.
#It looks like the first column is cats and the second column is dogs
print(preds[:5])

filenames = batches.filenames
print(filenames[:5])

# In[82]:

#You can verify the column ordering by viewing some images
from PIL import Image
Image.open(test_path + filenames[2])

# In[83]:

#Save our test results arrays so we can use them again later
save_array(results_path + 'test_preds.dat', preds)
save_array(results_path + 'filenames.dat', filenames)
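# Aside: save_array and load_array come from the course's utils module. If your copy doesn't define them, a minimal sketch is below -- the fast.ai utils implement them with bcolz in essentially this way, though your utils3 variant may differ:

# In[ ]:

import bcolz

def save_array(fname, arr):
    #Persist an array to disk as a compressed bcolz carray
    c = bcolz.carray(arr, rootdir=fname, mode='w')
    c.flush()

def load_array(fname):
    #Read the whole carray back into memory as a numpy array
    return bcolz.open(fname)[:]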
# ## Validate Predictions

# Keras' *fit()* function conveniently shows us the value of the loss function, and the accuracy, after every epoch ("*epoch*" refers to one full run through all training examples). The most important metrics for us to look at are for the validation set, since we want to check for over-fitting.
#
# - **Tip**: with our first model we should try to overfit before we start worrying about how to reduce over-fitting - there's no point even thinking about regularization, data augmentation, etc if you're still under-fitting! (We'll be looking at these techniques shortly.)
#
# As well as looking at the overall metrics, it's also a good idea to look at examples of each of:
# 1. A few correct labels at random
# 2. A few incorrect labels at random
# 3. The most correct labels of each class (ie those with highest probability that are correct)
# 4. The most incorrect labels of each class (ie those with highest probability that are incorrect)
# 5. The most uncertain labels (ie those with probability closest to 0.5).

# Let's see what we can learn from these examples. (In general, this is a particularly useful technique for debugging problems in the model. However, since this model is so simple, there may not be too much to learn at this stage.)
#
# Calculate predictions on the validation set, so we can find correct and incorrect examples:

# In[84]:

vgg.model.load_weights(results_path + latest_weights_filename)

# In[85]:

val_batches, probs = vgg.test(valid_path, batch_size=batch_size)

# In[86]:

filenames = val_batches.filenames
expected_labels = val_batches.classes #0 (cat) or 1 (dog)

#probs[:,0] is the probability of "cat", so round 1-probs[:,0] to get 0/1 labels
our_predictions = probs[:,0]
our_labels = np.round(1 - our_predictions)

# In[1]:

from keras.preprocessing import image

#Helper function to plot images by index in the validation set
#plots() is a helper function in utils.py
def plots_idx(idx, titles=None):
    plots([image.load_img(valid_path + filenames[i]) for i in idx], titles=titles)

#Number of images to view for each visualization task
n_view = 4

# In[88]:

#1. A few correct labels at random
correct = np.where(our_labels==expected_labels)[0]
print("Found %d correct labels" % len(correct))
idx = permutation(correct)[:n_view]
plots_idx(idx, our_predictions[idx])

# In[89]:

#2. A few incorrect labels at random
incorrect = np.where(our_labels!=expected_labels)[0]
print("Found %d incorrect labels" % len(incorrect))
idx = permutation(incorrect)[:n_view]
plots_idx(idx, our_predictions[idx])

# In[90]:

#3a. The images we were most confident were cats, and are actually cats
correct_cats = np.where((our_labels==0) & (our_labels==expected_labels))[0]
print("Found %d confident correct cats labels" % len(correct_cats))
most_correct_cats = np.argsort(our_predictions[correct_cats])[::-1][:n_view]
plots_idx(correct_cats[most_correct_cats], our_predictions[correct_cats][most_correct_cats])

# In[91]:

#3b. The images we were most confident were dogs, and are actually dogs
correct_dogs = np.where((our_labels==1) & (our_labels==expected_labels))[0]
print("Found %d confident correct dogs labels" % len(correct_dogs))
most_correct_dogs = np.argsort(our_predictions[correct_dogs])[:n_view]
plots_idx(correct_dogs[most_correct_dogs], our_predictions[correct_dogs][most_correct_dogs])

# In[92]:

#4a. The images we were most confident were cats, but are actually dogs
incorrect_cats = np.where((our_labels==0) & (our_labels!=expected_labels))[0]
print("Found %d incorrect cats" % len(incorrect_cats))
if len(incorrect_cats):
    most_incorrect_cats = np.argsort(our_predictions[incorrect_cats])[::-1][:n_view]
    plots_idx(incorrect_cats[most_incorrect_cats], our_predictions[incorrect_cats][most_incorrect_cats])

# In[93]:

#4b. The images we were most confident were dogs, but are actually cats
incorrect_dogs = np.where((our_labels==1) & (our_labels!=expected_labels))[0]
print("Found %d incorrect dogs" % len(incorrect_dogs))
if len(incorrect_dogs):
    most_incorrect_dogs = np.argsort(our_predictions[incorrect_dogs])[:n_view]
    plots_idx(incorrect_dogs[most_incorrect_dogs], our_predictions[incorrect_dogs][most_incorrect_dogs])

# In[94]:

#5. The most uncertain labels (ie those with probability closest to 0.5)
most_uncertain = np.argsort(np.abs(our_predictions-0.5))
plots_idx(most_uncertain[:n_view], our_predictions[most_uncertain])

# Perhaps the most common way to analyze the result of a classification model is to use a [confusion matrix](http://www.dataschool.io/simple-guide-to-confusion-matrix-terminology/). Scikit-learn has a convenient function we can use for this purpose:

# In[95]:

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(expected_labels, our_labels)

# We can just print out the confusion matrix, or we can show a graphical view (which is mainly useful for problems with a larger number of categories).

# In[96]:

plot_confusion_matrix(cm, val_batches.class_indices)

# ## Submit Predictions to Kaggle!

# Here's the format Kaggle requires for new submissions:
# ```
# id,label
# 1242,.3984
# 3947,.1000
# 4539,.9082
# 2345,.0000
# ```
#
# Kaggle wants the image id followed by the probability that the image is a dog. Kaggle uses a metric called [Log Loss](http://wiki.fast.ai/index.php/Log_Loss) to evaluate your submission.

# In[97]:

#Load our test predictions from file
preds = load_array(results_path + 'test_preds.dat')
filenames = load_array(results_path + 'filenames.dat')

# In[98]:

#Grab the dog prediction column
isdog = preds[:,1]
print("Raw Predictions: " + str(isdog[:5]))
print("Mid Predictions: " + str(isdog[(isdog < .6) & (isdog > .4)]))
print("Edge Predictions: " + str(isdog[(isdog == 1) | (isdog == 0)]))

# [Log Loss](http://wiki.fast.ai/index.php/Log_Loss) doesn't support probability values of 0 or 1--they are undefined (and we have many). Fortunately, Kaggle helps us by offsetting our 0s and 1s by a very small value. So if we upload our submission now we will have lots of .99999999 and .000000001 values. This seems good, right?
#
# Not so. There is an additional twist due to how log loss is calculated--log loss rewards predictions that are confident and correct (p=.9999, label=1), but it punishes predictions that are confident and wrong far more (p=.0001, label=1). See the visualization below.
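# To make this asymmetry concrete before plotting it, here's a quick numeric check (an illustrative addition, not part of the original notebook): for a single example with true label 1, log loss is -log(p), so the penalty for a confident wrong answer is several orders of magnitude larger than the reward for a confident right one.

# In[ ]:

#Log loss for one example with true label = 1 is -log(p)
print(-np.log(0.9999))  #confident and correct: ~0.0001
print(-np.log(0.0001))  #confident and wrong:   ~9.21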
# In[128]:

#Visualize Log Loss when true value = 1
#y-axis is log loss, x-axis is the predicted probability that label = 1
#As you can see, log loss increases rapidly as we approach 0
#but increases slowly as our predicted probability gets closer to 1
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import log_loss

x = [i*.0001 for i in range(1,10000)]
#Columns are [p(label=0), p(label=1)], so the second column must be x
y = [log_loss([1], [[1-p, p]], eps=1e-15, labels=[0,1]) for p in x]

plt.plot(x, y)
plt.axis([-.05, 1.1, -.8, 10])
plt.title("Log Loss when true label = 1")
plt.xlabel("predicted probability")
plt.ylabel("log loss")
plt.show()

# In[125]:

#So to play it safe, we use a sneaky trick to pull our edge predictions away from 0 and 1:
#clip everything into [0.05, 0.95], so ones become .95 and zeros become .05
isdog = isdog.clip(min=0.05, max=0.95)

# In[100]:

#Extract image ids from the filenames in our test/unknown directory
#Filenames look like 'unknown/1234.jpg', so strip the 8-character prefix and the extension
filenames = batches.filenames
ids = np.array([int(f[8:f.find('.')]) for f in filenames])

# Here we join the two columns into an array of [imageId, isDog]

# In[101]:

subm = np.stack([ids, isdog], axis=1)
subm[:5]

# In[102]:

get_ipython().run_line_magic('cd', '$DATA_HOME_DIR')
submission_file_name = 'submission1.csv'
np.savetxt(submission_file_name, subm, fmt='%d,%.5f', header='id,label', comments='')

# In[103]:

from IPython.display import FileLink
get_ipython().run_line_magic('cd', '$LESSON_HOME_DIR')
FileLink('data/redux/'+submission_file_name)

# You can download this file and submit it on the Kaggle website, or use the Kaggle command line tool's "submit" method.
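# For example, with the [Kaggle CLI](https://github.com/floydwch/kaggle-cli) a submission looks roughly like this (the flags follow the kaggle-cli README and may differ by version; the username, password, and message are placeholders):
# ```
# kg submit data/redux/submission1.csv -u <username> -p <password> -c dogs-vs-cats-redux-kernels-edition -m "VGG16 finetuned, 3 epochs"
# ```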