#!/usr/bin/env python
# coding: utf-8

# # [Given a picture, would you be able to identify which camera took it?](https://www.kaggle.com/c/sp-society-camera-model-identification)

# The Kaggle competition overview reads
# 
# "Finding footage of a crime caught on tape is an investigator's dream. But even with crystal clear, damning evidence, one critical question always remains–is the footage real?
# 
# Today, one way to help authenticate footage is to identify the camera that the image was taken with. Forgeries often require splicing together content from two different cameras. But, unfortunately, the most common way to do this now is using image metadata, which can be easily falsified itself.
# 
# This problem is actively studied by several researchers around the world. Many machine learning solutions have been proposed in the past: least-squares estimates of a camera's color demosaicing filters as classification features, co-occurrences of pixel value prediction errors as features that are passed to sophisticated ensemble classifiers, and using CNNs to learn camera model identification features. However, this is a problem yet to be sufficiently solved.
# 
# For this competition, the IEEE Signal Processing Society is challenging you to build an algorithm that identifies which camera model captured an image by using traces intrinsically left in the image. Helping to solve this problem would have a big impact on the verification of evidence used in criminal and civil trials and even news reporting."

# ### Importing packages 

# In[1]:


import numpy as np
import os
import pandas as pd
import shutil
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.svm import SVC

from bayes_opt import BayesianOptimization

from keras.preprocessing.image import ImageDataGenerator, array_to_img
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50
from keras.applications.inception_v3 import InceptionV3
from keras.applications.xception import Xception
from keras.models import Model, Sequential
from keras.layers import Dense, BatchNormalization, Flatten, Dropout, Convolution2D, Activation, MaxPooling2D, GlobalAveragePooling2D
from random import shuffle

from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
from keras.callbacks import LearningRateScheduler
import math

from keras.optimizers import SGD, Adam
from keras import losses
from keras.preprocessing.image import ImageDataGenerator, array_to_img

# Root directory of the project: the raw data, the cropped images, the
# train/validation folders and the saved weights are all resolved against it.
master_path = "/home/paperspace/IEEE"


# **Is there a GPU on the machine?**

# In[2]:


# Print every compute device TensorFlow reports for this machine.
from tensorflow.python.client import device_lib
local_devices = device_lib.list_local_devices()
print(local_devices)


# ### Helper Functions

# In[3]:


def read_crop_save(frompath, topath):
    """Center-crop an image to 512 x 512 pixels and save the result.

    Parameters
    ----------
    frompath : str
        Path of the image to read.
    topath : str
        Path handed to ``plt.imsave`` for the cropped image.

    The source file is opened in a ``with`` block so that its handle is
    released as soon as the pixels have been copied; this function is called
    once per image of the dataset.
    """
    with Image.open(frompath) as img:
        w, h = img.size
        left, upper = w // 2 - 256, h // 2 - 256
        cropped = img.crop((left, upper, left + 512, upper + 512))
        # Copy the pixels into memory while the file is still open.
        new_array = np.array(cropped)
    plt.imsave(topath, new_array)
    return

#################################################################
#################################################################

def copy_images(df, directory):
    """Rebuild ``<master_path>/crossval/<directory>`` from the rows of ``df``.

    Any existing folder is deleted first, one sub-folder per distinct value
    of ``df['class']`` is created, and every file listed in ``df['filename']``
    is copied into the sub-folder of its class. A failing copy is reported on
    stdout and does not stop the loop.
    """
    target_root = os.path.join(os.path.sep, master_path, 'crossval', directory)
    print("copying {} files to {}...".format(directory, target_root))

    # Start from a clean folder so files of a previous split never linger.
    if os.path.exists(target_root):
        shutil.rmtree(target_root)
    if not os.path.exists(target_root):
        os.makedirs(target_root)

    # One sub-folder per class label.
    for label in set(df['class']):
        class_dir = os.path.join(os.path.sep, target_root, label)
        if not os.path.exists(class_dir):
            os.makedirs(class_dir)

    for _, record in df.iterrows():
        try:
            shutil.copy(record['filename'],
                        os.path.join(os.path.sep, target_root, record['class']))
        except Exception as e:
            print("Error when copying {}: {}".format(record['filename'], str(e)))

#################################################################
#################################################################            
            
def preprocess_input_vgg16(x):
    """Run the Keras VGG16 ``preprocess_input`` on a single image array.

    ``preprocess_input`` is called on a batch of one (a leading axis is
    added) and the single processed image is returned.
    """
    from keras.applications.vgg16 import preprocess_input
    batch = np.expand_dims(x, axis=0)
    return preprocess_input(batch)[0]

#################################################################
#################################################################

def preprocess_input_inception_v3(x):
    """Run the Keras InceptionV3 ``preprocess_input`` on a single image array.

    ``preprocess_input`` is called on a batch of one (a leading axis is
    added) and the single processed image is returned.
    """
    from keras.applications.inception_v3 import preprocess_input
    batch = np.expand_dims(x, axis=0)
    return preprocess_input(batch)[0]

#################################################################
#################################################################

def preprocess_input_xception(x):
    """Run the Keras Xception ``preprocess_input`` on a single image array.

    ``preprocess_input`` is called on a batch of one (a leading axis is
    added) and the single processed image is returned.
    """
    from keras.applications.xception import preprocess_input
    batch = np.expand_dims(x, axis=0)
    return preprocess_input(batch)[0]

#################################################################
#################################################################

def preprocess_input_resnet50(x):
    """Run the Keras ResNet50 ``preprocess_input`` on a single image array.

    ``preprocess_input`` is called on a batch of one (a leading axis is
    added) and the single processed image is returned.
    """
    from keras.applications.resnet50 import preprocess_input
    batch = np.expand_dims(x, axis=0)
    return preprocess_input(batch)[0]

#################################################################
#################################################################

class CheckLr(Callback):
    """Keras callback that prints the decayed learning rate after each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print ``lr / (1 + decay * iterations)`` for the model's optimizer."""
        # The backend is not imported at file level, so `K` used to be an
        # undefined name here; import it locally, as the preprocessing
        # helpers of this file do with their own Keras imports.
        from keras import backend as K

        optimizer = self.model.optimizer
        lr = optimizer.lr
        decay = optimizer.decay
        iterations = optimizer.iterations
        lr_with_decay = lr / (1. + decay * K.cast(iterations, K.dtype(decay)))
        print(K.eval(lr_with_decay))

#################################################################
#################################################################        
        
def step_decay(epoch):
    """Return a step-wise decayed learning rate for ``epoch``.

    The rate starts at 0.1 and is halved each time ``1 + epoch`` crosses a
    multiple of 10.
    """
    base_rate, factor, period = 0.1, 0.5, 10.0
    completed_periods = math.floor((1 + epoch) / period)
    return base_rate * math.pow(factor, completed_periods)

#################################################################
#################################################################

def build_on_bottleneck(which_net):
    """Stack a new 10-class head on a frozen ImageNet network and compile it.

    Parameters
    ----------
    which_net : str
        One of ``'vgg16'``, ``'resnet50'``, ``'xception'``, ``'inception_v3'``.

    Returns
    -------
    (model, callbacks_list)
        The compiled model (adam, categorical cross-entropy, accuracy) and a
        list holding one ``ModelCheckpoint`` that keeps the weights with the
        best ``val_acc`` in
        ``<master_path>/<which_net>BottleneckWeights.best.hdf5``.

    Raises
    ------
    ValueError
        If ``which_net`` is not one of the supported names.
    IndexError
        For ``'vgg16'``/``'resnet50'`` if the base model has no layer whose
        name contains ``'flatten'``.
    """
    if which_net == 'vgg16':
        base_model = VGG16(weights='imagenet')
    elif which_net == 'resnet50':
        base_model = ResNet50(weights='imagenet')
    elif which_net == 'xception':
        base_model = Xception(weights='imagenet', include_top=False)
    elif which_net == 'inception_v3':
        base_model = InceptionV3(weights='imagenet', include_top=False)
    else:
        # Previously an unknown name surfaced later as an unbound `base_model`.
        raise ValueError("unsupported network: {!r}".format(which_net))

    if which_net not in ['inception_v3', 'xception']:
        # These nets are loaded with their original top; branch off at the
        # last layer whose name contains 'flatten'.
        j = [i for i, layer in enumerate(base_model.layers) if 'flatten' in layer.name][-1]
        x = base_model.layers[j].output
    else:
        # Loaded without top: pool the convolutional output first.
        j = None
        x = base_model.output
        x = GlobalAveragePooling2D()(x)

    # Classification head, identical for every base network.
    x = BatchNormalization()(x)
    x = Dense(400, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(200, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(100, activation='relu')(x)
    x = Dense(10, activation='softmax')(x)

    # Keras 2 keyword names, matching the Keras 2 calls used elsewhere in
    # this file (steps_per_epoch, epochs, ...).
    model = Model(inputs=base_model.input, outputs=x)

    # Freeze the pre-trained part so that only the new head is trained.
    if j is not None:
        for i, layer in enumerate(model.layers):
            if i <= j:
                layer.trainable = False
    else:
        for layer in base_model.layers:
            layer.trainable = False

    model.compile(optimizer='adam', loss=losses.categorical_crossentropy, metrics=['accuracy'])

    filepath = os.path.join(os.path.sep, master_path, which_net + "BottleneckWeights.best.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    return model, callbacks_list

#################################################################
#################################################################

def fine_tune(which_net, freeze_up_to):
    """Rebuild the bottleneck model, load its best weights and unfreeze layers.

    Parameters
    ----------
    which_net : str
        One of ``'vgg16'``, ``'resnet50'``, ``'xception'``, ``'inception_v3'``.
    freeze_up_to : int
        Layers ``model.layers[:freeze_up_to]`` are frozen; all layers from
        that index on are made trainable.

    Returns
    -------
    (model, callbacks_list)
        The model is returned *uncompiled* (this function never calls
        ``compile``). ``callbacks_list`` holds one ``ModelCheckpoint`` that
        keeps the weights with the best ``val_acc`` in
        ``<master_path>/<which_net>FineTuneWeights.best.hdf5``.

    Raises
    ------
    ValueError
        If ``which_net`` is not one of the supported names.
    IndexError
        For ``'vgg16'``/``'resnet50'`` if the base model has no layer whose
        name contains ``'flatten'``.
    """
    if which_net == 'vgg16':
        base_model = VGG16(weights='imagenet')
    elif which_net == 'resnet50':
        base_model = ResNet50(weights='imagenet')
    elif which_net == 'xception':
        base_model = Xception(weights='imagenet', include_top=False)
    elif which_net == 'inception_v3':
        base_model = InceptionV3(weights='imagenet', include_top=False)
    else:
        # Previously an unknown name surfaced later as an unbound `base_model`.
        raise ValueError("unsupported network: {!r}".format(which_net))

    if which_net not in ['inception_v3', 'xception']:
        # Branch off at the last layer whose name contains 'flatten'.
        j = [i for i, layer in enumerate(base_model.layers) if 'flatten' in layer.name][-1]
        x = base_model.layers[j].output
    else:
        # Loaded without top: pool the convolutional output first.
        x = base_model.output
        x = GlobalAveragePooling2D()(x)

    # Same head as build_on_bottleneck(), so that its checkpoint can be loaded.
    x = BatchNormalization()(x)
    x = Dense(400, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(200, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(0.5)(x)
    x = Dense(100, activation='relu')(x)
    x = Dense(10, activation='softmax')(x)

    # Keras 2 keyword names, matching the Keras 2 calls used elsewhere in
    # this file (steps_per_epoch, epochs, ...).
    model = Model(inputs=base_model.input, outputs=x)

    model.load_weights(os.path.join(os.path.sep, master_path, which_net + "BottleneckWeights.best.hdf5"))

    for layer in model.layers[:freeze_up_to]:
        layer.trainable = False
    for layer in model.layers[freeze_up_to:]:
        layer.trainable = True

    filepath = os.path.join(os.path.sep, master_path, which_net + "FineTuneWeights.best.hdf5")
    checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    callbacks_list = [checkpoint]

    return model, callbacks_list

#################################################################
#################################################################

def save_bottleneck_features(batch_size, which_net):
    """Compute, save and return convolutional features for both image folders.

    Images are read in directory order (``shuffle=False``) from
    ``<master_path>/crossval/training`` and ``.../validation``, passed through
    the chosen ImageNet network without its top, and the outputs are stored as
    ``<which_net>bottleneck_features_train.npy`` and
    ``<which_net>bottleneck_features_val.npy`` under ``master_path``.

    Parameters
    ----------
    batch_size : int
        Batch size used by the directory generators.
    which_net : str
        One of ``'vgg16'``, ``'resnet50'``, ``'inception_v3'``, ``'xception'``.

    Returns
    -------
    (bottleneck_features_train, bottleneck_features_validation)

    Raises
    ------
    ValueError
        If ``which_net`` is not one of the supported names.

    Notes
    -----
    Reads the module-level globals ``nb_train_samples`` and
    ``nb_validation_samples``; they must be set before calling.
    """
    if which_net == 'vgg16':
        model = VGG16(weights='imagenet', include_top=False)
        target_size = [224, 224]
        preprocessing_function = preprocess_input_vgg16
    elif which_net == 'resnet50':
        model = ResNet50(weights='imagenet', include_top=False)
        target_size = [224, 224]
        preprocessing_function = preprocess_input_resnet50
    elif which_net == 'inception_v3':
        model = InceptionV3(weights='imagenet', include_top=False)
        target_size = [299, 299]
        preprocessing_function = preprocess_input_inception_v3
    elif which_net == 'xception':
        # Same settings define_generators() uses for this network.
        model = Xception(weights='imagenet', include_top=False)
        target_size = [299, 299]
        preprocessing_function = preprocess_input_xception
    else:
        # Previously an unknown name surfaced later as an unbound `model`.
        raise ValueError("unsupported network: {!r}".format(which_net))

    datagen = ImageDataGenerator(preprocessing_function=preprocessing_function)

    def _extract(subset, n_samples, file_suffix):
        # Predict features for one folder and save them next to the weights.
        generator = datagen.flow_from_directory(os.path.join(os.path.sep, master_path, 'crossval', subset),
                                                target_size=target_size,
                                                batch_size=batch_size,
                                                class_mode=None,
                                                shuffle=False)
        # Ceiling division: with floor division the images of a final partial
        # batch were never predicted whenever batch_size did not divide the
        # sample count. Identical to the old value when it divides evenly.
        steps = -(-n_samples // batch_size)
        features = model.predict_generator(generator, steps)
        np.save(os.path.join(os.sep, master_path, which_net + file_suffix), features)
        return features

    bottleneck_features_train = _extract('training', nb_train_samples,
                                         'bottleneck_features_train.npy')
    bottleneck_features_validation = _extract('validation', nb_validation_samples,
                                              'bottleneck_features_val.npy')

    return bottleneck_features_train, bottleneck_features_validation

#################################################################
#################################################################    
    
def define_generators(batch_size, which_net):
    """Create the training and validation directory generators for a network.

    The training generator applies random augmentation (rotation, shifts,
    shear, zoom, horizontal flip); the validation generator only applies the
    network-specific preprocessing. Both read from
    ``<master_path>/crossval/{training,validation}`` and yield one-hot
    (``class_mode='categorical'``) labels.

    Parameters
    ----------
    batch_size : int
        Number of images per generated batch.
    which_net : str
        One of ``'vgg16'``, ``'resnet50'``, ``'inception_v3'``, ``'xception'``.

    Returns
    -------
    (train_generator, validation_generator)

    Raises
    ------
    ValueError
        If ``which_net`` is not one of the supported names.
    """
    # Input size and preprocessing function for each supported network.
    settings = {
        'vgg16': ([224, 224], preprocess_input_vgg16),
        'resnet50': ([224, 224], preprocess_input_resnet50),
        'inception_v3': ([299, 299], preprocess_input_inception_v3),
        'xception': ([299, 299], preprocess_input_xception),
    }
    if which_net not in settings:
        # Previously an unknown name surfaced later as an unbound local.
        raise ValueError("unsupported network: {!r}".format(which_net))
    target_size, preprocessing_function = settings[which_net]

    train_datagen = ImageDataGenerator(preprocessing_function=preprocessing_function,
                                       rotation_range=15,
                                       width_shift_range=0.2,
                                       height_shift_range=0.2,
                                       shear_range=0.2,
                                       zoom_range=0.2,
                                       horizontal_flip=True,
                                       fill_mode='nearest')

    train_generator = train_datagen.flow_from_directory(directory=os.path.join(os.path.sep, master_path, 'crossval', 'training'),
                                                        target_size=target_size,
                                                        batch_size=batch_size,
                                                        class_mode='categorical')

    validation_datagen = ImageDataGenerator(preprocessing_function=preprocessing_function)

    validation_generator = validation_datagen.flow_from_directory(directory=os.path.join(os.path.sep, master_path, 'crossval', 'validation'),
                                                                  target_size=target_size,
                                                                  batch_size=batch_size,
                                                                  class_mode='categorical')
    return train_generator, validation_generator

#################################################################
#################################################################   

def get_unshuffled_labels(X, class_indices):
    """Return integer labels grouped by class, in increasing class index.

    The class of each path is its parent folder name. For every class in
    ``class_indices`` the result contains its index repeated once per path of
    that class, with the classes ordered by index.

    Parameters
    ----------
    X : pandas.Series
        File paths whose second-to-last component is the class name.
    class_indices : dict
        Mapping from class name to integer label.

    Returns
    -------
    list of int

    Raises
    ------
    ValueError
        If ``X`` contains a class that is missing from ``class_indices``.

    Notes
    -----
    The label is taken from ``class_indices[name]`` and the output is sorted
    by that value explicitly. The earlier version used the enumeration
    position of the dict key and relied on dict iteration order, and raised
    ``KeyError`` for a class with no paths in ``X``; such a class now simply
    contributes no labels.
    """
    s = X.str.split(os.sep).str[-2]
    counts = s.value_counts()

    unknown = set(counts.index) - set(class_indices)
    if unknown:
        raise ValueError("classes missing from class_indices: {}".format(sorted(unknown)))

    labels = []
    for name, idx in sorted(class_indices.items(), key=lambda kv: kv[1]):
        labels.extend([idx] * int(counts.get(name, 0)))
    return labels

#################################################################
#################################################################   

def reshape_and_concat_features(features):
    """Flatten each feature array per sample and stack them column-wise.

    Parameters
    ----------
    features : iterable of numpy.ndarray
        Arrays sharing the same first dimension (one row per sample). Each
        array is reshaped to ``(n_samples, product of remaining dims)``; any
        rank >= 1 is accepted, not only the 4-D case.

    Returns
    -------
    numpy.ndarray
        2-D array with all flattened features of a sample side by side.
    """
    reshaped_ = []
    for f in features:
        n_samples = f.shape[0]
        # Explicit width instead of -1 so that a zero-sample array still works.
        width = int(np.prod(f.shape[1:], dtype=int))
        reshaped_.append(f.reshape((n_samples, width)))

    return np.hstack(reshaped_)

#################################################################
#################################################################   

def rfccv(n_estimators, min_samples_split, max_features):
    """Objective for the Bayesian search: validation score of a random forest.

    ``n_estimators`` and ``min_samples_split`` are truncated to integers and
    ``max_features`` is capped at 0.999. The forest is fitted on the globals
    ``bft_shrink`` / ``y_t_unshuf`` and scored on ``bfv_shrink`` /
    ``y_v_unshuf``.
    """
    forest = RandomForestClassifier(
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        max_features=min(max_features, 0.999),
        random_state=2,
    )
    forest.fit(bft_shrink, y_t_unshuf)
    return forest.score(bfv_shrink, y_v_unshuf)


# **Center cropping the training data**
# 
# Test images are 512 x 512 pictures which were center cropped from the originals.
# 
# We apply the same process to all our training set.

# In[18]:


# Collect every file found below <master_path>/data.
list_paths = []
for subdir, dirs, files in os.walk(master_path + os.sep + "data"):
    for file in files:
        filepath = subdir + os.sep + file
        list_paths.append(filepath)

# Training images only; notebook checkpoint files are skipped, as in the
# other file listings of this script.
train_paths = [p for p in list_paths if "train" in p and 'ipynb' not in p]

# The class of an image is the name of the folder that contains it.
list_classes = list(set(os.path.dirname(p).split(os.sep)[-1] for p in train_paths))

dest_dir = os.path.join(os.path.sep, master_path, 'resized')

for c in list_classes:
    class_dir = os.path.join(os.path.sep, dest_dir, c)
    if not os.path.exists(class_dir):
        os.makedirs(class_dir)

# Iterate over the paths collected above. The previous loop read a `data`
# frame that is only created further down (from the *output* of this step),
# so running the script top to bottom failed with a NameError.
for frompath in train_paths:
    class_name = os.path.dirname(frompath).split(os.sep)[-1]
    stem, ext = os.path.splitext(os.path.basename(frompath))
    # Only the extension is rewritten; replacing 'tif' anywhere in the path
    # could also alter folder or file names that merely contain those letters.
    if ext.lower() in ('.tif', '.tiff'):
        ext = '.jpg'
    topath = os.path.join(os.path.sep, dest_dir, class_name, stem + ext)
    read_crop_save(frompath, topath)


# **Loading the images' paths and labels into a pandas dataframe**
# 
# This simplifies the way we handle the split between train and validation set.

# In[4]:


# Every file below <master_path>/resized, as "<folder><sep><file name>".
list_paths = [
    subdir + os.sep + file
    for subdir, dirs, files in os.walk(master_path + os.sep + "resized")
    for file in files
]

# Drop notebook checkpoint files.
list_train = [p for p in list_paths if "resized" in p and 'ipynb' not in p]

# The label of an image is the name of its parent folder.
labels = [os.path.dirname(p).split(os.sep)[-1] for p in list_train]
data = pd.DataFrame({'filename': list_train, 'class': labels})

print(data.shape)
data.head()


# **Splitting the dataset into Train and Validation**
# 
# We fix the split now and never change it, i.e. we always train and validate on the same images. A better option would be a cross-validated approach, which is of course more expensive, especially in a Deep Learning setting. In our specific case the dataset is so small that we could probably afford it, but for the sake of fast experimentation we go for a fixed split instead.

# In[6]:


# Fixed 75/25 split of paths and labels (seeded, hence reproducible).
X_train, X_val, y_train, y_val = train_test_split(
    data['filename'],
    data['class'],
    test_size=0.25,
    random_state=42,
)

print(X_train.shape, X_val.shape)

train = pd.concat([X_train, y_train], axis=1)
validation = pd.concat([X_val, y_val], axis=1)

# Materialize both subsets as class-per-folder directories for Keras.
for frame, folder in ((train, 'training'), (validation, 'validation')):
    copy_images(frame, folder)


# The code in the cell below is what you would use to perform cross validation when dealing with images (just comment out the *break* at the end). It consists of defining a KFold and, in each iteration, overwriting the contents of the train/validation folders from which Keras fetches pictures during training. Every time a new split is defined, the network learns from a new set of images and performance is assessed on the rest. The CV accuracy is the average of the validation scores.

# In[8]:


df_x, df_y = data['filename'], data['class']

skf = StratifiedKFold(n_splits=3)

# NOTE: copy_images() deletes and refills the 'training'/'validation' folders,
# so running this cell replaces whatever an earlier split put there; it also
# rebinds y_train / y_val.
for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, y_train = df_x.iloc[train_index], df_y.iloc[train_index]
    x_val, y_val = df_x.iloc[test_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    validation = pd.concat([x_val, y_val], axis=1)
    print('Train Shape:', train.shape)
    print('Validation Shape:', validation.shape)

    # copy the images according to the fold
    for frame, folder in ((train, 'training'), (validation, 'validation')):
        copy_images(frame, folder)

    break


# **Checking that the augmented image generators work as expected**

# In[7]:


t, v = define_generators(64, 'xception')

# Draw one training batch and display its fourth image.
batch_images = next(t)[0]
plt.imshow(batch_images[3])
plt.show()


# **...and here are the classes we need to predict**

# In[8]:


# Show the generator's class_indices mapping; it is inverted later to turn
# predicted indices back into class names.
t.class_indices


# ## Use pre-trained networks as feature extractors, train a Random Forest classifier on top of them and take advantage of Bayesian Optimization to tune hyperparameters: only ~30% accuracy
# 

# In[21]:


# Globals read by save_bottleneck_features().
nb_train_samples, nb_validation_samples = len(X_train), len(X_val)

bft_vgg16, bfv_vgg16 = save_bottleneck_features(1, 'vgg16')

for features in (bft_vgg16, bfv_vgg16):
    print(features.shape)

# Shape the training features take once flattened per sample.
a, b, c, d = bft_vgg16.shape
bft_vgg16.reshape((a, b * c * d)).shape


# In[23]:


bft_resnet50, bfv_resnet50 = save_bottleneck_features(1, 'resnet50')

for features in (bft_resnet50, bfv_resnet50):
    print(features.shape)

a, b, c, d = bft_resnet50.shape
bft_resnet50.reshape((a, b * c * d)).shape


# In[24]:


bft_inception_v3, bfv_inception_v3 = save_bottleneck_features(1, 'inception_v3')

for features in (bft_inception_v3, bfv_inception_v3):
    print(features.shape)

a, b, c, d = bft_inception_v3.shape
bft_inception_v3.reshape((a, b * c * d)).shape


# In[15]:


# Labels for the bottleneck features, built from the file paths of each split
# and the class -> index mapping of the generator `t` created earlier.
y_t_unshuf = get_unshuffled_labels(X_train, t.class_indices)
y_v_unshuf = get_unshuffled_labels(X_val, t.class_indices)

# Sanity check: number of labels per split.
print(len(y_t_unshuf), len(y_v_unshuf))


# In[124]:


# Flatten the training features of the three networks and put them side by side.
bft = reshape_and_concat_features([bft_vgg16, bft_resnet50, bft_inception_v3])

bft.shape


# In[128]:


# Reduce the concatenated features to 1000 principal components; the
# projection is fitted on the training features only.
pca = PCA(n_components=1000)
bft_shrink = pca.fit_transform(bft)

bft_shrink.shape


# In[133]:


# Same flattening for the validation features ...
bfv = reshape_and_concat_features([bfv_vgg16, bfv_resnet50, bfv_inception_v3])

# ... projected with the PCA fitted on the training features.
bfv_shrink = pca.transform(bfv)

print(bfv.shape, bfv_shrink.shape)


# In[134]:


# Baseline: random forest with default settings on the reduced features,
# scored on the validation features.
rf = RandomForestClassifier()
rf.fit(bft_shrink, y_t_unshuf)
rf.score(bfv_shrink, y_v_unshuf)


# In[135]:


# Score on the training features, for comparison with the validation score.
rf.score(bft_shrink, y_t_unshuf)


# In[140]:


# Extra keyword arguments forwarded to maximize() below.
gp_params = {"alpha": 1e-5}

# Search ranges for the three hyper-parameters that rfccv() accepts.
rfcBO = BayesianOptimization(
    rfccv,
    {'n_estimators': (10, 1000),
    'min_samples_split': (2, 25),
    'max_features': (0.1, 0.999)}
)

rfcBO.maximize(n_iter=25, **gp_params)
print('-' * 53)
print('Final Results')
# NOTE(review): presumably the best rfccv() value found; the layout of `res`
# depends on the installed bayes_opt version - confirm.
print('RFC: %f' % rfcBO.res['max']['max_val'])


# In[146]:


rfcBO.res['max']


# ## Fine tune a pre-trained Xception network: only ~50% accuracy

# In[9]:


xception_b, callbacks_xception_b = build_on_bottleneck('xception')


# In[23]:


batch_size = 64
train_gen, valid_gen = define_generators(batch_size, 'xception')

# Train the new head on top of the frozen Xception base.
xception_b.fit_generator(
    train_gen,
    steps_per_epoch=len(X_train) // batch_size,
    validation_data=valid_gen,
    validation_steps=len(X_val) // batch_size,
    epochs=15,
    verbose=2,
    callbacks=callbacks_xception_b,
)


# In[26]:


# Layers from index 75 on become trainable; fine_tune() returns an uncompiled
# model, so it is compiled here with a small SGD learning rate.
xception_ft, callbacks_xception_ft = fine_tune('xception', 75)
xception_ft.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(lr=0.0001, momentum=0.9),
    metrics=['accuracy'],
)


# In[27]:


batch_size = 64
train_gen, valid_gen = define_generators(batch_size, 'xception')

xception_ft.fit_generator(
    train_gen,
    steps_per_epoch=len(X_train) // batch_size,
    validation_data=valid_gen,
    validation_steps=len(X_val) // batch_size,
    epochs=15,
    verbose=2,
    callbacks=callbacks_xception_ft,
)


# In[29]:


# Recompile with a ten times larger learning rate and train 3 more epochs.
xception_ft.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(lr=0.001, momentum=0.9),
    metrics=['accuracy'],
)

xception_ft.fit_generator(
    train_gen,
    steps_per_epoch=len(X_train) // batch_size,
    validation_data=valid_gen,
    validation_steps=len(X_val) // batch_size,
    epochs=3,
    verbose=2,
    callbacks=callbacks_xception_ft,
)


# ## Fine tune a pre-trained VGG16 network: only ~40% accuracy

# In[15]:


vgg16_b, callbacks_vgg16_b = build_on_bottleneck('vgg16')
# Call vgg16_b.summary() here to inspect the architecture.


# In[16]:


df_x, df_y = data['filename'], data['class']
batch_size = 64

skf = StratifiedKFold(n_splits=3)

# Only the first fold is used: the loop is left after one iteration.
for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, y_train = df_x.iloc[train_index], df_y.iloc[train_index]
    x_val, y_val = df_x.iloc[test_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    validation = pd.concat([x_val, y_val], axis=1)
    print('Train Shape:', train.shape)
    print('Validation Shape:', validation.shape)

    # copy the images according to the fold
    for frame, folder in ((train, 'training'), (validation, 'validation')):
        copy_images(frame, folder)

    print('**** Running fold {}'.format(i))

    train_gen, valid_gen = define_generators(batch_size, 'vgg16')

    vgg16_b.fit_generator(
        train_gen,
        steps_per_epoch=len(x_train) // batch_size,
        validation_data=valid_gen,
        validation_steps=len(x_val) // batch_size,
        epochs=10,
        verbose=2,
        callbacks=callbacks_vgg16_b,
    )
    break


# ## Fine tune a pre-trained ResNet50 network: playing around with learning rates, optimizers, freezing/un-freezing layers we jump to ~90% accuracy

# In[30]:


resnet50_b, callbacks_resnet50_b = build_on_bottleneck('resnet50')
# Call resnet50_b.summary() here to inspect the architecture.


# In[37]:


df_x, df_y = data['filename'], data['class']
batch_size = 64

skf = StratifiedKFold(n_splits=4)

# Only the first fold is used: the loop is left after one iteration.
for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, y_train = df_x.iloc[train_index], df_y.iloc[train_index]
    x_val, y_val = df_x.iloc[test_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    validation = pd.concat([x_val, y_val], axis=1)
    print('Train Shape:', train.shape)
    print('Validation Shape:', validation.shape)

    # copy the images according to the fold
    for frame, folder in ((train, 'training'), (validation, 'validation')):
        copy_images(frame, folder)

    print('**** Running fold {}'.format(i))

    train_gen, valid_gen = define_generators(batch_size, 'resnet50')

    resnet50_b.fit_generator(
        train_gen,
        steps_per_epoch=len(x_train) // batch_size,
        validation_data=valid_gen,
        validation_steps=len(x_val) // batch_size,
        epochs=10,
        verbose=2,
        callbacks=callbacks_resnet50_b,
    )
    break


# In[10]:


# Layers from index 78 on become trainable.
resnet50_ft, callbacks_resnet50_ft = fine_tune('resnet50', 78)


# In[16]:


resnet50_ft.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(lr=0.0001, momentum=0.9),
    metrics=['accuracy'],
)


# In[17]:


df_x, df_y = data['filename'], data['class']
batch_size = 64

skf = StratifiedKFold(n_splits=4)

# Only the first fold is used: the loop is left after one iteration.
for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, y_train = df_x.iloc[train_index], df_y.iloc[train_index]
    x_val, y_val = df_x.iloc[test_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    validation = pd.concat([x_val, y_val], axis=1)
    print('Train Shape:', train.shape)
    print('Validation Shape:', validation.shape)

    # copy the images according to the fold
    for frame, folder in ((train, 'training'), (validation, 'validation')):
        copy_images(frame, folder)

    print('**** Running fold {}'.format(i))

    train_gen, valid_gen = define_generators(batch_size, 'resnet50')

    resnet50_ft.fit_generator(
        train_gen,
        steps_per_epoch=len(x_train) // batch_size,
        validation_data=valid_gen,
        validation_steps=len(x_val) // batch_size,
        epochs=10,
        verbose=2,
        callbacks=callbacks_resnet50_ft,
    )
    break


# In[18]:


# Ten more epochs with the same compile settings, reusing the generators and
# the fold sizes (x_train / x_val) left over from the previous cell.
resnet50_ft.fit_generator(train_gen,
                        steps_per_epoch=x_train.shape[0]//batch_size,
                        epochs=10,
                        validation_data=valid_gen,
                        verbose=2,
                        validation_steps=x_val.shape[0]//batch_size,
                        callbacks=callbacks_resnet50_ft)


# In[19]:


# Switch the optimizer to adam and train 3 more epochs.
resnet50_ft.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

resnet50_ft.fit_generator(
    train_gen,
    steps_per_epoch=len(x_train) // batch_size,
    validation_data=valid_gen,
    validation_steps=len(x_val) // batch_size,
    epochs=3,
    verbose=2,
    callbacks=callbacks_resnet50_ft,
)


# In[20]:


# Go back to the best fine-tuned checkpoint. The path is built from
# master_path, i.e. the location the ModelCheckpoint created in fine_tune()
# writes to, instead of './IEEE/...', which only resolved correctly when the
# working directory happened to be the parent of master_path.
resnet50_ft.load_weights(os.path.join(os.path.sep, master_path, 'resnet50FineTuneWeights.best.hdf5'))

resnet50_ft.compile(optimizer=SGD(lr=0.001, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

resnet50_ft.fit_generator(train_gen,
                        steps_per_epoch=x_train.shape[0]//batch_size,
                        epochs=3,
                        validation_data=valid_gen,
                        verbose=2,
                        validation_steps=x_val.shape[0]//batch_size,
                        callbacks=callbacks_resnet50_ft)


# In[21]:


# Three more epochs with the same compile settings, generators and fold sizes.
resnet50_ft.fit_generator(train_gen,
                        steps_per_epoch=x_train.shape[0]//batch_size,
                        epochs=3,
                        validation_data=valid_gen,
                        verbose=2,
                        validation_steps=x_val.shape[0]//batch_size,
                        callbacks=callbacks_resnet50_ft)


# In[14]:


batch_size = 32

# Rebuild the model with every layer from index 2 on trainable.
resnet50_ft, callbacks_resnet50_ft = fine_tune('resnet50', 2)

# Continue from the best fine-tuned checkpoint. The path is built from
# master_path, i.e. the location the ModelCheckpoint created in fine_tune()
# writes to, instead of './IEEE/...', which only resolved correctly when the
# working directory happened to be the parent of master_path.
resnet50_ft.load_weights(os.path.join(os.path.sep, master_path, 'resnet50FineTuneWeights.best.hdf5'))

train_gen, valid_gen = define_generators(batch_size, 'resnet50')

resnet50_ft.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

resnet50_ft.fit_generator(train_gen,
                        steps_per_epoch=X_train.shape[0]//batch_size,
                        epochs=10,
                        validation_data=valid_gen,
                        verbose=2,
                        validation_steps=X_val.shape[0]//batch_size,
                        callbacks=callbacks_resnet50_ft)


# ## Fine tune a pre-trained Inception V3 network: only ~60% accuracy

# In[45]:


inception_v3_b, callbacks_inception_v3_b = build_on_bottleneck('inception_v3')
# Call inception_v3_b.summary() here to inspect the architecture.


# In[46]:


df_x, df_y = data['filename'], data['class']
batch_size = 64

skf = StratifiedKFold(n_splits=4)

# Only the first fold is used: the loop is left after one iteration.
for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, y_train = df_x.iloc[train_index], df_y.iloc[train_index]
    x_val, y_val = df_x.iloc[test_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    validation = pd.concat([x_val, y_val], axis=1)
    print('Train Shape:', train.shape)
    print('Validation Shape:', validation.shape)

    # copy the images according to the fold
    for frame, folder in ((train, 'training'), (validation, 'validation')):
        copy_images(frame, folder)

    print('**** Running fold {}'.format(i))

    train_gen, valid_gen = define_generators(batch_size, 'inception_v3')

    inception_v3_b.fit_generator(
        train_gen,
        steps_per_epoch=len(x_train) // batch_size,
        validation_data=valid_gen,
        validation_steps=len(x_val) // batch_size,
        epochs=10,
        verbose=2,
        callbacks=callbacks_inception_v3_b,
    )
    break


# In[47]:


# Layers from index 249 on become trainable.
inception_v3_ft, callbacks_inception_v3_ft = fine_tune('inception_v3', 249)


# In[48]:


inception_v3_ft.compile(
    loss='categorical_crossentropy',
    optimizer=SGD(lr=0.0001, momentum=0.9),
    metrics=['accuracy'],
)


# In[49]:


df_x, df_y = data['filename'], data['class']
batch_size = 64

skf = StratifiedKFold(n_splits=4)

# Only the first fold is used: the loop is left after one iteration.
for i, (train_index, test_index) in enumerate(skf.split(df_x, df_y)):
    x_train, y_train = df_x.iloc[train_index], df_y.iloc[train_index]
    x_val, y_val = df_x.iloc[test_index], df_y.iloc[test_index]

    train = pd.concat([x_train, y_train], axis=1)
    validation = pd.concat([x_val, y_val], axis=1)
    print('Train Shape:', train.shape)
    print('Validation Shape:', validation.shape)

    # copy the images according to the fold
    for frame, folder in ((train, 'training'), (validation, 'validation')):
        copy_images(frame, folder)

    print('**** Running fold {}'.format(i))

    train_gen, valid_gen = define_generators(batch_size, 'inception_v3')

    inception_v3_ft.fit_generator(
        train_gen,
        steps_per_epoch=len(x_train) // batch_size,
        validation_data=valid_gen,
        validation_steps=len(x_val) // batch_size,
        epochs=15,
        verbose=2,
        callbacks=callbacks_inception_v3_ft,
    )
    break


# ## Kaggle submission using ResNet50

# In[36]:


# Every file below <master_path>/data/test.
list_paths_t = [
    subdir + os.sep + file
    for subdir, dirs, files in os.walk(master_path + os.sep + "data" + os.sep + 'test')
    for file in files
]


# In[37]:


# Keep the test images only, skipping notebook checkpoint files.
list_test = [p for p in list_paths_t if "test/test" in p and '.ipynb' not in p]

# Inverse of the generator mapping: integer index -> class name.
classes = {index: name for name, index in t.class_indices.items()}


# In[39]:


from keras.preprocessing import image
from keras.applications.resnet50 import preprocess_input

# File name -> predicted class name, one prediction per test image.
final_dict = {}

for img_path in list_test:
    img = image.load_img(img_path, target_size=(224, 224))
    # Batch of one, preprocessed the ResNet50 way.
    x = preprocess_input(np.expand_dims(image.img_to_array(img), axis=0))

    pred = resnet50_ft.predict(x)
    c = classes[np.argmax(pred, axis=1)[0]]
    final_dict[img_path.split(os.sep)[-1]] = c


# In[40]:


# One row per test image: file name and predicted class name.
kaggle = pd.DataFrame.from_dict(final_dict, orient='index').reset_index()
kaggle.columns = ['fname', 'camera']
# Written under master_path like every other artefact of this script;
# './IEEE/...' depended on the current working directory.
kaggle.to_csv(os.path.join(os.path.sep, master_path, 'for_kaggle.csv'), index=False)
kaggle.head(10)


# In[41]:


kaggle.shape