Imports

In [1]:
import cv2
import numpy as np
import pandas as pd
from random import shuffle
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
from sklearn import metrics
from PIL import Image
import pickle

from mpl_toolkits.axes_grid1 import ImageGrid

import keras
from keras.models import Model
from keras.optimizers import Adam, SGD
from keras.applications.inception_v3 import InceptionV3
from keras.applications.densenet import DenseNet121
from keras.preprocessing import image
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, Callback, CSVLogger

from keras.preprocessing.image import ImageDataGenerator, array_to_img
from keras.applications.vgg16 import VGG16
from keras.applications.resnet50 import ResNet50
from keras.applications.xception import Xception
from keras.models import Model, Sequential
from keras.utils import to_categorical
from keras.layers import Input, Dense, BatchNormalization, Flatten, Dropout, Convolution2D, Activation, MaxPooling2D, GlobalAveragePooling2D
from random import shuffle

import math

from keras import losses

# Root folder of the competition data: it is expected to contain
# `train_labels.csv` and the `train/` image directory used by the cells below.
# Set the INVASIVE_DATA_DIR environment variable to point the notebook at a
# different location without editing the code.
master_path = os.environ.get("INVASIVE_DATA_DIR", "/home/paperspace/Invasive")
Using TensorFlow backend.
/home/paperspace/anaconda3/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)

Helper Functions

In [79]:
def load_train(path):
    """Load the training labels and resolve every image id to a file path.

    Reads ``train_labels.csv`` from the module-level ``master_path`` folder.
    The first CSV column is treated as the numeric image id; each id is turned
    into ``<path>/<id>.jpg`` and the resulting paths are stored in the
    ``name`` column of the returned frame.

    Parameters
    ----------
    path : str
        Directory that contains the ``<id>.jpg`` training images.

    Returns
    -------
    train_files : list of str
        One image path per CSV row, in CSV order.
    train_set : pandas.DataFrame
        The label table with ``name`` holding the image paths.
    train_label : numpy.ndarray
        Values of the ``invasive`` column.
    """
    # No leading os.sep here: it is a no-op for an absolute master_path and
    # would wrongly re-root a relative one at the filesystem root.
    train_set = pd.read_csv(os.path.join(master_path, 'train_labels.csv'))
    train_label = np.array(train_set['invasive'])

    # Vectorised id -> "<id>" conversion instead of a per-row iloc lookup.
    image_ids = train_set.iloc[:, 0].astype(int).astype(str)
    train_files = [path + os.sep + image_id + '.jpg' for image_id in image_ids]

    train_set['name'] = train_files
    return train_files, train_set, train_label

############################################################################################
############################################################################################

def augment(src, choice):
    """Apply one of a fixed set of flips/rotations to an image array.

    ``np.rot90`` rotates counter-clockwise, so the codes map to:

    * 0 -- rotate 90 degrees counter-clockwise
    * 1 -- flip vertically (up/down)
    * 2 -- rotate 180 degrees
    * 3 -- flip horizontally (left/right)
    * 4 -- rotate 270 degrees counter-clockwise (i.e. 90 degrees clockwise)
    * 5 -- rotate 180 degrees, then flip horizontally (this yields the same
      pixels as code 1)
    * any other value (6 is the conventional "leave as is") -- return ``src``
      unchanged

    Parameters
    ----------
    src : numpy.ndarray
        Image array with at least two dimensions.
    choice : int
        Transformation code from the list above.

    Returns
    -------
    numpy.ndarray
        The transformed array, or ``src`` itself when no code matches.
    """
    transforms = (
        (0, lambda arr: np.rot90(arr, 1)),
        (1, np.flipud),
        (2, lambda arr: np.rot90(arr, 2)),
        (3, np.fliplr),
        (4, lambda arr: np.rot90(arr, 3)),
        (5, lambda arr: np.fliplr(np.rot90(arr, 2))),
    )
    for code, transform in transforms:
        if choice == code:
            return transform(src)
    return src

############################################################################################
############################################################################################

def read_augment_save(frompath, topath):
    """Write a randomly flipped/rotated copy of an image to a new file.

    The transformation code is drawn with ``np.random.randint(6)``, i.e. from
    0..5, so the identity code (6) of ``augment`` is never selected here.

    Parameters
    ----------
    frompath : str
        Path of the image to read.
    topath : str
        Destination path; the output format is chosen by PIL on save.

    Returns
    -------
    None
    """
    # The context manager releases the source file handle as soon as the
    # pixels have been copied into the array.
    with Image.open(frompath) as img:
        pixels = np.array(img)
    pixels = augment(pixels, np.random.randint(6))
    Image.fromarray(pixels.astype(np.uint8)).save(topath)

############################################################################################
############################################################################################

def _preprocess_single(x, preprocess_input):
    """Run a Keras ``preprocess_input`` on one image.

    The image is wrapped into a batch of one (new leading axis), passed to
    ``preprocess_input`` and the single processed image is unwrapped again.
    """
    batch = np.expand_dims(x, axis=0)
    return preprocess_input(batch)[0]


def preprocess_input_resnet50(x):
    """Preprocess one image array with the ResNet50 ``preprocess_input``."""
    # Imported lazily so only the selected network's module is loaded.
    from keras.applications.resnet50 import preprocess_input
    return _preprocess_single(x, preprocess_input)


def preprocess_input_vgg16(x):
    """Preprocess one image array with the VGG16 ``preprocess_input``."""
    from keras.applications.vgg16 import preprocess_input
    return _preprocess_single(x, preprocess_input)


def preprocess_input_inception(x):
    """Preprocess one image array with the InceptionV3 ``preprocess_input``."""
    from keras.applications.inception_v3 import preprocess_input
    return _preprocess_single(x, preprocess_input)


def preprocess_input_densenet(x):
    """Preprocess one image array with the DenseNet ``preprocess_input``."""
    from keras.applications.densenet import preprocess_input
    return _preprocess_single(x, preprocess_input)

############################################################################################
############################################################################################

def prepare_image(filepath, size, preprocessing_function):
    """Load one image and turn it into a network-ready array.

    Steps: open the file, resize to ``size``, convert to a ``float64`` array,
    apply a random ``augment`` transformation (codes 0..6, so the image may
    also be left unchanged) and finally run ``preprocessing_function``.

    Parameters
    ----------
    filepath : str
        Path of the image file.
    size : tuple of int
        Target size passed to ``PIL.Image.Image.resize`` -- (width, height).
    preprocessing_function : callable
        Called with the augmented array; its result is returned.

    Returns
    -------
    Whatever ``preprocessing_function`` returns for the augmented array.
    """
    with Image.open(filepath) as source:
        # Image.LANCZOS is the same filter the removed Image.ANTIALIAS alias
        # pointed to; ANTIALIAS no longer exists in Pillow >= 10.
        resized = source.resize(size, Image.LANCZOS)
    pixels = np.array(resized).astype(np.float64)
    pixels = augment(pixels, np.random.randint(7))
    return preprocessing_function(pixels)

############################################################################################
############################################################################################

def _iter_batches(data, preprocessing_function, size, batch_size):
    """Yield ``(x_batch, y_batch)`` pairs forever, cycling over ``data``."""
    while True:
        for start in range(0, len(data), batch_size):
            # The last batch of a pass may be shorter than batch_size.
            end = min(start + batch_size, len(data))
            x_batch = []
            y_batch = []
            for filepath, tag in data[start:end].values:
                x_batch.append(prepare_image(filepath, size, preprocessing_function))
                y_batch.append(tag)
            yield np.array(x_batch), np.array(y_batch, np.uint8)


def data_generator(data, which_net, size=(800,800), batch_size=8):
    """Create an endless batch generator for training or prediction.

    Rows are visited in their existing order on every pass (no shuffling);
    each image goes through ``prepare_image``, which applies a random
    flip/rotation on every call.

    Parameters
    ----------
    data : pandas.DataFrame
        Two-column table whose rows are ``(image path, label)``.
    which_net : str
        One of ``'resnet50'``, ``'densenet'``, ``'inception'``, ``'vgg'``;
        selects the matching ``preprocess_input_*`` function.
    size : tuple of int, optional
        Target image size forwarded to ``prepare_image``.
    batch_size : int, optional
        Maximum number of samples per yielded batch.

    Returns
    -------
    generator
        Yields ``(x_batch, y_batch)`` where ``x_batch`` stacks the prepared
        images and ``y_batch`` is a ``uint8`` array of the labels.

    Raises
    ------
    ValueError
        If ``which_net`` is unknown, ``batch_size`` is smaller than 1, or
        ``data`` is empty. Arguments are checked when the generator is
        created, so mistakes surface immediately instead of inside the
        consumer; an empty ``data`` would otherwise loop forever without
        yielding anything.
    """
    if which_net == 'resnet50':
        preprocessing_function = preprocess_input_resnet50
    elif which_net == 'densenet':
        preprocessing_function = preprocess_input_densenet
    elif which_net == 'inception':
        preprocessing_function = preprocess_input_inception
    elif which_net == 'vgg':
        preprocessing_function = preprocess_input_vgg16
    else:
        raise ValueError(
            "Unknown which_net %r; expected 'resnet50', 'densenet', "
            "'inception' or 'vgg'" % (which_net,))

    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got %r" % (batch_size,))
    if len(data) == 0:
        raise ValueError("data is empty; there is nothing to generate batches from")

    return _iter_batches(data, preprocessing_function, size, batch_size)

############################################################################################
############################################################################################            
            
def get_model(which_net, img_dim=(800,800,3)):
    """Build a binary classifier on top of an ImageNet-pretrained backbone.

    Layer order: ``Input`` -> ``BatchNormalization`` -> backbone (without its
    top) -> ``GlobalAveragePooling2D`` -> ``Dropout(0.5)`` ->
    ``Dense(1, sigmoid)``.

    Parameters
    ----------
    which_net : str
        Backbone name: ``'resnet50'``, ``'inception'``, ``'densenet'`` or
        ``'vgg'`` (the same names ``data_generator`` accepts).
    img_dim : tuple of int, optional
        Input shape passed both to the backbone and to the ``Input`` layer.

    Returns
    -------
    keras.models.Model
        The assembled, uncompiled model.

    Raises
    ------
    ValueError
        If ``which_net`` is not one of the supported names.
    """
    if which_net == 'resnet50':
        backbone_cls = ResNet50
    elif which_net == 'inception':
        backbone_cls = InceptionV3
    elif which_net == 'densenet':
        backbone_cls = DenseNet121
    elif which_net == 'vgg':
        backbone_cls = VGG16
    else:
        raise ValueError(
            "Unknown which_net %r; expected 'resnet50', 'inception', "
            "'densenet' or 'vgg'" % (which_net,))

    base_model = backbone_cls(include_top=False, weights='imagenet', input_shape=img_dim)

    input_tensor = Input(shape=img_dim)
    bn = BatchNormalization()(input_tensor)
    x = base_model(bn)
    x = GlobalAveragePooling2D()(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)
    return Model(input_tensor, output)

Loading train data

In [9]:
# Build the path/label table for the images stored in the `train` folder.
train_dir = master_path + os.sep + 'train'
train_files, train_set, train_label = load_train(train_dir)

# Preview the first rows of the table.
train_set.head()
Out[9]:
name invasive
0 /home/paperspace/Invasive/train/1.jpg 0
1 /home/paperspace/Invasive/train/2.jpg 0
2 /home/paperspace/Invasive/train/3.jpg 1
3 /home/paperspace/Invasive/train/4.jpg 0
4 /home/paperspace/Invasive/train/5.jpg 1
In [5]:
# (number of rows, number of columns) of the label table.
train_set.shape
Out[5]:
(2295, 2)

Checking class imbalance

In [6]:
# Number of rows per value of `invasive`, in group order (0 first, then 1),
# which is the order the hard-coded bar labels below rely on.
class_counts = train_set.groupby(['invasive'], as_index=False).count()['name']

f, ax = plt.subplots(1, 1, figsize=(8, 6))
ax = sns.barplot(x=['Not Invasive', 'Invasive'], y=class_counts)
ax.set(ylabel='Count of Samples')
plt.show()

Fixing class imbalance with random image augmentation

In [10]:
# Start again from the original label table so this cell can be re-run safely.
train_files, train_set, train_label = load_train(master_path + os.sep + 'train')

# Number of extra "not invasive" images to synthesise.
not_inv_to_add = 601
not_inv = train_set.loc[train_set.invasive == 0, :].sample(not_inv_to_add, random_state=3)

# Highest image id in the label file; new files are numbered right after it.
max_file = int(pd.read_csv(os.path.join(master_path, 'train_labels.csv')).name.max())

# Write one randomly flipped/rotated copy per sampled image (column 0 holds
# the source path) and remember where each copy was saved.
augmented_paths = []
for offset, frompath in enumerate(not_inv.iloc[:, 0], start=1):
    topath = os.path.join(master_path, 'train', str(max_file + offset) + '.jpg')
    read_augment_save(frompath, topath)
    augmented_paths.append(topath)

# Append all new rows in one go: DataFrame.append no longer exists in
# pandas >= 2.0 and growing a frame row by row is quadratic.
augmented_rows = pd.DataFrame(
    {'name': augmented_paths, 'invasive': [0] * len(augmented_paths)}
)[['name', 'invasive']]
train_set = pd.concat([train_set, augmented_rows], ignore_index=True)
# NOTE: train_files and train_label are not extended here; only train_set
# contains the augmented rows.
In [11]:
# Persist the balanced table so it can be reloaded without redoing the
# augmentation step.
train_set_pickle = os.path.join(master_path, 'train_set.pkl')
train_set.to_pickle(train_set_pickle)

# Re-plot the per-class sample counts after the augmentation step.
f, ax = plt.subplots(1,1,figsize=(8,6))
ax = sns.barplot(x=['Not Invasive','Invasive'], y=train_set.groupby(['invasive'],as_index=False).count()['name'])
ax.set(ylabel='Count of Samples')
plt.show()

Showing some training images

In [12]:
# Draw a 4x4 grid of randomly sampled training images, each tagged with its
# class label.
invas_dict = {0: 'Not Invasive', 1: 'Invasive'}
fig = plt.figure(1, figsize=(16, 16))
grid = ImageGrid(fig, 111, nrows_ncols=(4, 4), axes_pad=0.05)

sampled_rows = train_set.sample(16).values
for idx, (sample_path, sample_label) in enumerate(sampled_rows):
    panel = grid[idx]
    panel.imshow(image.load_img(sample_path))
    # White-on-black caption so the label stays readable on any picture.
    caption = 'LABEL: %s' % invas_dict[sample_label]
    panel.text(10, 200, caption, color='w', backgroundcolor='k', alpha=0.8)
    panel.axis('off')
plt.show()