This tutorial shows how to fine-tune a pre-trained SqueezeNet model for dog-vs-cat classification on a GPU using the Caffe2 framework.
This notebook assumes an nvidia-docker environment. Install nvidia-docker
on a GPU host. Launch a caffe2ai/caffe2
instance to run this notebook.
nvidia-docker run -it --rm -p 8888:8888 caffe2ai/caffe2:c2v0.8.1.cuda8.cudnn7.ubuntu16.04 \
jupyter notebook --no-browser --ip "*" --allow-root
We use the dog-vs-cats dataset from Kaggle. Create a Kaggle account, login, and download the dataset from the following URL:
https://www.kaggle.com/c/dogs-vs-cats/data
Once you have downloaded the dataset, extract the archives and organize the data in the following location.
mkdir dogs-vs-cats/
unzip train.zip -d dogs-vs-cats/
unzip test1.zip -d dogs-vs-cats/
rm train.zip test1.zip
Caffe2 v0.8.1 has a bug in its model download module. Fix the code and download SqueezeNet as follows.
sed -i "s,s3.amazonaws.com/caffe2/,s3.amazonaws.com/download.caffe2.ai/," \
/usr/local/caffe2/python/models/download.py
python -m caffe2.python.models.download -i squeezenet
The following code transforms an image to the desired input format for SqueezeNet.
import numpy as np
import skimage.io
import skimage.transform
def rescale(img, input_height, input_width):
    """Resize ``img`` while preserving its aspect ratio.

    The shorter side of the image is scaled to its matching target dimension
    and the longer side is scaled proportionally (truncated to an int), so the
    result can afterwards be center-cropped to the target size.

    Args:
        img: Image array whose first two axes are (height, width).
        input_height: Target height in pixels.
        input_width: Target width in pixels.

    Returns:
        The array returned by ``skimage.transform.resize``.
    """
    aspect = img.shape[1] / float(img.shape[0])  # width / height
    # resize() takes output_shape as (rows, cols), i.e. (height, width), so the
    # height-related value must come first. With a square target (as used by
    # prepare_image) this yields exactly the same shapes as before.
    if aspect > 1:
        # Landscape: pin the height, let the width follow the aspect ratio.
        return skimage.transform.resize(img, (input_height, int(aspect * input_height)))
    elif aspect < 1:
        # Portrait: pin the width, let the height follow the aspect ratio.
        return skimage.transform.resize(img, (int(input_width / aspect), input_width))
    else:
        return skimage.transform.resize(img, (input_height, input_width))
def crop_center(img, cropx, cropy):
    """Return the central ``cropy`` x ``cropx`` window of ``img``.

    Args:
        img: Array whose first two axes are (height, width); any trailing
            axes (e.g. channels) are kept untouched, so both color and
            grayscale arrays are accepted.
        cropx: Width of the crop window.
        cropy: Height of the crop window.

    Returns:
        A slice (view) of ``img`` covering the centered window.
    """
    # Only the two spatial axes are needed; unpacking exactly three values
    # would fail on 2-D (grayscale) input.
    height, width = img.shape[:2]
    startx = width // 2 - (cropx // 2)
    starty = height // 2 - (cropy // 2)
    return img[starty:starty + cropy, startx:startx + cropx]
def prepare_image(img_path):
    """Load an image file and convert it to the network's input layout.

    The image is read, converted to float, rescaled and center-cropped to
    227x227, reordered from HWC to CHW, switched from RGB to BGR channel
    order, and shifted by ``* 255 - 128``.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A ``float32`` array in channel-first layout.
    """
    pixels = skimage.img_as_float(skimage.io.imread(img_path))
    pixels = crop_center(rescale(pixels, 227, 227), 227, 227)
    channel_first = pixels.transpose(2, 0, 1)  # HWC to CHW dimension
    bgr = channel_first[[2, 1, 0]]  # RGB to BGR color order
    shifted = bgr * 255 - 128  # Subtract mean = 128
    return shifted.astype(np.float32)
During training, we will use the following object to read batches of image and label pairs.
import os, glob, random
def make_batch(iterable, batch_size=1):
    """Yield consecutive slices of ``iterable`` holding up to ``batch_size`` items.

    ``iterable`` must support ``len()`` and slicing. The final slice is
    shorter when the length is not a multiple of ``batch_size``.
    """
    total = len(iterable)
    for start in range(0, total, batch_size):
        # Slicing clamps at the end of the sequence on its own.
        yield iterable[start:start + batch_size]
class DogsCatsDataset(object):
    """Dogs and cats dataset reader.

    Collects every ``*.jpg`` file under ``<data_dir>/<split>/`` and derives the
    label from the file name prefix before the first dot: ``dog`` -> 0,
    ``cat`` -> 1, anything else -> -1.
    """

    def __init__(self, split="train", data_dir="dogs-vs-cats/"):
        """Index the image files of ``split`` and compute their labels."""
        self.categories = {"dog": 0, "cat": 1}
        self.image_files = list(glob.glob(os.path.join(data_dir, split, "*.jpg")))
        self.labels = [self.categories.get(os.path.basename(path).strip().split(".")[0], -1)
                       for path in self.image_files]

    def __getitem__(self, index):
        """Return the ``(preprocessed image, label)`` pair at ``index``."""
        image = prepare_image(self.image_files[index])
        label = self.labels[index]
        return image, label

    def __len__(self):
        """Return the number of indexed images."""
        return len(self.labels)

    def read(self, batch_size=50, shuffle=True):
        """Read (image, label) pairs in batch.

        Args:
            batch_size: Maximum number of samples per batch.
            shuffle: When true, visit the samples in random order.

        Yields:
            ``(images, labels)`` where ``images`` is a stacked ``float32``
            array and ``labels`` is a 1-D ``int32`` array. The last batch is
            shorter when the dataset size is not a multiple of ``batch_size``.
        """
        order = list(range(len(self)))
        if shuffle:
            random.shuffle(order)
        for batch in make_batch(order, batch_size):
            images, labels = [], []
            for index in batch:
                image, label = self[index]
                images.append(image)
                labels.append(label)
            # Size the label vector from the batch itself: reshaping to
            # (batch_size,) raises ValueError on a short final batch.
            yield (np.stack(images).astype(np.float32),
                   np.stack(labels).astype(np.int32).reshape((len(labels),)))
In the following, we will modify SqueezeNet structure so that we can apply the pre-trained model to dogs-vs-cats categorization problem. We do not really need to change the kind of operators in the prediction module but need to change how to initialize model parameters as well as the output shape. In Caffe2, net definition is given in the protocol buffer. We can directly modify the protocol buffer and add any other necessary operators using model_helper.
The basic steps are:
from caffe2.python import core, workspace, model_helper, optimizer, brew
from caffe2.python.modeling import initializers
from caffe2.python.modeling.parameter_info import ParameterTags
from caffe2.proto import caffe2_pb2
# Serialized SqueezeNet protobuf files: PREDICT_NET is parsed by AddPredictNet,
# INIT_NET by AddInitNet. Presumably written by the
# `python -m caffe2.python.models.download -i squeezenet` step - confirm the
# install location on your system.
PREDICT_NET = "/usr/local/caffe2/python/models/squeezenet/predict_net.pb"
INIT_NET = "/usr/local/caffe2/python/models/squeezenet/init_net.pb"
def AddPredictNet(model, predict_net_path):
    """Load a serialized predict net and install it as ``model.net``.

    Args:
        model: Model helper whose ``net`` attribute is replaced.
        predict_net_path: Path of the binary ``NetDef`` protobuf file.
    """
    net_def = caffe2_pb2.NetDef()
    with open(predict_net_path, "rb") as stream:
        serialized = stream.read()
    net_def.ParseFromString(serialized)
    model.net = core.Net(net_def)
    # Fix dimension incompatibility: squeeze axes 2 and 3 of "softmaxout" into
    # a new "softmax" blob (presumably N x C x 1 x 1 -> N x C; confirm).
    model.Squeeze("softmaxout", "softmax", dims=[2, 3])
def AddInitNet(model, init_net_path, out_dim=2, params_to_learn=None):
    """Load the pre-trained initializers and replace the conv10 ones.

    Every selected initializer output is registered on ``model`` as a
    learnable parameter. The pre-trained ``conv10_w`` / ``conv10_b``
    initializers are dropped and replaced with fresh fills sized for
    ``out_dim`` outputs, and the result becomes ``model.param_init_net``.

    Args:
        model: Model helper to register parameters on.
        init_net_path: Path of the binary ``NetDef`` protobuf file.
        out_dim: Number of output channels of the new conv10 layer.
        params_to_learn: Names of parameters to register as learnable, or
            ``None`` to register all of them.
    """
    init_net_proto = caffe2_pb2.NetDef()
    with open(init_net_path, "rb") as f:
        init_net_proto.ParseFromString(f.read())

    # Define params to learn in the model.
    for op in init_net_proto.op:
        param_name = op.output[0]
        if params_to_learn is None or param_name in params_to_learn:
            tags = (ParameterTags.WEIGHT if param_name.endswith("_w")
                    else ParameterTags.BIAS)
            model.create_param(
                param_name=param_name,
                # NOTE(review): this passes the op's first argument proto as
                # the shape, not a list of ints - confirm that is intended.
                shape=op.arg[0],
                initializer=initializers.ExternalInitializer(),
                tags=tags,
            )

    # Remove the pre-trained conv10_w / conv10_b initializers. They are looked
    # up by output name rather than by fixed position (50, 51), so a different
    # op ordering cannot make this drop the wrong initializers. Iterating
    # backwards keeps the remaining indices valid while popping.
    replaced = ("conv10_w", "conv10_b")
    for index in reversed(range(len(init_net_proto.op))):
        if init_net_proto.op[index].output[0] in replaced:
            init_net_proto.op.pop(index)

    # Add new initializers for conv10_w, conv10_b
    model.param_init_net = core.Net(init_net_proto)
    model.param_init_net.XavierFill([], "conv10_w", shape=[out_dim, 512, 1, 1])
    model.param_init_net.ConstantFill([], "conv10_b", shape=[out_dim])
def AddTrainingOperators(model, softmax, label):
    """Append loss, accuracy, gradient and SGD update operators to ``model``.

    Args:
        model: Model helper whose nets are extended.
        softmax: Name of the blob holding the class probabilities.
        label: Name of the blob holding the labels.
    """
    cross_entropy = model.LabelCrossEntropy([softmax, label], "xent")
    mean_loss = model.AveragedLoss(cross_entropy, "loss")
    brew.accuracy(model, [softmax, label], "accuracy")
    model.AddGradientOperators([mean_loss])
    sgd = optimizer.build_sgd(model, base_learning_rate=0.1)
    # NOTE(review): build_sgd may already attach the update operators for each
    # optimization parameter; confirm this loop does not apply them twice.
    for param_info in model.GetOptimizationParamInfo():
        sgd(model.net, model.param_init_net, param_info)
# Assemble the fine-tuning model: the pre-trained graph, its initializers with
# a fresh 2-output conv10 (AddInitNet's default out_dim), then the
# loss / gradient / SGD operators reading the "softmax" and "label" blobs.
train_model = model_helper.ModelHelper("train_net")
AddPredictNet(train_model, PREDICT_NET)
# Only the conv10 weights and bias are registered as learnable here.
AddInitNet(train_model, INIT_NET, params_to_learn=["conv10_w", "conv10_b"]) # Use None to learn everything.
AddTrainingOperators(train_model, "softmax", "label")
In this version of Caffe2, operators with an empty device_option
run on the CPU. We have to manually remove the per-op device options and specify a global device_option
as shown below.
def SetDeviceOption(model, device_option):
    """Set a net-wide device option on both nets of ``model``.

    For ``model.net`` and ``model.param_init_net`` the net-level
    ``device_option`` is overwritten, the per-operator ``device_option`` and
    ``engine`` fields are cleared, and the net is rebuilt from the edited
    proto. Operators whose first output is ``optimizer_iteration`` or
    ``iteration_mutex`` keep their own settings.

    Args:
        model: Model helper whose nets are rewritten in place.
        device_option: ``DeviceOption`` proto copied into each net.
    """
    # Some operators are CPU-only; they are identified by their first output.
    cpu_only_outputs = ("optimizer_iteration", "iteration_mutex")
    for attr_name in ("net", "param_init_net"):
        proto = getattr(model, attr_name).Proto()
        proto.device_option.CopyFrom(device_option)
        for operator in proto.op:
            if operator.output[0] in cpu_only_outputs:
                continue
            # Clear op-specific settings so the global device option applies.
            operator.ClearField("device_option")
            operator.ClearField("engine")
        setattr(model, attr_name, core.Net(proto))
# Build a device option selecting CUDA device 0 and apply it to both nets of
# the training model.
device_option = caffe2_pb2.DeviceOption()
device_option.device_type = caffe2_pb2.CUDA
device_option.cuda_gpu_id = 0
SetDeviceOption(train_model, device_option)
Now that we have specified the fine-tuning model, let's start the training process on the GPU.
workspace.ResetWorkspace()
# Initialization.
train_dataset = DogsCatsDataset("train")
# Feed a single sample and stop; presumably this makes the "data" and "label"
# blobs exist before the nets are run/created below - confirm.
for image, label in train_dataset.read(batch_size=1):
    workspace.FeedBlob("data", image, device_option=device_option)
    workspace.FeedBlob("label", label, device_option=device_option)
    break
# Run the initializers once, then instantiate the training net.
workspace.RunNetOnce(train_model.param_init_net)
workspace.CreateNet(train_model.net, overwrite=True)
# Main loop.
batch_size = 50
print_freq = 50  # report every print_freq-th batch
losses = []  # per-iteration loss history, plotted afterwards
for epoch in range(5):
    for index, (image, label) in enumerate(train_dataset.read(batch_size)):
        # Feed the next batch, run one step of the training net, and read
        # back the metrics it produced.
        workspace.FeedBlob("data", image, device_option=device_option)
        workspace.FeedBlob("label", label, device_option=device_option)
        workspace.RunNet(train_model.net)
        accuracy = float(workspace.FetchBlob("accuracy"))
        loss = workspace.FetchBlob("loss").mean()
        losses.append(loss)
        if index % print_freq == 0:
            print("[{}][{}/{}] loss={}, accuracy={}".format(
                epoch, index, int(len(train_dataset) / batch_size),
                loss, accuracy))
/usr/local/lib/python2.7/dist-packages/skimage/transform/_warps.py:84: UserWarning: The default mode, 'constant', will be changed to 'reflect' in skimage 0.15. warn("The default mode, 'constant', will be changed to 'reflect' in "
[0][0/500] loss=1.15868854523, accuracy=0.600000023842 [0][50/500] loss=0.10267546773, accuracy=0.959999978542 [0][100/500] loss=0.0946362018585, accuracy=0.959999978542 [0][150/500] loss=0.131896346807, accuracy=0.939999997616 [0][200/500] loss=0.0796595662832, accuracy=0.959999978542 [0][250/500] loss=0.0999855622649, accuracy=0.939999997616 [0][300/500] loss=0.08723885566, accuracy=0.980000019073 [0][350/500] loss=0.0652658939362, accuracy=0.959999978542 [0][400/500] loss=0.0490036420524, accuracy=0.980000019073 [0][450/500] loss=0.0513261109591, accuracy=0.959999978542 [1][0/500] loss=0.109811611474, accuracy=0.959999978542 [1][50/500] loss=0.17768985033, accuracy=0.899999976158 [1][100/500] loss=0.070137001574, accuracy=0.959999978542 [1][150/500] loss=0.103964686394, accuracy=0.959999978542 [1][200/500] loss=0.0276749264449, accuracy=0.980000019073 [1][250/500] loss=0.0539846345782, accuracy=0.959999978542 [1][300/500] loss=0.0995031148195, accuracy=0.959999978542 [1][350/500] loss=0.154732093215, accuracy=0.920000016689 [1][400/500] loss=0.0426763668656, accuracy=0.980000019073 [1][450/500] loss=0.0599067881703, accuracy=0.959999978542 [2][0/500] loss=0.0479127429426, accuracy=0.980000019073 [2][50/500] loss=0.124013155699, accuracy=0.939999997616 [2][100/500] loss=0.107514634728, accuracy=0.959999978542 [2][150/500] loss=0.0175920911133, accuracy=1.0 [2][200/500] loss=0.0359073840082, accuracy=0.980000019073 [2][250/500] loss=0.0313472747803, accuracy=0.980000019073 [2][300/500] loss=0.0128663843498, accuracy=1.0 [2][350/500] loss=0.0937282890081, accuracy=0.959999978542 [2][400/500] loss=0.115684136748, accuracy=0.959999978542 [2][450/500] loss=0.134855091572, accuracy=0.920000016689 [3][0/500] loss=0.0344010330737, accuracy=0.980000019073 [3][50/500] loss=0.0307683553547, accuracy=0.980000019073 [3][100/500] loss=0.0360927656293, accuracy=1.0 [3][150/500] loss=0.105807900429, accuracy=0.959999978542 [3][200/500] loss=0.0475012473762, accuracy=1.0 
[3][250/500] loss=0.0839904695749, accuracy=0.920000016689 [3][300/500] loss=0.0396303012967, accuracy=1.0 [3][350/500] loss=0.108292654157, accuracy=0.920000016689 [3][400/500] loss=0.042269334197, accuracy=0.959999978542 [3][450/500] loss=0.120973356068, accuracy=0.959999978542 [4][0/500] loss=0.0672254338861, accuracy=0.980000019073 [4][50/500] loss=0.0240190029144, accuracy=0.980000019073 [4][100/500] loss=0.155671566725, accuracy=0.959999978542 [4][150/500] loss=0.00950713548809, accuracy=1.0 [4][200/500] loss=0.0494051575661, accuracy=0.980000019073 [4][250/500] loss=0.0336142256856, accuracy=0.980000019073 [4][300/500] loss=0.0153067773208, accuracy=1.0 [4][350/500] loss=0.0327255986631, accuracy=1.0 [4][400/500] loss=0.0851453989744, accuracy=0.939999997616 [4][450/500] loss=0.0748727470636, accuracy=0.980000019073
import matplotlib.pyplot as plt
# IPython magic (not plain Python): render figures inside the notebook.
%matplotlib inline
# Plot the loss recorded at every training iteration.
plt.plot(losses)
plt.xlabel("iterations")
plt.ylabel("loss")
plt.grid("on")
Once fine-tuning is finished, we can apply the learned model to a new input. First instantiate a deploy model.
# Build an inference-only model from the same predict net and move it to the
# selected device. No init net is loaded here; presumably the net reads the
# parameter blobs left in the workspace by training - confirm.
deploy_model = model_helper.ModelHelper("deploy_net")
AddPredictNet(deploy_model, PREDICT_NET)
SetDeviceOption(deploy_model, device_option)
We can pre-process and feed a test image to the deploy model. The prediction result is given as probabilities in NxC format, where C is in ["dog", "cat"]
order.
# Classify the first image of the "test1" split.
test_dataset = DogsCatsDataset("test1")
image, _ = test_dataset[0]  # the label is not used here
image = image[np.newaxis, :]  # add a leading batch axis
workspace.FeedBlob("data", image, device_option=device_option)
workspace.RunNetOnce(deploy_model.net)
result = workspace.FetchBlob("softmax")[0]  # first (only) row of the output
skimage.io.imshow(test_dataset.image_files[0])
# Index 0 is "dog" and index 1 is "cat", matching DogsCatsDataset.categories.
print("dog={:g} cat={:g}".format(result[0], result[1]))
dog=8.07451e-09 cat=1