In 3.0: YOLO Object Detection Algorithm Concepts and Introduction, we covered the key concepts behind the YOLO algorithm and how it differs from other object detection approaches. This article walks step by step through using basic-yolo-keras to retrain and fine-tune a Darknet-pretrained model to detect a person's moving hands (Hands).
basic-yolo-keras is a Keras implementation (with a TensorFlow backend) of the YOLOv2 algorithm. It supports training YOLOv2 networks with different backbone architectures, such as MobileNet and InceptionV3.
Because the original project, experiencor/basic-yolo-keras, documents its implementation and retraining procedure mainly in English, it can be hard for learners just getting started with deep learning to follow. This companion project therefore focuses on localizing the documentation into Chinese, while also adjusting and revising the source code to make it easier to use and learn from.
git clone https://github.com/erhwenkuo/basic-yolo-keras.git
cd basic-yolo-keras
pip install numpy h5py pillow
pip install tensorflow-gpu # CPU-only: conda install -c conda-forge tensorflow
pip install keras # Possibly older release: conda install keras
...
The VIVA hand detection dataset consists of 2D bounding boxes around the hands of drivers and passengers, collected from 54 videos of naturalistic driving scenarios that include hand movements and common occlusions. The dataset contains hand images from 7 possible viewpoints (including a first-person view). Some of the data was captured on the project's test bed, and some comes from YouTube.
Dataset website: http://cvrr.ucsd.edu/vivachallenge/index.php/hands/hand-detection/
In the basic-yolo-keras directory, create a data subdirectory and a data/hands subdirectory, then download LISA_HD_Static.zip. Unzip LISA_HD_Static.zip and copy its detectiondata\train and detectiondata\test directories into basic-yolo-keras/data/hands. Your directory structure should end up looking like this (only the files and directories used in this example are listed):
basic-yolo-keras/
├── xxxx.ipynb
├── yolo.weights
├── backend.py
├── preprocessing.py
├── utils.py
├── font/
│   └── FiraMono-Medium.otf
└── data/
    └── hands/
        ├── train/
        │   ├── pos/    <--- training images
        │   └── posGt/  <--- annotation files
        └── test/
            └── pos/    <--- validation images
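Before moving on, it can be worth a quick sanity check (run from the basic-yolo-keras root) that the expected directories are in place; a minimal sketch:
# Optional sanity check: verify the dataset directories exist before training
import os

for path in ("data/hands/train/pos", "data/hands/train/posGt", "data/hands/test/pos"):
    print(path, "->", "OK" if os.path.isdir(path) else "MISSING")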
# Utility libraries
import os
import random
from tqdm import tqdm
# Multi-dimensional array processing
import numpy as np
# Make Keras/TensorFlow run on the CPU only
# (an empty CUDA_VISIBLE_DEVICES hides all GPUs; set it to e.g. "0" to train on the first GPU)
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Image-processing libraries
import cv2
import imgaug as ia
from imgaug import augmenters as iaa
import colorsys
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw, ImageFont
%matplotlib inline
# Serialization/deserialization
import pickle
# Deep-learning libraries
from keras.models import Sequential, Model
from keras.layers import Reshape, Activation, Conv2D, Input, MaxPooling2D, BatchNormalization, Flatten, Dense, Lambda
from keras.layers.advanced_activations import LeakyReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras.optimizers import SGD, Adam, RMSprop
from keras.layers.merge import concatenate
import keras.backend as K
import tensorflow as tf
# Project-specific modules
from preprocessing import parse_annotation, BatchGenerator
from utils import WeightReader, decode_netout, draw_boxes, normalize
from utils import draw_bgr_image_boxes, draw_rgb_image_boxes, draw_pil_image_boxes
Using TensorFlow backend.
# Project root directory
ROOT_DIR = os.getcwd()
# Directory for the training/validation data
DATA_PATH = os.path.join(ROOT_DIR, "data")
# Dataset directory
DATA_SET_PATH = os.path.join(DATA_PATH, "hands")
TRAIN_DATA_PATH = os.path.join(DATA_SET_PATH, "train")
TRAIN_IMGS_PATH = os.path.join(TRAIN_DATA_PATH, "pos")
TRAIN_ANNOTATION_PATH = os.path.join(TRAIN_DATA_PATH, "posGt")
There are 4 classes in this image dataset:
leftHand_driver
rightHand_driver
leftHand_passenger
rightHand_passenger
For this training run, though, we only need to distinguish left hands from right hands:
# Label encoding for the image classes
map_categories = {0: 'left_hand', 1: 'right_hand'}

# Get the list of all class labels
labels = list(map_categories.values())
print(labels)
['left_hand', 'right_hand']
LABELS = labels              # list of object classes
IMAGE_H, IMAGE_W = 416, 416  # height/width of the model input image
GRID_H, GRID_W = 13, 13      # number of grid cells vertically/horizontally
BOX = 5                      # number of bounding boxes predicted per grid cell
CLASS = len(LABELS)          # number of object classes
CLASS_WEIGHTS = np.ones(CLASS, dtype='float32')  # per-class loss weights
OBJ_THRESHOLD = 0.5          # objectness confidence threshold
NMS_THRESHOLD = 0.45         # non-maximum suppression (NMS) threshold, see https://chenzomi12.github.io/2016/12/14/YOLO-nms/
ANCHORS = [0.57273, 0.677385, 1.87446, 2.06253, 3.33843, 5.47434, 7.88282, 3.52778, 9.77052, 9.16828]  # anchor priors as (width, height) pairs in grid-cell units
NO_OBJECT_SCALE = 1.0        # loss scale for boxes with no object
OBJECT_SCALE = 5.0           # loss scale for boxes containing an object
COORD_SCALE = 1.0            # loss scale for box coordinates
CLASS_SCALE = 1.0            # loss scale for classification
BATCH_SIZE = 16              # training batch size
WARM_UP_BATCHES = 0          # number of warm-up batches
TRUE_BOX_BUFFER = 50         # maximum number of ground-truth boxes per image
wt_path = 'yolo.weights'
train_image_folder = TRAIN_IMGS_PATH
train_annot_folder = TRAIN_ANNOTATION_PATH

# Note: for simplicity, this example reuses the training set for validation
valid_image_folder = TRAIN_IMGS_PATH
valid_annot_folder = TRAIN_ANNOTATION_PATH
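To make OBJ_THRESHOLD and NMS_THRESHOLD concrete: after thresholding on objectness, non-maximum suppression keeps the highest-scoring box among overlapping detections and drops any box whose IoU with it exceeds NMS_THRESHOLD. The following is a minimal sketch of that idea (the actual decoding used in this article lives in decode_netout in utils.py):
# A minimal NMS sketch: boxes are (xmin, ymin, xmax, ymax) tuples
import numpy as np

def iou(box_a, box_b):
    # intersection-over-union of two boxes
    inter_w = max(0.0, min(box_a[2], box_b[2]) - max(box_a[0], box_b[0]))
    inter_h = max(0.0, min(box_a[3], box_b[3]) - max(box_a[1], box_b[1]))
    inter = inter_w * inter_h
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-9)

def simple_nms(boxes, scores, threshold=0.45):
    # keep the best-scoring box, drop boxes overlapping it by more than the threshold
    order = list(np.argsort(scores)[::-1])
    keep = []
    while order:
        best = order.pop(0)
        keep.append(best)
        order = [i for i in order if iou(boxes[best], boxes[i]) < threshold]
    return keep

# Two heavily overlapping detections: only the stronger one survives
print(simple_nms([(0, 0, 10, 10), (1, 1, 11, 11)], [0.9, 0.8]))  # -> [0]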
# the function to implement the reorg layer (thanks to github.com/allanzelener/YAD2K)
def space_to_depth_x2(x):
    return tf.space_to_depth(x, block_size=2)
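What this reorg layer does to the skip connection's shape can be checked directly; a quick sketch using the same TF 1.x API as the rest of this notebook. Each 2x2 spatial block is moved into the channel dimension, so the 26x26x64 skip tensor becomes 13x13x256 and can be concatenated with the 13x13 main path:
# Quick shape check of tf.space_to_depth with block_size=2
check = tf.space_to_depth(tf.zeros((1, 26, 26, 64)), block_size=2)
print(check.shape)  # (1, 13, 13, 256)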
input_image = Input(shape=(IMAGE_H, IMAGE_W, 3))
true_boxes = Input(shape=(1, 1, 1, TRUE_BOX_BUFFER , 4))
# Layer 1
x = Conv2D(32, (3,3), strides=(1,1), padding='same', name='conv_1', use_bias=False)(input_image)
x = BatchNormalization(name='norm_1')(x)
x = LeakyReLU(alpha=0.1)(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Layer 2
x = Conv2D(64, (3,3), strides=(1,1), padding='same', name='conv_2', use_bias=False)(x)
x = BatchNormalization(name='norm_2')(x)
x = LeakyReLU(alpha=0.1)(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Layer 3
x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_3', use_bias=False)(x)
x = BatchNormalization(name='norm_3')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 4
x = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_4', use_bias=False)(x)
x = BatchNormalization(name='norm_4')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 5
x = Conv2D(128, (3,3), strides=(1,1), padding='same', name='conv_5', use_bias=False)(x)
x = BatchNormalization(name='norm_5')(x)
x = LeakyReLU(alpha=0.1)(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Layer 6
x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_6', use_bias=False)(x)
x = BatchNormalization(name='norm_6')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 7
x = Conv2D(128, (1,1), strides=(1,1), padding='same', name='conv_7', use_bias=False)(x)
x = BatchNormalization(name='norm_7')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 8
x = Conv2D(256, (3,3), strides=(1,1), padding='same', name='conv_8', use_bias=False)(x)
x = BatchNormalization(name='norm_8')(x)
x = LeakyReLU(alpha=0.1)(x)
x = MaxPooling2D(pool_size=(2, 2))(x)
# Layer 9
x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_9', use_bias=False)(x)
x = BatchNormalization(name='norm_9')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 10
x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_10', use_bias=False)(x)
x = BatchNormalization(name='norm_10')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 11
x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_11', use_bias=False)(x)
x = BatchNormalization(name='norm_11')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 12
x = Conv2D(256, (1,1), strides=(1,1), padding='same', name='conv_12', use_bias=False)(x)
x = BatchNormalization(name='norm_12')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 13
x = Conv2D(512, (3,3), strides=(1,1), padding='same', name='conv_13', use_bias=False)(x)
x = BatchNormalization(name='norm_13')(x)
x = LeakyReLU(alpha=0.1)(x)
skip_connection = x
x = MaxPooling2D(pool_size=(2, 2))(x)
# Layer 14
x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_14', use_bias=False)(x)
x = BatchNormalization(name='norm_14')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 15
x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_15', use_bias=False)(x)
x = BatchNormalization(name='norm_15')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 16
x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_16', use_bias=False)(x)
x = BatchNormalization(name='norm_16')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 17
x = Conv2D(512, (1,1), strides=(1,1), padding='same', name='conv_17', use_bias=False)(x)
x = BatchNormalization(name='norm_17')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 18
x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_18', use_bias=False)(x)
x = BatchNormalization(name='norm_18')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 19
x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_19', use_bias=False)(x)
x = BatchNormalization(name='norm_19')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 20
x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_20', use_bias=False)(x)
x = BatchNormalization(name='norm_20')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 21
skip_connection = Conv2D(64, (1,1), strides=(1,1), padding='same', name='conv_21', use_bias=False)(skip_connection)
skip_connection = BatchNormalization(name='norm_21')(skip_connection)
skip_connection = LeakyReLU(alpha=0.1)(skip_connection)
skip_connection = Lambda(space_to_depth_x2)(skip_connection)
x = concatenate([skip_connection, x])
# Layer 22
x = Conv2D(1024, (3,3), strides=(1,1), padding='same', name='conv_22', use_bias=False)(x)
x = BatchNormalization(name='norm_22')(x)
x = LeakyReLU(alpha=0.1)(x)
# Layer 23
x = Conv2D(BOX * (4 + 1 + CLASS), (1,1), strides=(1,1), padding='same', name='conv_23')(x)
output = Reshape((GRID_H, GRID_W, BOX, 4 + 1 + CLASS))(x)
# small hack to allow true_boxes to be registered when Keras builds the model
# for more information: https://github.com/fchollet/keras/issues/2790
output = Lambda(lambda args: args[0])([output, true_boxes])
model = Model([input_image, true_boxes], output)
model.summary() # print the model structure
__________________________________________________________________________________________________
Layer (type)                     Output Shape           Param #     Connected to
==================================================================================================
input_1 (InputLayer)             (None, 416, 416, 3)    0
conv_1 (Conv2D)                  (None, 416, 416, 32)   864         input_1[0][0]
norm_1 (BatchNormalization)      (None, 416, 416, 32)   128         conv_1[0][0]
leaky_re_lu_1 (LeakyReLU)        (None, 416, 416, 32)   0           norm_1[0][0]
max_pooling2d_1 (MaxPooling2D)   (None, 208, 208, 32)   0           leaky_re_lu_1[0][0]
conv_2 (Conv2D)                  (None, 208, 208, 64)   18432       max_pooling2d_1[0][0]
norm_2 (BatchNormalization)      (None, 208, 208, 64)   256         conv_2[0][0]
leaky_re_lu_2 (LeakyReLU)        (None, 208, 208, 64)   0           norm_2[0][0]
max_pooling2d_2 (MaxPooling2D)   (None, 104, 104, 64)   0           leaky_re_lu_2[0][0]
conv_3 (Conv2D)                  (None, 104, 104, 128)  73728       max_pooling2d_2[0][0]
norm_3 (BatchNormalization)      (None, 104, 104, 128)  512         conv_3[0][0]
leaky_re_lu_3 (LeakyReLU)        (None, 104, 104, 128)  0           norm_3[0][0]
conv_4 (Conv2D)                  (None, 104, 104, 64)   8192        leaky_re_lu_3[0][0]
norm_4 (BatchNormalization)      (None, 104, 104, 64)   256         conv_4[0][0]
leaky_re_lu_4 (LeakyReLU)        (None, 104, 104, 64)   0           norm_4[0][0]
conv_5 (Conv2D)                  (None, 104, 104, 128)  73728       leaky_re_lu_4[0][0]
norm_5 (BatchNormalization)      (None, 104, 104, 128)  512         conv_5[0][0]
leaky_re_lu_5 (LeakyReLU)        (None, 104, 104, 128)  0           norm_5[0][0]
max_pooling2d_3 (MaxPooling2D)   (None, 52, 52, 128)    0           leaky_re_lu_5[0][0]
conv_6 (Conv2D)                  (None, 52, 52, 256)    294912      max_pooling2d_3[0][0]
norm_6 (BatchNormalization)      (None, 52, 52, 256)    1024        conv_6[0][0]
leaky_re_lu_6 (LeakyReLU)        (None, 52, 52, 256)    0           norm_6[0][0]
conv_7 (Conv2D)                  (None, 52, 52, 128)    32768       leaky_re_lu_6[0][0]
norm_7 (BatchNormalization)      (None, 52, 52, 128)    512         conv_7[0][0]
leaky_re_lu_7 (LeakyReLU)        (None, 52, 52, 128)    0           norm_7[0][0]
conv_8 (Conv2D)                  (None, 52, 52, 256)    294912      leaky_re_lu_7[0][0]
norm_8 (BatchNormalization)      (None, 52, 52, 256)    1024        conv_8[0][0]
leaky_re_lu_8 (LeakyReLU)        (None, 52, 52, 256)    0           norm_8[0][0]
max_pooling2d_4 (MaxPooling2D)   (None, 26, 26, 256)    0           leaky_re_lu_8[0][0]
conv_9 (Conv2D)                  (None, 26, 26, 512)    1179648     max_pooling2d_4[0][0]
norm_9 (BatchNormalization)      (None, 26, 26, 512)    2048        conv_9[0][0]
leaky_re_lu_9 (LeakyReLU)        (None, 26, 26, 512)    0           norm_9[0][0]
conv_10 (Conv2D)                 (None, 26, 26, 256)    131072      leaky_re_lu_9[0][0]
norm_10 (BatchNormalization)     (None, 26, 26, 256)    1024        conv_10[0][0]
leaky_re_lu_10 (LeakyReLU)       (None, 26, 26, 256)    0           norm_10[0][0]
conv_11 (Conv2D)                 (None, 26, 26, 512)    1179648     leaky_re_lu_10[0][0]
norm_11 (BatchNormalization)     (None, 26, 26, 512)    2048        conv_11[0][0]
leaky_re_lu_11 (LeakyReLU)       (None, 26, 26, 512)    0           norm_11[0][0]
conv_12 (Conv2D)                 (None, 26, 26, 256)    131072      leaky_re_lu_11[0][0]
norm_12 (BatchNormalization)     (None, 26, 26, 256)    1024        conv_12[0][0]
leaky_re_lu_12 (LeakyReLU)       (None, 26, 26, 256)    0           norm_12[0][0]
conv_13 (Conv2D)                 (None, 26, 26, 512)    1179648     leaky_re_lu_12[0][0]
norm_13 (BatchNormalization)     (None, 26, 26, 512)    2048        conv_13[0][0]
leaky_re_lu_13 (LeakyReLU)       (None, 26, 26, 512)    0           norm_13[0][0]
max_pooling2d_5 (MaxPooling2D)   (None, 13, 13, 512)    0           leaky_re_lu_13[0][0]
conv_14 (Conv2D)                 (None, 13, 13, 1024)   4718592     max_pooling2d_5[0][0]
norm_14 (BatchNormalization)     (None, 13, 13, 1024)   4096        conv_14[0][0]
leaky_re_lu_14 (LeakyReLU)       (None, 13, 13, 1024)   0           norm_14[0][0]
conv_15 (Conv2D)                 (None, 13, 13, 512)    524288      leaky_re_lu_14[0][0]
norm_15 (BatchNormalization)     (None, 13, 13, 512)    2048        conv_15[0][0]
leaky_re_lu_15 (LeakyReLU)       (None, 13, 13, 512)    0           norm_15[0][0]
conv_16 (Conv2D)                 (None, 13, 13, 1024)   4718592     leaky_re_lu_15[0][0]
norm_16 (BatchNormalization)     (None, 13, 13, 1024)   4096        conv_16[0][0]
leaky_re_lu_16 (LeakyReLU)       (None, 13, 13, 1024)   0           norm_16[0][0]
conv_17 (Conv2D)                 (None, 13, 13, 512)    524288      leaky_re_lu_16[0][0]
norm_17 (BatchNormalization)     (None, 13, 13, 512)    2048        conv_17[0][0]
leaky_re_lu_17 (LeakyReLU)       (None, 13, 13, 512)    0           norm_17[0][0]
conv_18 (Conv2D)                 (None, 13, 13, 1024)   4718592     leaky_re_lu_17[0][0]
norm_18 (BatchNormalization)     (None, 13, 13, 1024)   4096        conv_18[0][0]
leaky_re_lu_18 (LeakyReLU)       (None, 13, 13, 1024)   0           norm_18[0][0]
conv_19 (Conv2D)                 (None, 13, 13, 1024)   9437184     leaky_re_lu_18[0][0]
norm_19 (BatchNormalization)     (None, 13, 13, 1024)   4096        conv_19[0][0]
conv_21 (Conv2D)                 (None, 26, 26, 64)     32768       leaky_re_lu_13[0][0]
leaky_re_lu_19 (LeakyReLU)       (None, 13, 13, 1024)   0           norm_19[0][0]
norm_21 (BatchNormalization)     (None, 26, 26, 64)     256         conv_21[0][0]
conv_20 (Conv2D)                 (None, 13, 13, 1024)   9437184     leaky_re_lu_19[0][0]
leaky_re_lu_21 (LeakyReLU)       (None, 26, 26, 64)     0           norm_21[0][0]
norm_20 (BatchNormalization)     (None, 13, 13, 1024)   4096        conv_20[0][0]
lambda_1 (Lambda)                (None, 13, 13, 256)    0           leaky_re_lu_21[0][0]
leaky_re_lu_20 (LeakyReLU)       (None, 13, 13, 1024)   0           norm_20[0][0]
concatenate_1 (Concatenate)      (None, 13, 13, 1280)   0           lambda_1[0][0]
                                                                    leaky_re_lu_20[0][0]
conv_22 (Conv2D)                 (None, 13, 13, 1024)   11796480    concatenate_1[0][0]
norm_22 (BatchNormalization)     (None, 13, 13, 1024)   4096        conv_22[0][0]
leaky_re_lu_22 (LeakyReLU)       (None, 13, 13, 1024)   0           norm_22[0][0]
conv_23 (Conv2D)                 (None, 13, 13, 35)     35875       leaky_re_lu_22[0][0]
reshape_1 (Reshape)              (None, 13, 13, 5, 7)   0           conv_23[0][0]
input_2 (InputLayer)             (None, 1, 1, 1, 50, 4) 0
lambda_2 (Lambda)                (None, 13, 13, 5, 7)   0           reshape_1[0][0]
                                                                    input_2[0][0]
==================================================================================================
Total params: 50,583,811
Trainable params: 50,563,139
Non-trainable params: 20,672
__________________________________________________________________________________________________
Load the weights originally provided by YOLO
weight_reader = WeightReader(wt_path) # reader for the Darknet pretrained weight file
weight_reader.reset()
nb_conv = 23 # there are 23 convolutional layers in total
for i in range(1, nb_conv+1):
conv_layer = model.get_layer('conv_' + str(i))
    # conv_1 through conv_22 each come paired with a BatchNormalization ("norm") layer; only conv_23 stands alone
if i < nb_conv:
print("handle norm_" + str(i) + " start")
        norm_layer = model.get_layer('norm_' + str(i)) # get the BatchNormalization layer
        size = np.prod(norm_layer.get_weights()[0].shape) # number of parameters in the BatchNormalization layer
print("shape: ", norm_layer.get_weights()[0].shape)
beta = weight_reader.read_bytes(size)
gamma = weight_reader.read_bytes(size)
mean = weight_reader.read_bytes(size)
var = weight_reader.read_bytes(size)
        norm_layer.set_weights([gamma, beta, mean, var])
print("handle norm_" + str(i) + " completed")
if len(conv_layer.get_weights()) > 1:
print("handle conv_" + str(i) + " start")
print("len:",len(conv_layer.get_weights()))
bias = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[1].shape))
kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))
kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))
kernel = kernel.transpose([2,3,1,0])
conv_layer.set_weights([kernel, bias])
print("handle conv_" + str(i) + " completed")
else:
print("handle conv_" + str(i) + " start")
kernel = weight_reader.read_bytes(np.prod(conv_layer.get_weights()[0].shape))
kernel = kernel.reshape(list(reversed(conv_layer.get_weights()[0].shape)))
kernel = kernel.transpose([2,3,1,0])
conv_layer.set_weights([kernel])
print("handle conv_" + str(i) + " completed")
handle norm_1 start
shape:  (32,)
handle norm_1 completed
handle conv_1 start
handle conv_1 completed
handle norm_2 start
shape:  (64,)
handle norm_2 completed
handle conv_2 start
handle conv_2 completed
... (the same start/completed pairs repeat for norm_3/conv_3 through norm_22/conv_22) ...
handle conv_23 start
len: 2
handle conv_23 completed
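As a sanity check, we can confirm how much of the weight file was consumed (a sketch that assumes the WeightReader in utils.py exposes the offset and all_weights attributes, as in the original experiencor project):
# Number of float32 values read vs. the total stored in yolo.weights (assumed attributes)
print("read {} of {} weight values".format(weight_reader.offset, len(weight_reader.all_weights)))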
Randomize weights of the last layer
In the YOLOv2 model, the last convolutional layer determines the final output, so we re-initialize and retrain this layer to fine-tune the pretrained model for our task. For the underlying concepts, see 1.5: Using Pre-trained Convolutional Network Models.
layer = model.layers[-4] # grab the last convolutional layer (conv_23)
weights = layer.get_weights()
new_kernel = np.random.normal(size=weights[0].shape)/(GRID_H*GRID_W)
new_bias = np.random.normal(size=weights[1].shape)/(GRID_H*GRID_W)
layer.set_weights([new_kernel, new_bias]) # re-initialize the weights
The loss function used to train YOLOv2:
def custom_loss(y_true, y_pred):
mask_shape = tf.shape(y_true)[:4]
cell_x = tf.to_float(tf.reshape(tf.tile(tf.range(GRID_W), [GRID_H]), (1, GRID_H, GRID_W, 1, 1)))
cell_y = tf.transpose(cell_x, (0,2,1,3,4))
cell_grid = tf.tile(tf.concat([cell_x,cell_y], -1), [BATCH_SIZE, 1, 1, 5, 1])
coord_mask = tf.zeros(mask_shape)
conf_mask = tf.zeros(mask_shape)
class_mask = tf.zeros(mask_shape)
seen = tf.Variable(0.)
total_recall = tf.Variable(0.)
"""
Adjust prediction
"""
### adjust x and y
pred_box_xy = tf.sigmoid(y_pred[..., :2]) + cell_grid
### adjust w and h
pred_box_wh = tf.exp(y_pred[..., 2:4]) * np.reshape(ANCHORS, [1,1,1,BOX,2])
### adjust confidence
pred_box_conf = tf.sigmoid(y_pred[..., 4])
### adjust class probabilities
pred_box_class = y_pred[..., 5:]
"""
Adjust ground truth
"""
### adjust x and y
true_box_xy = y_true[..., 0:2] # relative position to the containing cell
### adjust w and h
    true_box_wh = y_true[..., 2:4] # number of cells across, horizontally and vertically
### adjust confidence
true_wh_half = true_box_wh / 2.
true_mins = true_box_xy - true_wh_half
true_maxes = true_box_xy + true_wh_half
pred_wh_half = pred_box_wh / 2.
pred_mins = pred_box_xy - pred_wh_half
pred_maxes = pred_box_xy + pred_wh_half
intersect_mins = tf.maximum(pred_mins, true_mins)
intersect_maxes = tf.minimum(pred_maxes, true_maxes)
intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]
true_areas = true_box_wh[..., 0] * true_box_wh[..., 1]
pred_areas = pred_box_wh[..., 0] * pred_box_wh[..., 1]
union_areas = pred_areas + true_areas - intersect_areas
iou_scores = tf.truediv(intersect_areas, union_areas)
true_box_conf = iou_scores * y_true[..., 4]
### adjust class probabilities
true_box_class = tf.argmax(y_true[..., 5:], -1)
"""
Determine the masks
"""
### coordinate mask: simply the position of the ground truth boxes (the predictors)
coord_mask = tf.expand_dims(y_true[..., 4], axis=-1) * COORD_SCALE
    ### confidence mask: penalize the predictors + penalize boxes with low IOU
# penalize the confidence of the boxes, which have IOU with some ground truth box < 0.6
true_xy = true_boxes[..., 0:2]
true_wh = true_boxes[..., 2:4]
true_wh_half = true_wh / 2.
true_mins = true_xy - true_wh_half
true_maxes = true_xy + true_wh_half
pred_xy = tf.expand_dims(pred_box_xy, 4)
pred_wh = tf.expand_dims(pred_box_wh, 4)
pred_wh_half = pred_wh / 2.
pred_mins = pred_xy - pred_wh_half
pred_maxes = pred_xy + pred_wh_half
intersect_mins = tf.maximum(pred_mins, true_mins)
intersect_maxes = tf.minimum(pred_maxes, true_maxes)
intersect_wh = tf.maximum(intersect_maxes - intersect_mins, 0.)
intersect_areas = intersect_wh[..., 0] * intersect_wh[..., 1]
true_areas = true_wh[..., 0] * true_wh[..., 1]
pred_areas = pred_wh[..., 0] * pred_wh[..., 1]
union_areas = pred_areas + true_areas - intersect_areas
iou_scores = tf.truediv(intersect_areas, union_areas)
best_ious = tf.reduce_max(iou_scores, axis=4)
conf_mask = conf_mask + tf.to_float(best_ious < 0.6) * (1 - y_true[..., 4]) * NO_OBJECT_SCALE
    # penalize the confidence of the boxes which are responsible for corresponding ground truth boxes
conf_mask = conf_mask + y_true[..., 4] * OBJECT_SCALE
### class mask: simply the position of the ground truth boxes (the predictors)
class_mask = y_true[..., 4] * tf.gather(CLASS_WEIGHTS, true_box_class) * CLASS_SCALE
"""
Warm-up training
"""
no_boxes_mask = tf.to_float(coord_mask < COORD_SCALE/2.)
seen = tf.assign_add(seen, 1.)
true_box_xy, true_box_wh, coord_mask = tf.cond(tf.less(seen, WARM_UP_BATCHES),
lambda: [true_box_xy + (0.5 + cell_grid) * no_boxes_mask,
true_box_wh + tf.ones_like(true_box_wh) * np.reshape(ANCHORS, [1,1,1,BOX,2]) * no_boxes_mask,
tf.ones_like(coord_mask)],
lambda: [true_box_xy,
true_box_wh,
coord_mask])
"""
Finalize the loss
"""
nb_coord_box = tf.reduce_sum(tf.to_float(coord_mask > 0.0))
nb_conf_box = tf.reduce_sum(tf.to_float(conf_mask > 0.0))
nb_class_box = tf.reduce_sum(tf.to_float(class_mask > 0.0))
loss_xy = tf.reduce_sum(tf.square(true_box_xy-pred_box_xy) * coord_mask) / (nb_coord_box + 1e-6) / 2.
loss_wh = tf.reduce_sum(tf.square(true_box_wh-pred_box_wh) * coord_mask) / (nb_coord_box + 1e-6) / 2.
loss_conf = tf.reduce_sum(tf.square(true_box_conf-pred_box_conf) * conf_mask) / (nb_conf_box + 1e-6) / 2.
loss_class = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true_box_class, logits=pred_box_class)
loss_class = tf.reduce_sum(loss_class * class_mask) / (nb_class_box + 1e-6)
loss = loss_xy + loss_wh + loss_conf + loss_class
nb_true_box = tf.reduce_sum(y_true[..., 4])
nb_pred_box = tf.reduce_sum(tf.to_float(true_box_conf > 0.5) * tf.to_float(pred_box_conf > 0.3))
"""
Debugging code
"""
current_recall = nb_pred_box/(nb_true_box + 1e-6)
total_recall = tf.assign_add(total_recall, current_recall)
loss = tf.Print(loss, [tf.zeros((1))], message='Dummy Line \t', summarize=1000)
loss = tf.Print(loss, [loss_xy], message='Loss XY \t', summarize=1000)
loss = tf.Print(loss, [loss_wh], message='Loss WH \t', summarize=1000)
loss = tf.Print(loss, [loss_conf], message='Loss Conf \t', summarize=1000)
loss = tf.Print(loss, [loss_class], message='Loss Class \t', summarize=1000)
loss = tf.Print(loss, [loss], message='Total Loss \t', summarize=1000)
loss = tf.Print(loss, [current_recall], message='Current Recall \t', summarize=1000)
loss = tf.Print(loss, [total_recall/seen], message='Average Recall \t', summarize=1000)
return loss
Configuration for the BatchGenerator that feeds the Keras model during training:
generator_config = {
    'IMAGE_H'         : IMAGE_H,     # input image height for the YOLOv2 network
    'IMAGE_W'         : IMAGE_W,     # input image width for the YOLOv2 network
    'GRID_H'          : GRID_H,      # number of grid cells vertically
    'GRID_W'          : GRID_W,      # number of grid cells horizontally
    'BOX'             : BOX,         # number of bounding boxes predicted per grid cell
    'LABELS'          : LABELS,      # list of object classes to predict
    'CLASS'           : len(LABELS), # number of object classes
    'ANCHORS'         : ANCHORS,     # anchor priors used for the predicted bounding boxes
    'BATCH_SIZE'      : BATCH_SIZE,  # batch size during training
    'TRUE_BOX_BUFFER' : 50,          # maximum number of bounding boxes per training image
}
The annotation files of this dataset do not use the PASCAL VOC format but a custom one:
% bbGt version=3
leftHand_driver 87 295 57 67 0 0 0 0 0 0 0
rightHand_driver 223 283 62 64 0 0 0 0 0 0 0
leftHand_passenger 483 356 91 71 0 0 0 0 0 0 0
rightHand_passenger 548 328 86 70 0 0 0 0 0 0 0
so the annotation parsing has to be customized.
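For example, the line leftHand_driver 87 295 57 67 ... describes the driver's left hand with top-left corner (x, y) = (87, 295), width 57 and height 67; the parser below maps the class to left_hand and converts the box to xmin=87, ymin=295, xmax=87+57=144, ymax=295+67=362.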
from tqdm import tqdm
from PIL import Image
def parse_hands_annotation(ann_dir, img_dir, labels=[]):
"""解析圖像標註檔
根據手部標註檔存放的目錄路徑迭代地解析每一個標註檔,
將每個圖像的檔名(filename)、圖像的寬(width)、高(height)、圖像的類別(name)以
及物體的邊界框的坐標(xmin,ymin,xmax,ymax)擷取出來。以下是圖像標註檔的範例:
% bbGt version=3
leftHand_driver 87 295 57 67 0 0 0 0 0 0 0
rightHand_driver 223 283 62 64 0 0 0 0 0 0 0
leftHand_passenger 483 356 91 71 0 0 0 0 0 0 0
rightHand_passenger 548 328 86 70 0 0 0 0 0 0 0
擷取目標: [hands_class x y w h ...]
hands_class: leftHand_driver/leftHand_passenger: left_hand, rightHand_driver/rightHand_passenger: right_hand,
參數:
ann_dir: 圖像標註檔存放的目錄路徑
img_dir: 圖像檔存放的目錄路徑
labels: 圖像資料集的物體類別列表
回傳:
all_imgs: 一個列表物件, 每一個物件都包括了要訓練用的重要資訊。例如:
{
'filename': '/tmp/img/img001.jpg',
'width': 128,
'height': 128,
'object':[
{'name':'person',xmin:0, ymin:0, xmax:28, ymax:28},
{'name':'person',xmin:45, ymin:45, xmax:60, ymax:60}
]
}
seen_labels: 一個字典物件(k:圖像類別, v:出現的次數)用來檢視每一個圖像類別出現的次數
"""
print("start parsing annotation..")
    # mapping from the dataset's raw labels to our two target classes
hands_label_map = {'leftHand_driver':'left_hand', 'leftHand_passenger':'left_hand',
'rightHand_driver':'right_hand', 'rightHand_passenger':'right_hand'}
all_imgs = []
seen_labels = {}
    # iterate over every annotation file
for ann in tqdm(sorted(os.listdir(ann_dir))):
        img = {'object': []}

        # derive the matching image filename (same name, .png extension)
        img_filename = ann[0:len(ann)-3] + "png"

        # full path to the image file
        img['filename'] = os.path.join(img_dir, img_filename)
        im = Image.open(img['filename'])
        img_width, img_height = im.size

        # image size
        img['width'] = img_width
        img['height'] = img_height
        with open(os.path.join(ann_dir, ann), 'r') as fann:
            # process the annotation file line by line
            for cnt, line in enumerate(fann):
                # skip the header line (% bbGt version=3)
                if cnt == 0:
                    continue
                # build an object dict to hold one bounding box
obj = {}
tokens = line.split()
label = hands_label_map[tokens[0]]
bbox_x = int(tokens[1])
bbox_y = int(tokens[2])
bbox_w = int(tokens[3])
bbox_h = int(tokens[4])
#print("Line {}: {},{},{},{},{}".format(cnt, label, bbox_x, bbox_y, bbox_w, bbox_h))
obj['name'] = label
if obj['name'] in seen_labels:
seen_labels[obj['name']] += 1
else:
seen_labels[obj['name']] = 1
obj['xmin'] = bbox_x
obj['ymin'] = bbox_y
obj['xmax'] = bbox_x + bbox_w
obj['ymax'] = bbox_y + bbox_h
                # skip any object whose label is not in the given class list (labels)
if len(labels) > 0 and obj['name'] not in labels:
continue
else:
img['object'] += [obj]
        # add the img record to the list to return (only if it contains objects)
        if len(img['object']) > 0:
            all_imgs += [img]
print("Parsing annotation completed!")
print("Total: {} images processed.".format(len(all_imgs)))
return all_imgs, seen_labels
# Parse the training annotation files (using the custom VIVA-format parser defined above)
train_imgs, seen_train_labels = parse_hands_annotation(train_annot_folder, train_image_folder, labels=LABELS)

# Create a batch generator for training
train_batch = BatchGenerator(train_imgs, generator_config, norm=normalize)

# Parse the validation annotation files
valid_imgs, seen_valid_labels = parse_hands_annotation(valid_annot_folder, valid_image_folder, labels=LABELS)

# Create a batch generator for validation (jitter disabled: no augmentation)
valid_batch = BatchGenerator(valid_imgs, generator_config, norm=normalize, jitter=False)
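It can help to pull one batch and confirm the tensor shapes match the model's two inputs and its output (a sketch, assuming BatchGenerator follows the Keras Sequence protocol and returns [x_batch, b_batch], y_batch as in this project's preprocessing.py):
# Inspect the first training batch (assumed generator interface)
[x_batch, b_batch], y_batch = train_batch[0]
print(x_batch.shape)  # (BATCH_SIZE, IMAGE_H, IMAGE_W, 3): normalized images
print(b_batch.shape)  # (BATCH_SIZE, 1, 1, 1, TRUE_BOX_BUFFER, 4): ground-truth boxes
print(y_batch.shape)  # (BATCH_SIZE, GRID_H, GRID_W, BOX, 4 + 1 + CLASS): target output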
start parsing annotation..
100%|█████████████████████████████████████████████████████████████████████████████| 5500/5500 [00:15<00:00, 360.49it/s]
Parsing annotation completed!
Total: 5500 images processed.
start parsing annotation..
100%|█████████████████████████████████████████████████████████████████████████████| 5500/5500 [00:15<00:00, 363.95it/s]
Parsing annotation completed!
Total: 5500 images processed.
Set up some callbacks and start training
# stop training early if val_loss shows no improvement for 3 consecutive epochs
early_stop = EarlyStopping(monitor='val_loss',
min_delta=0.001,
patience=3,
mode='min',
verbose=1)
# after each epoch, save the model weights whenever val_loss improves
checkpoint = ModelCheckpoint('weights_hands.h5',
monitor='val_loss',
verbose=1,
save_best_only=True,
mode='min',
period=1)
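Optionally, the TensorBoard callback imported earlier can be added to monitor the loss curves during training; a minimal sketch (the log directory name here is arbitrary):
# Optional: write training logs for TensorBoard (log_dir is an arbitrary choice)
tensorboard = TensorBoard(log_dir='logs_hands', write_graph=True)
# then pass it along with the others, e.g. callbacks=[early_stop, checkpoint, tensorboard]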
Start the training
optimizer = Adam(lr=0.5e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
#optimizer = SGD(lr=1e-4, decay=0.0005, momentum=0.9)
#optimizer = RMSprop(lr=1e-4, rho=0.9, epsilon=1e-08, decay=0.0)
model.compile(loss=custom_loss, optimizer=optimizer)
history = model.fit_generator(generator = train_batch,
steps_per_epoch = len(train_batch),
epochs = 20, # more training epochs would likely improve results
verbose = 0,
validation_data = valid_batch,
validation_steps = len(valid_batch),
callbacks = [early_stop, checkpoint],
max_queue_size = 3)
Epoch 00001: val_loss improved from inf to 0.11776, saving model to weights_hands.h5
Epoch 00002: val_loss improved from 0.11776 to 0.05966, saving model to weights_hands.h5
Epoch 00003: val_loss improved from 0.05966 to 0.05382, saving model to weights_hands.h5
Epoch 00004: val_loss improved from 0.05382 to 0.04547, saving model to weights_hands.h5
Epoch 00005: val_loss improved from 0.04547 to 0.03713, saving model to weights_hands.h5
Epoch 00006: val_loss did not improve
Epoch 00007: val_loss improved from 0.03713 to 0.03611, saving model to weights_hands.h5
Epoch 00008: val_loss improved from 0.03611 to 0.03438, saving model to weights_hands.h5
Epoch 00009: val_loss improved from 0.03438 to 0.02191, saving model to weights_hands.h5
Epoch 00010: val_loss did not improve
Epoch 00011: val_loss improved from 0.02191 to 0.01899, saving model to weights_hands.h5
Epoch 00012: val_loss did not improve
Epoch 00013: val_loss did not improve
Epoch 00014: val_loss did not improve
Epoch 00014: early stopping
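The history object returned by fit_generator records the loss per epoch, so the training and validation curves can be plotted with the matplotlib import from earlier; a minimal sketch:
# Plot training vs. validation loss per epoch
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()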
# Load the trained model weights
model.load_weights("weights_hands.h5")

# Create a dummy ground-truth input.
# During training this input carries the real bounding boxes and class information,
# but a dummy is still needed at prediction time, because the network is
# defined with two inputs:
# 1. the image input
# 2. the bounding-box/anchor/confidence-score input (true_boxes)
dummy_array = np.zeros((1,1,1,1,TRUE_BOX_BUFFER,4))
# Pick a random image
img_filepath = train_imgs[np.random.randint(len(train_imgs))]['filename']

# Read the image with OpenCV
image = cv2.imread(img_filepath) # load the image
plt.figure(figsize=(10,10))

# Preprocess the image input
input_image = cv2.resize(image, (416, 416)) # resize to the model's expected input size
input_image = input_image / 255. # normalize pixel values to [0, 1]
input_image = np.expand_dims(input_image, 0) # add the batch dimension
# Run the detection
netout = model.predict([input_image, dummy_array])

# Decode the network output into the final list of bounding boxes
boxes = decode_netout(netout[0],
obj_threshold=OBJ_THRESHOLD,
nms_threshold=NMS_THRESHOLD,
anchors=ANCHORS,
nb_class=CLASS)
# "draw_bgr_image_boxes"
# 一個簡單把邊界框與預測結果打印到原始圖像(BGR)上的工具函式
# 參數: image 是image的numpy ndarray [h, w, channels(BGR)]
# boxes 是偵測的結果
# labels 是模型訓練的圖像類別列表
# 回傳: image 是image的numpy ndarray [h, w, channels(RGB)]
image = draw_bgr_image_boxes(image, boxes, labels=LABELS)
# Show the final result
plt.imshow(image)
plt.show()
# Load the trained model weights
model.load_weights("weights_hands.h5")

# Create a dummy ground-truth input.
# During training this input carries the real bounding boxes and class information,
# but a dummy is still needed at prediction time, because the network is
# defined with two inputs:
# 1. the image input
# 2. the bounding-box/anchor/confidence-score input (true_boxes)
dummy_array = np.zeros((1,1,1,1,TRUE_BOX_BUFFER,4))
# Video data directory
VIDEO_DATA_PATH = os.path.join(DATA_SET_PATH, "video")

# The video file to run hand detection on
# For this test I downloaded: https://www.youtube.com/watch?v=c0IykwK6zkY
video_inp = os.path.join(VIDEO_DATA_PATH, "cardriving.mp4")

# The output video file with the detection results
video_out = os.path.join(VIDEO_DATA_PATH, "cardriving-out.mp4")
# Open the video with OpenCV
video_reader = cv2.VideoCapture(video_inp)

# Get the basic video properties
nb_frames = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT)) # total number of frames
frame_h = int(video_reader.get(cv2.CAP_PROP_FRAME_HEIGHT)) # frame height
frame_w = int(video_reader.get(cv2.CAP_PROP_FRAME_WIDTH)) # frame width
# Set up the video writer
video_writer = cv2.VideoWriter(video_out,
cv2.VideoWriter_fourcc(*'XVID'),
50.0,
(frame_w, frame_h))
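One caveat in the writer setup above: the output frame rate is hard-coded to 50.0. If the source video runs at a different rate, it can be queried from the reader instead (a small sketch; fps is a variable introduced here for illustration):
# Use the source video's own frame rate for the writer (fall back to 50.0 if unknown)
fps = video_reader.get(cv2.CAP_PROP_FPS) or 50.0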
# Iterate over every frame and run detection
for i in tqdm(range(nb_frames)):
    ret, image = video_reader.read() # read one frame
    input_image = cv2.resize(image, (416, 416)) # resize to the model's expected input size
    input_image = input_image / 255. # normalize pixel values to [0, 1]
    input_image = np.expand_dims(input_image, 0) # add the batch dimension
    # Run the detection
    netout = model.predict([input_image, dummy_array])

    # Decode the network output into the final list of bounding boxes
boxes = decode_netout(netout[0],
obj_threshold=OBJ_THRESHOLD,
nms_threshold=NMS_THRESHOLD,
anchors=ANCHORS,
nb_class=CLASS)
# "draw_bgr_image_boxes"
# 一個簡單把邊界框與預測結果打印到原始圖像(BGR)上的工具函式
# 參數: image 是image的numpy ndarray [h, w, channels(BGR)]
# boxes 是偵測的結果
# labels 是模型訓練的圖像類別列表
# 回傳: image 是image的numpy ndarray [h, w, channels(RGB)]
image = draw_bgr_image_boxes(image, boxes, labels=LABELS)
    # Write the frame out with OpenCV
    video_writer.write(np.uint8(image[:,:,::-1])) # convert RGB -> BGR so OpenCV writes correct colors

video_reader.release() # release resources
video_writer.release() # release resources
100%|████████████████████████████████████████████████████████████████████████████| 17191/17191 [07:58<00:00, 35.91it/s]