from IPython.display import Image
from IPython.core.display import HTML
Image(url="http://dividiti.com/dvdt/a_eq_dvdt_1000.png")
Image(url="http://taxistartup.com/wp-content/uploads/2015/03/UK-Self-Driving-Cars.jpg")
Image(url="http://image.slidesharecdn.com/pydatatalk-150729202131-lva1-app6892/95/deep-learning-with-python-pydata-seattle-2015-35-638.jpg?cb=1438315555")
Image(url="http://images.cdn.autocar.co.uk/sites/autocar.co.uk/files/styles/gallery_slide/public/audi-rs7-driverless-005.jpg?itok=A-GlUErw")
Image(url="https://1.bp.blogspot.com/-aQw-r1FZcQk/VyINpA8ntxI/AAAAAAAAPF4/o34l1MvKJVQTuLD1qsv5Ink-04Dra0PDgCLcB/s1600/Movidius%2BFathom-1.JPG")
This Jupyter Notebook compares the performance (execution time, memory consumption) of several Caffe libraries and CNN models on dividiti's Jetson TX1 board (official page, Phoronix review):
$ uname -a
Linux tegra-ubuntu 3.10.96-tegra #1 SMP PREEMPT Wed Nov 9 19:42:57 PST 2016 aarch64 aarch64 aarch64 GNU/Linux
$ cat /etc/lsb-release
DISTRIB_ID=Ubuntu
DISTRIB_RELEASE=16.04
DISTRIB_CODENAME=xenial
DISTRIB_DESCRIPTION="Ubuntu 16.04.1 LTS"
using 8 Caffe libraries (each listed as tag: branch (revision hash, date), with math libraries):

cpu: Master (24d2f67, 28/Nov/2016), with OpenBLAS 0.2.19
cuda: Master (24d2f67, 28/Nov/2016), with cuBLAS (part of CUDA Toolkit 8.0.33)
cudnn: Master (24d2f67, 28/Nov/2016), with cuDNN 5.1
libdnn-cuda: OpenCL (b735c2d, 23/Nov/2016), with libDNN and cuBLAS (NB: not yet tuned for TX1; uses optimal parameters for GTX 1080)
nvidia-cuda: NVIDIA v0.15 (1024d34, 17/Nov/2016), with cuBLAS (part of CUDA Toolkit 8.0.33)
nvidia-cudnn: NVIDIA v0.15 (1024d34, 17/Nov/2016), with cuDNN 5.1
nvidia-fp16-cuda: NVIDIA experimental/fp16 (fca1cf4, 11/Jul/2016), with cuBLAS (part of CUDA Toolkit 8.0.33)
nvidia-fp16-cudnn: NVIDIA experimental/fp16 (fca1cf4, 11/Jul/2016), with cuDNN 5.1

and 4 CNN models:

bvlc-alexnet (AlexNet)
bvlc-googlenet (GoogleNet)
deepscale-squeezenet-1.0 (SqueezeNet 1.0)
deepscale-squeezenet-1.1 (SqueezeNet 1.1)

with the batch size varying from 2 to 16 in steps of 2 (the full experiment grid is sketched below).
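As a quick sanity check, the experiment grid described above can be enumerated directly. This is a minimal sketch: the tag lists simply restate the tables above, and the names libs_grid, models_grid and batch_sizes_grid are our own (chosen so as not to clash with the batch_sizes variable derived from the data later on).
# Sketch of the experiment grid: 8 libs x 4 models x 8 batch sizes = 256 runs.
libs_grid = ['cpu', 'cuda', 'cudnn', 'libdnn-cuda', 'nvidia-cuda', 'nvidia-cudnn', 'nvidia-fp16-cuda', 'nvidia-fp16-cudnn']
models_grid = ['bvlc-alexnet', 'bvlc-googlenet', 'deepscale-squeezenet-1.0', 'deepscale-squeezenet-1.1']
batch_sizes_grid = list(range(2, 17, 2)) # [2, 4, 6, 8, 10, 12, 14, 16]
print ('%d experiments in total' % (len(libs_grid) * len(models_grid) * len(batch_sizes_grid)))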
fw = [ 'forward' ]
fwbw = [ 'forward', 'backward' ]
# Set to fw for inference; to fwbw for training.
direction = fw
direction
if direction==fw:
time_ms = 'time_fw_ms'
else: # direction==fwbw
time_ms = 'time_fwbw_ms'
time_ms
def images_per_second(time_in_milliseconds):
return 1000.0 / time_in_milliseconds
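For example, a mean execution time of 20 ms per image corresponds to 50 images per second:
images_per_second(20.0) # -> 50.0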
NB: Please ignore this section if you are not interested in re-running or modifying this notebook.
import os
import sys
import json
import re
If some of the scientific packages are missing, please install them using:
# pip install jupyter pandas numpy matplotlib
import IPython as ip
import pandas as pd
import numpy as np
import matplotlib as mp
print ('IPython version: %s' % ip.__version__)
print ('Pandas version: %s' % pd.__version__)
print ('NumPy version: %s' % np.__version__)
print ('Matplotlib version: %s' % mp.__version__)
import matplotlib.pyplot as plt
from matplotlib import cm
%matplotlib inline
If CK is not installed, please install it using:
# pip install ck
import ck.kernel as ck
print ('CK version: %s' % ck.__version__)
pretty_print_libs = {
'cpu': '[CPU] OpenBLAS',
'libdnn-cuda': '[GPU] libDNN-fp32',
'nvidia-cuda': '[GPU] cuBLAS-fp32',
'nvidia-fp16-cuda': '[GPU] cuBLAS-fp16',
'nvidia-cudnn': '[GPU] cuDNN-fp32',
'nvidia-fp16-cudnn':'[GPU] cuDNN-fp16'
}
pretty_print_models = {
'bvlc-alexnet':'AlexNet',
'bvlc-googlenet':'GoogleNet',
'deepscale-squeezenet-1.0':'SqueezeNet 1.0',
'deepscale-squeezenet-1.1':'SqueezeNet 1.1'
}
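# NB: despite the name, the entries below are (pretty-printed) libs, not models;
# the list fixes the column order of the images/s and speedup bar plots.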
speedup_sort_models = [
'[CPU] OpenBLAS',
'[GPU] libDNN-fp32',
'[GPU] cuBLAS-fp32',
'[GPU] cuBLAS-fp16',
'[GPU] cuDNN-fp32',
'[GPU] cuDNN-fp16'
]
def get_experimental_results(repo_uoa, tags):
module_uoa = 'experiment'
r = ck.access({'action':'search', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'tags':tags})
if r['return']>0:
print ("Error: %s" % r['error'])
exit(1)
experiments = r['lst']
dfs = []
for experiment in experiments:
data_uoa = experiment['data_uoa']
r = ck.access({'action':'list_points', 'repo_uoa':repo_uoa, 'module_uoa':module_uoa, 'data_uoa':data_uoa})
if r['return']>0:
print ("Error: %s" % r['error'])
exit(1)
# Get (lib_tag, model_tag) from the list of tags in r['dict']['tags'].
# Each experiment's tags include two of the four irrelevant tags below, a model tag and a lib tag.
# NB: Since it's easier to list all model tags than all lib tags, the latter list is not explicitly specified.
tags = r['dict']['tags']
irrelevant_tags = [ 'explore-batch-size-libs-models','time_gpu','time_cpu','time_gpu_fp16' ]
model_tags = [ 'bvlc-alexnet','bvlc-googlenet','deepscale-squeezenet-1.0','deepscale-squeezenet-1.1' ]
lib_model_tags = [ tag for tag in tags if tag not in irrelevant_tags ]
model_tags = [ tag for tag in lib_model_tags if tag in model_tags ]
lib_tags = [ tag for tag in lib_model_tags if tag not in model_tags ]
if len(lib_tags)==1 and len(model_tags)==1:
(lib, model) = (lib_tags[0], model_tags[0])
else:
continue
for point in r['points']:
with open(os.path.join(r['path'], 'ckp-%s.0001.json' % point)) as point_file:
point_data_raw = json.load(point_file)
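# NB: each point file stores a 'characteristics_list' with one entry per
# statistical repetition of the run (cf. 'statistical_repetitions' below).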
# Obtain column data.
characteristics = [
{
'time (ms)' : characteristics['run'].get(time_ms,+1e9), # "positive infinity"
'memory (MB)' : characteristics['run'].get('memory_mbytes',-1),
'success?' : characteristics['run'].get('run_success','n/a'),
'per layer info' : characteristics['run'].get('per_layer_info',[])
}
for characteristics in point_data_raw['characteristics_list']
]
# Deal with missing column data (resulting from failed runs).
if len(characteristics)==1:
repetitions = point_data_raw['features'].get('statistical_repetitions',1)
characteristics = characteristics * repetitions
# Construct a DataFrame.
df = pd.DataFrame(characteristics)
# Set columns and index names.
df.columns.name = 'run characteristic'
df.index.name = 'repetition'
# Set indices.
if lib=='tensorrt-1.0.0':
enable_fp16 = (point_data_raw['choices']['env']['CK_TENSORRT_ENABLE_FP16'] != 0)
df['lib'] = 'tensorrt-fp%d' % (16 if enable_fp16 else 32)
else:
df['lib'] = lib
df['model'] = model
df['batch size'] = point_data_raw['choices']['env']['CK_CAFFE_BATCH_SIZE']
df = df.set_index(['lib', 'model', 'batch size'], append=True)
df = df.reorder_levels(('model', 'lib', 'batch size', 'repetition'))
# Append to the list of similarly constructed DataFrames.
dfs.append(df)
# Concatenate all constructed DataFrames (i.e. stack on top of each other).
result = pd.concat(dfs)
return result.sortlevel(result.index.names)
def plot(mean, std, title='Execution time per image (ms)', ymax=0, rot=0):
ymax = mean.max().max() if ymax==0 else ymax
mean.plot(yerr=std, ylim=[0,ymax*1.05], title=title,
kind='bar', rot=rot, figsize=[16, 8], grid=True, legend=True, colormap=cm.autumn)
# ['cuda', 'cudnn'] are roughly equivalent to ['nvidia-cuda', 'nvidia-cudnn'], so can be dropped.
def plot_max_num_images_per_second(df_mean_time_per_image, libs_to_drop=['cuda', 'cudnn'], rot=0):
min_time_per_image = df_mean_time_per_image.min(axis=1).unstack('lib')
max_num_images_per_second = images_per_second(min_time_per_image) \
.drop(libs_to_drop, axis=1) \
.rename(columns=pretty_print_libs, index=pretty_print_models) \
.reindex(columns=speedup_sort_models)
ax = max_num_images_per_second \
.plot(title='Images/s (with the best even batch size between 2 and 16)', kind='bar',
figsize=[16, 8], width=0.95, rot=rot, grid=True, legend=True, colormap=cm.autumn)
for patch in ax.patches:
ax.annotate(str(int(patch.get_height()+0.5)), (patch.get_x()*1.00, patch.get_height()*1.01))
# ['cuda', 'cudnn'] are roughly equivalent to ['nvidia-cuda', 'nvidia-cudnn'], so can be dropped.
def plot_speedup_over_baseline(df_mean_time_per_image, baseline='cpu', libs_to_drop=['cuda', 'cudnn'], rot=0):
speedup_over_baseline = df_mean_time_per_image.min(axis=1).unstack('model').ix[baseline] / \
df_mean_time_per_image.min(axis=1).unstack('model')
speedup_over_baseline = speedup_over_baseline.T \
.drop(libs_to_drop, axis=1) \
.rename(columns=pretty_print_libs, index=pretty_print_models) \
.reindex(columns=speedup_sort_models)
ax = speedup_over_baseline \
.plot(title='Speedup over the given baseline (%s)' % pretty_print_libs[baseline], kind='bar',
figsize=[16, 8], width=0.95, rot=rot, grid=True, legend=True, colormap=cm.autumn)
for patch in ax.patches:
ax.annotate('{0:.2f}'.format(patch.get_height())[0:4], (patch.get_x()*1.00, patch.get_height()*1.01))
# This transformation is time consuming, so call it only once and reuse the result for multiple plots.
def get_per_layer_info(df_all):
df_per_layer_info = df_all['per layer info']
row_dfs = []
for row_id, row_info in enumerate(df_per_layer_info):
# Skip constructing a DataFrame when no layer info is available.
if not row_info: continue
# Augment each layer info with the row index: (model, lib, batch size, repetition).
for layer_info in row_info:
layer_info.update({ k : v for k, v in zip(df_per_layer_info.index.names, df_per_layer_info.index[row_id]) })
# Construct a DataFrame and move the row index to where it belongs.
row_df = pd.DataFrame(data=row_info).set_index(df_per_layer_info.index.names)
row_dfs.append(row_df)
return pd.concat(row_dfs)
def plot_time_per_image_per_layer(df_per_layer_info, model, libs, batch_sizes,
direction=['forward'], lower=0.0, upper=1.0, ymax=0, rot=90):
df_time_per_batch = df_per_layer_info.loc[model, libs, batch_sizes] \
.set_index(['direction', 'label'], append=True) \
.reorder_levels(['direction', 'label', 'model', 'lib', 'batch size', 'repetition' ]) \
.ix[direction] \
.reorder_levels(['label', 'model', 'lib', 'batch size', 'repetition', 'direction' ]) \
.groupby(level=['label', 'model', 'lib', 'batch size', 'repetition']).sum() \
['time_ms']
df_time_per_image = df_time_per_batch.unstack('batch size') / batch_sizes
df = df_time_per_image.unstack(['lib', 'model'])
df = df.reorder_levels(['model', 'lib', 'batch size'], axis=1)
mean = df.groupby(level='label').mean()
std = df.groupby(level='label').std()
select = (lower*mean.sum() <= mean).any(axis=1) & (mean <= upper*mean.sum()).any(axis=1)
ymax = mean[select].max().max() if ymax==0 else ymax
plot(mean=mean[select], std=std[select], title='Execution time per image per layer (ms)', ymax=ymax, rot=rot)
# The ideal adaptive solution selects, for each layer, the best performing library from the 'libs_for_adaptation' list.
# FIXME: add batch_sizes as an explicit parameter (this function currently relies on the global batch_sizes defined below).
def get_ideal_adaptive_solution(df_per_layer_info, libs_for_adaptation, direction):
df_for_adaptation = df_per_layer_info \
.set_index(['direction', 'label'], append=True) \
.reorder_levels(['direction', 'lib', 'model', 'label', 'batch size', 'repetition']) \
.ix[direction] \
.reorder_levels(['lib', 'model', 'label', 'batch size', 'repetition', 'direction']) \
.ix[libs_for_adaptation] \
.reorder_levels(['model', 'label', 'lib', 'batch size', 'repetition', 'direction']) \
['time_ms']
# With every step, reduce the rightmost dimension until the min time per model is reached.
df_cum_time_per_repetition = df_for_adaptation.groupby(level=df_for_adaptation.index.names[:-1]).sum()
df_min_time_per_repetition = df_cum_time_per_repetition.groupby(level=df_cum_time_per_repetition.index.names[:-1]).min()
df_min_time_per_batch = df_min_time_per_repetition.unstack('batch size') / batch_sizes
df_min_time_per_image = df_min_time_per_batch.min(axis=1)
df_min_time_per_layer = df_min_time_per_image.groupby(level=df_min_time_per_image.index.names[:-1]).min()
#df_min_time_per_model = df_min_time_per_layer.groupby(level=df_min_time_per_layer.index.names[:-1]).sum()
# Transform to get the models in the index and the libs in the columns.
df_min_time_per_layer_idx = df_min_time_per_image.groupby(level=df_min_time_per_image.index.names[:-1]).idxmin()
df_ideal = df_min_time_per_image[df_min_time_per_layer_idx] \
.reorder_levels(['model', 'lib', 'label']) \
.groupby(level=['model', 'lib']).sum() \
.unstack('lib')
# Sort in the order of increasing time per model.
df_ideal_sorted = df_ideal.ix[df_ideal.sum(axis=1).sort_values(ascending=True).index]
return df_ideal_sorted
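To make the adaptive idea concrete, here is a toy illustration with made-up per-layer times (hypothetical numbers, not measured data): picking the faster library per layer beats committing to either library for the whole network.
# Toy illustration (hypothetical numbers): per-layer times (ms) for two libs.
toy = pd.DataFrame({'libA': [1.0, 4.0, 2.0], 'libB': [2.0, 3.0, 3.0]}, index=['conv1', 'conv2', 'fc'])
# Per-layer minimum: 1.0 + 3.0 + 2.0 = 6.0 ms, vs 7.0 ms (libA only) or 8.0 ms (libB only).
toy.min(axis=1).sum() # -> 6.0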
def plot_ideal_adaptive_solution(df_ideal, df_real, tag=""):
figsize=[15, 3]
if tag: figsize=[10, 2] # good for dumping png (e.g. 3 graphs fit well onto a slide).
for model in df_ideal.index:
df_data = {}; df_data['adaptive'] = df_ideal.ix[model]
for lib in df_ideal.columns:
df_data[lib] = pd.Series(index=df_ideal.columns)
df_data[lib][lib] = df_real.ix[model, lib]
df = pd.DataFrame(df_data).T \
.rename(index={'cpu': 'OpenBLAS only', 'nvidia-cuda':'cuBLAS only', 'nvidia-cudnn':'cuDNN only', 'libdnn-cuda': 'libDNN only'},
columns={'cpu': 'OpenBLAS', 'nvidia-cuda':'cuBLAS', 'nvidia-cudnn':'cuDNN', 'libdnn-cuda': 'libDNN'})
ax = df.ix[df.sum(axis=1).sort_values(ascending=True).index] \
.plot(title='%s - execution time per image (ms)' % model, kind='barh', stacked=True,
grid=True, legend=True, colormap=cm.summer_r, figsize=figsize, width=0.9) \
.legend(loc='lower right')
if tag: ax.get_figure().savefig('%s.%s.png' % (tag, model))
def plot_time_per_image_and_memory_consumption(df_all, model, lib):
df = df_all[['time (ms)', 'memory (MB)']] \
.groupby(level=df_all.index.names[:-1]).mean() \
.loc[model, lib]
df['time per image (ms)'] = df['time (ms)'].divide(df.index, axis=0)
df['memory per image (MB)'] = df['memory (MB)'].divide(df.index, axis=0)
df = df.drop('time (ms)', axis=1).sortlevel(axis=1)
ax = df.plot(secondary_y=['memory (MB)', 'memory per image (MB)'], title='%s w/ %s' % (model, lib),
figsize=[12, 8], mark_right=False, colormap=cm.winter, grid=True)
ax.set_ylabel('execution time (ms)'); ax.legend(loc='center left'); ax.set_ylim(0)
ax.right_ax.set_ylabel('memory consumption (MB)'); ax.right_ax.legend(loc='center right')
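Once the experimental data has been loaded into df_all (see the next section), this helper can be invoked, for example, as follows (a hypothetical model/lib pair; any combination from the tables above works):
# plot_time_per_image_and_memory_consumption(df_all, model='bvlc-googlenet', lib='nvidia-cudnn')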
NB: Please ignore this section if you are not interested in re-running or modifying this notebook.
The experimental data was collected on the platform described above (after installing all the Caffe libraries and models of interest) as follows:
$ cd `ck find ck-caffe:script:explore-batch-size-libs-models`
$ python explore-batch-size-libs-models-benchmark.py
It can be downloaded from GitHub via CK as follows:
$ ck pull repo:ck-caffe-nvidia-tx1 --url=https://github.com/dividiti/ck-caffe-nvidia-tx1
alexnet_accuracy = (0.568279, 0.799501)
squeezenet_1_0_accuracy = (0.576801, 0.803903)
squeezenet_1_1_accuracy = (0.58388, 0.810123)
googlenet_accuracy = (0.689299, 0.891441)
df_accuracy = pd.DataFrame(
columns=[['Accuracy, %']*2, ['Top 1', 'Top 5']],
data=[alexnet_accuracy, squeezenet_1_0_accuracy, squeezenet_1_1_accuracy, googlenet_accuracy],
index=['AlexNet', 'SqueezeNet 1.0', 'SqueezeNet 1.1', 'GoogleNet']
)
df_accuracy
df_all = get_experimental_results(repo_uoa='ck-caffe-nvidia-tx1', tags='explore-batch-size-libs-models')
df_time = df_all['time (ms)'].unstack(df_all.index.names[:-1])
df_mean_time_per_batch = df_time.describe().ix['mean'].unstack(level='batch size')
batch_sizes = df_mean_time_per_batch.columns.tolist()
# batch_sizes
df_mean_time_per_image = df_mean_time_per_batch / batch_sizes
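# (The division above converts per-batch times to per-image times: e.g., with
# made-up numbers, 40 ms for a batch of 8 images is 40 / 8 = 5 ms per image.)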
df_mean_time_per_image.min(axis=1)
plot_max_num_images_per_second(df_mean_time_per_image, libs_to_drop=[])
# What is the batch size that gives the minimum time per image (or the maximum number of images per second)?
df_mean_time_per_image.idxmin(axis=1)
# Focus on e.g. nvidia-fp16-cuda, for which the batch size of 16 is not always the best.
df_mean_time_per_image.idxmin(axis=1).reorder_levels(['lib', 'model']).loc['nvidia-fp16-cuda']
# Tile batch_sizes across every (model, lib) column group to convert per-batch times to per-image times.
df_time_per_image = df_time / (batch_sizes*(len(df_time.columns)//len(batch_sizes)))
df_min_time_per_image_index = pd.DataFrame(df_mean_time_per_image.idxmin(axis=1)).set_index(0, append=True).index.values
df_model_lib = df_time_per_image[df_min_time_per_image_index] \
.stack(['model', 'lib']).reorder_levels(['model','lib','repetition']).sum(axis=1)
df_model_lib_mean = df_model_lib.groupby(level=['model', 'lib']).mean()
df_model_lib_std = df_model_lib.groupby(level=['model', 'lib']).std()
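# Mean times above 1e5 ms can only come from the "positive infinity" placeholders
# that get_experimental_results assigns to failed runs; zero them out so that they
# appear as empty bars rather than distorting the plot scale.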
zero_positive_infinity = df_model_lib_mean > 1e5
df_model_lib_mean[zero_positive_infinity] = 0
df_accuracy['Accuracy, %'].T \
.plot(title='Prediction accuracy on the ImageNet validation set (50,000 images)',
kind='bar', rot=0, ylim=[0,1], figsize=[20, 8], grid=True, legend=True, colormap=cm.autumn, fontsize=16)
mean = df_model_lib_mean.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
std = df_model_lib_std.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
plot(mean, std)
mean = df_model_lib_mean.unstack('lib').drop('cpu', axis=1) \
.rename(columns=pretty_print_libs, index=pretty_print_models)
std = df_model_lib_std.unstack('lib').drop('cpu', axis=1) \
.rename(columns=pretty_print_libs, index=pretty_print_models)
plot(mean, std)
cuda_level_performance = ['nvidia-cuda', 'nvidia-cudnn', 'libdnn-cuda']
mean = df_model_lib_mean.reorder_levels(['lib', 'model'])[cuda_level_performance] \
.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
std = df_model_lib_std.reorder_levels(['lib', 'model'])[cuda_level_performance] \
.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
plot(mean, std)
cublas_libs = ['nvidia-cuda', 'nvidia-fp16-cuda']
mean = df_model_lib_mean.reorder_levels(['lib', 'model'])[cublas_libs] \
.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
std = df_model_lib_std.reorder_levels(['lib', 'model'])[cublas_libs] \
.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
plot(mean, std)
# With cuBLAS, NVIDIA's fp16 branch is up to 20% faster than NVIDIA's fp32 mainline.
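# (The ratio below is time_fp16 / time_fp32, so e.g. a value of 0.8 means fp16 is ~20% faster.)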
nvidia_fp16_cuda_vs_nvidia_fp32_cuda = mean['[GPU] cuBLAS-fp16'] / mean['[GPU] cuBLAS-fp32']
nvidia_fp16_cuda_vs_nvidia_fp32_cuda
cudnn_libs = ['nvidia-cudnn', 'nvidia-fp16-cudnn']
mean = df_model_lib_mean.reorder_levels(['lib', 'model'])[cudnn_libs] \
.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
std = df_model_lib_std.reorder_levels(['lib', 'model'])[cudnn_libs] \
.unstack('lib').rename(columns=pretty_print_libs, index=pretty_print_models)
plot(mean, std)
# With cuDNN, NVIDIA's fp16 branch is up to 35% (roughly one third) faster than NVIDIA's fp32 mainline.
nvidia_fp16_cudnn_vs_nvidia_fp32_cudnn = mean['[GPU] cuDNN-fp16'] / mean['[GPU] cuDNN-fp32']
nvidia_fp16_cudnn_vs_nvidia_fp32_cudnn
mean = df_model_lib_mean.unstack('model').rename(index=pretty_print_libs, columns=pretty_print_models)
std = df_model_lib_std.unstack('model').rename(index=pretty_print_libs, columns=pretty_print_models)
plot(mean, std)
mean = df_model_lib_mean.unstack('model').drop('cpu', axis=0) \
.rename(index=pretty_print_libs, columns=pretty_print_models)
std = df_model_lib_std.unstack('model').drop('cpu', axis=0) \
.rename(index=pretty_print_libs, columns=pretty_print_models)
plot(mean, std)
alexnet_level_accuracy = ['bvlc-alexnet','deepscale-squeezenet-1.0','deepscale-squeezenet-1.1']
# On this platform, with all the libraries, SqueezeNet 1.0 is always slower than AlexNet,
# despite a roughly 50x reduction in model size (~5 MB vs. ~250 MB of weights).
mean = df_model_lib_mean[alexnet_level_accuracy].unstack('model')
std = df_model_lib_std[alexnet_level_accuracy].unstack('model')
plot(mean, std, rot=10)
# SqueezeNet 1.1 is 46% faster than AlexNet with OpenBLAS (on the CPU).
mean = df_model_lib_mean[alexnet_level_accuracy].unstack('model').ix[['cpu']] \
.rename(index=pretty_print_libs, columns=pretty_print_models)
std = df_model_lib_std[alexnet_level_accuracy].unstack('model').ix[['cpu']] \
.rename(index=pretty_print_libs, columns=pretty_print_models)
plot(mean, std)
mean['SqueezeNet 1.1'] / mean['AlexNet']
# SqueezeNet 1.0 is slower than AlexNet. SqueezeNet 1.1 is 28% faster than AlexNet with
# libDNN-CUDA, and roughly equivalent to AlexNet with cuBLAS and cuDNN.
mean = df_model_lib_mean[alexnet_level_accuracy].unstack('model').ix[cuda_level_performance] \
.rename(index=pretty_print_libs, columns=pretty_print_models)
std = df_model_lib_std[alexnet_level_accuracy].unstack('model').ix[cuda_level_performance] \
.rename(index=pretty_print_libs, columns=pretty_print_models)
plot(mean, std)
mean['SqueezeNet 1.1'] / mean['AlexNet']
df_per_layer_info = get_per_layer_info(df_all)
# pd.options.display.max_columns = len(df_per_layer_info.columns)
# pd.options.display.max_rows = len(df_per_layer_info.index)
# df_per_layer_info
# Plot for a list of batch sizes.
# NB: This suggests that the batch size of 16 is better than 14 for the fully connected layers fc6, fc7, fc8.
plot_time_per_image_per_layer(df_per_layer_info, model='bvlc-alexnet', libs='nvidia-cudnn',
batch_sizes=[14, 16], direction=direction)
# Plot for a list of batch sizes. Only plot layers that consume at least 10% of the total execution time.
plot_time_per_image_per_layer(df_per_layer_info, model='bvlc-alexnet', libs='nvidia-cudnn',
batch_sizes=[8, 16], direction=direction, lower=0.10)
# Plot for a list of libs.
# NB: cuDNN and cuBLAS perform about the same on the fully connected layers (which suggests that
# cuDNN falls back to cuBLAS for these).
# Unsurprisingly, cuDNN performs better than cuBLAS on the convolution layers.
# Surprisingly, cuBLAS performs a bit better than cuDNN on the relu layers.
plot_time_per_image_per_layer(df_per_layer_info, model='bvlc-alexnet', libs=['nvidia-cuda','nvidia-cudnn'],
batch_sizes=16, direction=direction)
# Plot for a list of libs.
# NB: This suggests that libDNN is faster than cuDNN on the conv1 and expand1x1 layers, but slower on the squeeze1x1,
# expand3x3, conv/pool10 layers. (Recall that libDNN is not yet tuned for TX1 but uses parameters optimal for GTX 1080.)
plot_time_per_image_per_layer(df_per_layer_info, model='deepscale-squeezenet-1.1', libs=['nvidia-cudnn', 'libdnn-cuda'],
batch_sizes=16, direction=direction, ymax=0.65)