The twitter world seems to have enjoyed the scikit-learn machine learning classifier comparison graph made by Surya Saha.
This notebook shows you how to make a Plotly version of that graph.
First import the modules required to run this notebook:
# For computations
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
# To replicate the matplotlib figure
import pylab as pl
from matplotlib.colors import ListedColormap
# To make the Plotly figure using Plotly Python API
import plotly.plotly as py # signing in with your credentials file
import plotly.tools as tls
from plotly.graph_objs import Figure, Data, Layout
from plotly.graph_objs import Scatter, Contour
from plotly.graph_objs import Marker, Contours, Font
from plotly.graph_objs import XAxis, YAxis, Annotation, Annotations
Next, define two functions, one to generate the classifiers and one to generate the datasets:
def make_classifiers():
    """Build the nine classifiers compared in this notebook.

    Returns
    -------
    (names, classifiers) : tuple of two parallel lists — a display label
        and the corresponding (unfitted) scikit-learn estimator for each.
    """
    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
             "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    # Estimator instances, in the same order as `names`
    classifiers = [KNeighborsClassifier(3),
                   SVC(kernel="linear", C=0.025),
                   SVC(gamma=2, C=1),
                   DecisionTreeClassifier(max_depth=5),
                   RandomForestClassifier(max_depth=5, n_estimators=10,
                                          max_features=1),
                   AdaBoostClassifier(),
                   GaussianNB(),
                   LDA(),
                   QDA()]
    return names, classifiers
def make_datasets(N):
    """Build the three 2-d toy datasets used for the comparison.

    Parameters
    ----------
    N : int
        Sample size of each dataset.

    Returns
    -------
    list of (X, y) tuples: moons, circles, and a linearly separable set.
    """
    # Custom linearly separable dataset: 2 informative features, then
    # jittered with fixed-seed uniform noise for reproducibility
    X, y = make_classification(n_samples=N, n_features=2, n_redundant=0,
                               n_informative=2, random_state=1,
                               n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    return [make_moons(n_samples=N, noise=0.3, random_state=0),
            make_circles(n_samples=N, noise=0.2, factor=0.5, random_state=1),
            linearly_separable]
First, let's reproduce the original matplotlib figure using Surya Saha's code.
# To display figure inside this notebook
%matplotlib inline
h = .02  # step size in the mesh (original value)
N = 100  # sample size (original value)
# Get classifiers and datasets
names, classifiers = make_classifiers()
datasets = make_datasets(N)
figure = pl.figure(figsize=(27, 9))
i = 1  # running subplot index (1-based, row-major in matplotlib)
# iterate over datasets (one figure row per dataset)
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    # mesh-grid limits: data extent plus a .5 margin on every side
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # just plot the dataset first (left-most column of the row)
    cm = pl.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points (semi-transparent to distinguish them)
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
    # iterate over classifiers (one subplot per classifier in this row)
    for name, clf in zip(names, classifiers):
        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            # classifiers without decision_function: use P(class 1) instead
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        # test-set accuracy printed in the bottom-right corner, no leading 0
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
figure.subplots_adjust(left=.02, right=.98)
#pl.show()
# Save a .png version
figure.savefig('ml-classifier-comp_matplotlib.png')
Making a Plotly graph is a declarative process.
For example, in matplotlib the x-axis limits are set by calling the set_xlim()
axis method, whereas in Plotly the x-axis limits are declared in a graph object
(XAxis in this case) along with all the other custom features for that particular x-axis.
For more on Plotly's Python API, take a look at our online documentation and User Guide.
First, define a few functions to help us build the graph objects:
# Scatter point colors, indexed by class label (0 -> red, 1 -> blue)
cm_bright = ['#FF0000', '#0000FF']
# Function to make Scatter graph object to plot the datasets' pts
def make_Scatter(sbplt_in, x_in, y_in, name_in, color_in, opacity_in):
    """Return a Scatter graph object bound to subplot `sbplt_in`.

    x_in, y_in : point coordinates; name_in : hover label;
    color_in / opacity_in : marker color(s) and opacity.
    """
    marker = Marker(color=color_in,      # marker color
                    opacity=opacity_in)  # marker opacity
    return Scatter(x=x_in,
                   y=y_in,
                   name=name_in,
                   mode='markers',              # draw marker points only
                   marker=marker,
                   xaxis='x%d' % sbplt_in,      # bind to subplot's x-axis
                   yaxis='y%d' % sbplt_in)      # bind to subplot's y-axis
# Color scale name for the contour plots (same RdBu map as the matplotlib figure)
cm_name = 'RdBu'
# Function to make Contour graph object to plot the 'decision boundary'
def make_Contour(sbplt_in, x_in, y_in, Z_in):
    """Return a Contour graph object (the decision surface) bound to
    subplot `sbplt_in`; Z_in holds f(x, y) over the (x_in, y_in) mesh."""
    return Contour(
        z=Z_in,                      # values to be contoured
        x=x_in,                      # mesh x coordinates
        y=y_in,                      # mesh y coordinates
        scl=cm_name,                 # color scale (or color map)
        reversescl=True,             # reverse the color scale
        opacity=0.8,                 # opacity of color scale
        showscale=False,             # don't show the color bar
        contours=Contours(showlines=False),  # filled contours, no lines
        xaxis='x%d' % sbplt_in,      # bind to subplot's x-axis
        yaxis='y%d' % sbplt_in)      # bind to subplot's y-axis
# Shared style options applied to every x- and y-axis in the grid
axis_style = dict(
    ticks='',             # no ticks
    showticklabels=False, # no tick labels
    showline=True,        # show axis frame
    mirror=True,          # show axes on both sides (bottom/top and left/right)
    showgrid=False,       # no grid lines
    zeroline=False)       # no thick line at x=y=0
# Function to make XAxis graph object
def make_XAxis(x_in):
    """Return an XAxis spanning the full range of x_in, with shared style."""
    axis = XAxis(range=[x_in.min(), x_in.max()])  # axis limits from the data
    axis.update(axis_style)                       # apply the shared style
    return axis
# Function to make YAxis graph object
def make_YAxis(y_in):
    """Return a YAxis spanning the full range of y_in, with shared style."""
    axis = YAxis(range=[y_in.min(), y_in.max()])  # axis limits from the data
    axis.update(axis_style)                       # apply the shared style
    return axis
# Function to make annotation at the bottom-right corner of every subplot
def make_score_anno(sbplt_in, x_in, y_in, score):
    """Return an Annotation showing the classifier's test score in the
    bottom-right corner of subplot `sbplt_in`."""
    score_text = ('%.2f' % score).lstrip('0')  # e.g. 0.95 -> '.95'
    return Annotation(
        text=score_text,
        x=x_in.max() - 0.95,      # x position (tweaked from the original)
        y=y_in.min() + 0,         # y position
        align='right',            # align text on the right
        font=Font(size=15),       # font size
        showarrow=False,          # no arrow
        xref='x%d' % sbplt_in,    # position relative to the subplot's x
        yref='y%d' % sbplt_in)    # and y axes
# Function to make annotation labelling each classifier (at top of each column)
def make_sbplt_anno(sbplt_in, x_in, y_in, name):
    """Return an Annotation placing the classifier `name` centered at the
    top of subplot `sbplt_in`."""
    return Annotation(
        text=name,                # text is the classifier name
        x=np.mean(x_in),          # horizontally centered on the data
        y=y_in[-1],               # at the top-most mesh row
        align='center',           # align text in the center
        font=Font(size=14),       # font size
        showarrow=False,          # no arrow
        xanchor='center',         # anchor at axis' center
        xref='x%d' % sbplt_in,    # position relative to the subplot's x
        yref='y%d' % sbplt_in)    # and y axes
Next, get the datasets.
We had to reduce the size of the mesh grid to allow web browsers to generate the Plotly graph in reasonable time. That said, when converted to a png, the Plotly graph with an 8-times-sparser mesh grid has approximately the same file size as the original png.
# Reduce the mesh resolution: every contour cell is rendered in the browser,
# so the original h=0.02 grid would be too slow for a Plotly graph.
res_factor = 8  # 1d resolution decrease (res_factor**2 is the 2d decrease)
h = h*res_factor  # step size in the Contour mesh (scales the h defined above)
N = 100  # sample size (original, scatter pts do not slow down plotting)
# Get classifiers and datasets
names, classifiers = make_classifiers()
datasets = make_datasets(N)
Next, generate a figure object with the desired subplot layout.
This figure object will package all other graph objects.
# Build the subplot grid: one row per dataset, one column per classifier
# plus a leading column for the raw dataset scatter
figure = tls.get_subplots(
    rows=len(datasets),
    columns=len(classifiers)+1,
    horizontal_spacing=0.01,  # gap between columns (figure fraction)
    vertical_spacing=0.05,    # gap between rows (figure fraction)
    print_grid=True)          # print the subplot index grid below
This is the format of your plot grid! [21] [22] [23] [24] [25] [26] [27] [28] [29] [30] [11] [12] [13] [14] [15] [16] [17] [18] [19] [20] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10]
The subplot indices in Plotly start at 1 and increase from left to right, bottom to top.
Next, add a few layout features:
my_width=1472  # width and
my_height=490  # height of the matplotlib figure
figure['layout'].update(showlegend=False,      # don't show legend
                        hovermode='closest',   # show info about closest point on hover
                        autosize=False,        # manual sizing
                        width=my_width,        # set figure's width and
                        height=my_height)      # height
title = 'Machine Learning classifier comparison, {}x smaller res.'.format(res_factor**2)
figure['layout'].update(title=title,  # add title
                        font= Font(family="Open Sans, sans-serif"))  # set font
# init. 'annotations' key so score/name annotations can be appended below
figure['layout']['annotations'] = Annotations([])
Now, loop through the datasets and the classifiers to fill in the figure object:
i = 1  # subplot counter (Plotly subplot indices start at 1)
# Iterate over datasets in reverse order: Plotly numbers subplots from the
# bottom row up, so reversing reproduces the original top-to-bottom layout.
for ds in datasets[::-1]:
    # preprocess dataset, split into training and test part
    X, _y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, _y, test_size=0.4)
    # mesh-grid limits: data extent plus a .5 margin on every side
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    x = np.arange(x_min, x_max, h)
    y = np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(x, y)
    # Make color list (one item per scatter pt, keyed by class label)
    cm_train = [cm_bright[yy_train] for yy_train in y_train]
    cm_test = [cm_bright[yy_test] for yy_test in y_test]
    # Append 'data' with Scatter objects (left column: the raw dataset)
    figure['data'] += [make_Scatter(i, X_train[:, 0], X_train[:, 1],
                                    'Training', cm_train, 1)]
    figure['data'] += [make_Scatter(i, X_test[:, 0], X_test[:, 1],
                                    'Test', cm_test, 0.6)]
    # Format subplot i's axes.
    # BUG FIX: the y-axis must be built with make_YAxis — the original
    # called make_XAxis(y) here, storing an XAxis object under a yaxis key
    # (compare the classifier loop below, which uses make_YAxis correctly).
    figure['layout'].update({'xaxis{}'.format(i): make_XAxis(x)})
    figure['layout'].update({'yaxis{}'.format(i): make_YAxis(y)})
    i += 1  # increment subplot counter
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            # classifiers without decision_function: use P(class 1) instead
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # Put the result into a color plot, append 'data' with Contour object
        Z = Z.reshape(xx.shape)
        figure['data'] += [make_Contour(i, x, y, Z)]
        # Plot also the training points and testing points, append 'data'
        figure['data'] += [make_Scatter(i, X_train[:, 0], X_train[:, 1],
                                        'Training', cm_train, 1)]
        figure['data'] += [make_Scatter(i, X_test[:, 0], X_test[:, 1],
                                        'Test', cm_test, 0.6)]
        # Format subplot i's axes
        figure['layout'].update({'xaxis{}'.format(i): make_XAxis(x)})
        figure['layout'].update({'yaxis{}'.format(i): make_YAxis(y)})
        # Add score annotation to 'layout'
        figure['layout']['annotations'] += [make_score_anno(i, x, y, score)]
        # Add the classifier-name annotation if on the top-most row.
        # The top row starts at (rows-1)*cols + 1; its first subplot is the
        # dataset scatter (no name), hence the strict '>'. With 3 datasets
        # and 9 classifiers this reproduces the original hard-coded i >= 22.
        if i > (len(datasets) - 1) * (len(classifiers) + 1) + 1:
            figure['layout']['annotations'] += [make_sbplt_anno(i, x, y, name)]
        i += 1  # increment subplot counter
The only thing left to do is to send the figure object to Plotly:
# Type in a filename and send figure object to Plotly
filename = 'ml-classifier-comp_r{}'.format(res_factor**2)
# py.iplot uploads the figure to the Plotly servers and embeds the
# resulting interactive graph in this notebook
py.iplot(figure, filename=filename,
         width=my_width, height=my_height)  # adjust notebook display width and height
Plotly graphs reside online.
To save a static copy of the above graph, run:
# Request a static image copy of the figure from the Plotly servers
py.image.save_as(figure, filename)
# Display the saved .png inside the notebook
from IPython.display import Image
Image('ml-classifier-comp_r64.png')
About Plotly
Big thanks to
# CSS styling within IPython notebook
from IPython.core.display import HTML
import urllib2
def css_styling():
    """Fetch the Plotly user-guide stylesheet and return it wrapped in an
    HTML display object, restyling this notebook when evaluated."""
    css_url = 'https://raw.githubusercontent.com/plotly/python-user-guide/master/custom.css'
    response = urllib2.urlopen(css_url)   # NOTE: Python 2 only (urllib2)
    return HTML(response.read())
css_styling()