The twitter world seems to have enjoyed the scikit-learn machine learning classifier comparison graph made by Surya Saha.
This notebook shows you how to make a Plotly version of that graph.
First import the modules required to run this notebook:
# For computations
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.lda import LDA
from sklearn.qda import QDA
# To replicate the matplotlib figure
import pylab as pl
from matplotlib.colors import ListedColormap
# To make the Plotly figure using Plotly Python API
import plotly.plotly as py # signing in with your credentials file
import plotly.tools as tls
from plotly.graph_objs import Figure, Data, Layout
from plotly.graph_objs import Scatter, Contour
from plotly.graph_objs import Marker, Contours, Font
from plotly.graph_objs import XAxis, YAxis, Annotation, Annotations
Next, define two functions, one to generate the classifiers and one to generate the datasets:
def make_classifiers():
    """Build the nine classifiers compared in this notebook.

    Returns
    -------
    (names, classifiers) : tuple of two parallel lists — a display label
        and the corresponding (unfitted) scikit-learn estimator for each.
    """
    names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
             "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"]
    # Estimator instances, in the same order as `names`
    classifiers = [KNeighborsClassifier(3),
                   SVC(kernel="linear", C=0.025),
                   SVC(gamma=2, C=1),
                   DecisionTreeClassifier(max_depth=5),
                   RandomForestClassifier(max_depth=5, n_estimators=10,
                                          max_features=1),
                   AdaBoostClassifier(),
                   GaussianNB(),
                   LDA(),
                   QDA()]
    return names, classifiers
def make_datasets(N):
    """Build the three 2-d toy datasets used for the comparison.

    Parameters
    ----------
    N : int
        Sample size of each dataset.

    Returns
    -------
    list of (X, y) tuples: moons, circles, and a linearly separable set.
    """
    # Custom linearly separable dataset: 2 informative features, then
    # jittered with fixed-seed uniform noise for reproducibility
    X, y = make_classification(n_samples=N, n_features=2, n_redundant=0,
                               n_informative=2, random_state=1,
                               n_clusters_per_class=1)
    rng = np.random.RandomState(2)
    X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)
    return [make_moons(n_samples=N, noise=0.3, random_state=0),
            make_circles(n_samples=N, noise=0.2, factor=0.5, random_state=1),
            linearly_separable]
First, let's reproduce the original matplotlib figure using Surya Saha's code.
# To display figure inside this notebook
%matplotlib inline
h = .02  # step size in the mesh (original value)
N = 100  # sample size (original value)
# Get classifiers and datasets
names, classifiers = make_classifiers()
datasets = make_datasets(N)
figure = pl.figure(figsize=(27, 9))
i = 1  # running subplot index (1-based, row-major in matplotlib)
# iterate over datasets (one figure row per dataset)
for ds in datasets:
    # preprocess dataset, split into training and test part
    X, y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)
    # mesh-grid limits: data extent plus a .5 margin on every side
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # just plot the dataset first (left-most column of the row)
    cm = pl.cm.RdBu
    cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
    # and testing points (semi-transparent to distinguish them)
    ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xticks(())
    ax.set_yticks(())
    i += 1
    # iterate over classifiers (one subplot per classifier in this row)
    for name, clf in zip(names, classifiers):
        ax = pl.subplot(len(datasets), len(classifiers) + 1, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            # classifiers without decision_function: use P(class 1) instead
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                   alpha=0.6)
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        ax.set_title(name)
        # test-set accuracy printed in the bottom-right corner, no leading 0
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        i += 1
figure.subplots_adjust(left=.02, right=.98)
#pl.show()
# Save a .png version
figure.savefig('ml-classifier-comp_matplotlib.png')
Making a Plotly graph is a declarative process.
For example, in matplotlib the x-axis limits are set by calling the set_xlim()
axis method, whereas in Plotly the x-axis limits are declared in a graph object
(XAxis in this case) along with all the other custom features for that particular x-axis.
For more on Plotly's Python API, take a look at our online documentation and User Guide.
First, define a few functions to help us build the graph objects:
# Scatter point colors, indexed by class label (0 -> red, 1 -> blue)
cm_bright = ['#FF0000', '#0000FF']
# Function to make Scatter graph object to plot the datasets' pts
def make_Scatter(sbplt_in, x_in, y_in, name_in, color_in, opacity_in):
    """Return a Scatter graph object bound to subplot `sbplt_in`.

    x_in, y_in : point coordinates; name_in : hover label;
    color_in / opacity_in : marker color(s) and opacity.
    """
    marker = Marker(color=color_in,      # marker color
                    opacity=opacity_in)  # marker opacity
    return Scatter(x=x_in,
                   y=y_in,
                   name=name_in,
                   mode='markers',              # draw marker points only
                   marker=marker,
                   xaxis='x%d' % sbplt_in,      # bind to subplot's x-axis
                   yaxis='y%d' % sbplt_in)      # bind to subplot's y-axis
# Color scale name for the contour plots (same RdBu map as the matplotlib figure)
cm_name = 'RdBu'
# Function to make Contour graph object to plot the 'decision boundary'
def make_Contour(sbplt_in, x_in, y_in, Z_in):
    """Return a Contour graph object (the decision surface) bound to
    subplot `sbplt_in`; Z_in holds f(x, y) over the (x_in, y_in) mesh."""
    return Contour(
        z=Z_in,                      # values to be contoured
        x=x_in,                      # mesh x coordinates
        y=y_in,                      # mesh y coordinates
        scl=cm_name,                 # color scale (or color map)
        reversescl=True,             # reverse the color scale
        opacity=0.8,                 # opacity of color scale
        showscale=False,             # don't show the color bar
        contours=Contours(showlines=False),  # filled contours, no lines
        xaxis='x%d' % sbplt_in,      # bind to subplot's x-axis
        yaxis='y%d' % sbplt_in)      # bind to subplot's y-axis
# Shared style options applied to every x- and y-axis in the grid
axis_style = dict(
    ticks='',             # no ticks
    showticklabels=False, # no tick labels
    showline=True,        # show axis frame
    mirror=True,          # show axes on both sides (bottom/top and left/right)
    showgrid=False,       # no grid lines
    zeroline=False)       # no thick line at x=y=0
# Function to make XAxis graph object
def make_XAxis(x_in):
    """Return an XAxis spanning the full range of x_in, with shared style."""
    axis = XAxis(range=[x_in.min(), x_in.max()])  # axis limits from the data
    axis.update(axis_style)                       # apply the shared style
    return axis
# Function to make YAxis graph object
def make_YAxis(y_in):
    """Return a YAxis spanning the full range of y_in, with shared style."""
    axis = YAxis(range=[y_in.min(), y_in.max()])  # axis limits from the data
    axis.update(axis_style)                       # apply the shared style
    return axis
# Function to make annotation at the bottom-right corner of every subplot
def make_score_anno(sbplt_in, x_in, y_in, score):
    """Return an Annotation showing the classifier's test score in the
    bottom-right corner of subplot `sbplt_in`."""
    score_text = ('%.2f' % score).lstrip('0')  # e.g. 0.95 -> '.95'
    return Annotation(
        text=score_text,
        x=x_in.max() - 0.95,      # x position (tweaked from the original)
        y=y_in.min() + 0,         # y position
        align='right',            # align text on the right
        font=Font(size=15),       # font size
        showarrow=False,          # no arrow
        xref='x%d' % sbplt_in,    # position relative to the subplot's x
        yref='y%d' % sbplt_in)    # and y axes
# Function to make annotation labelling each classifier (at top of each column)
def make_sbplt_anno(sbplt_in, x_in, y_in, name):
    """Return an Annotation placing the classifier `name` centered at the
    top of subplot `sbplt_in`."""
    return Annotation(
        text=name,                # text is the classifier name
        x=np.mean(x_in),          # horizontally centered on the data
        y=y_in[-1],               # at the top-most mesh row
        align='center',           # align text in the center
        font=Font(size=14),       # font size
        showarrow=False,          # no arrow
        xanchor='center',         # anchor at axis' center
        xref='x%d' % sbplt_in,    # position relative to the subplot's x
        yref='y%d' % sbplt_in)    # and y axes
Next, get the datasets.
We had to reduce the size of the mesh grid to allow web browsers to generate the Plotly graph in reasonable time. That said, when converted to a png, the Plotly graph with an 8-times-sparser mesh grid has approximately the same file size as the original png.
# Reduce the mesh resolution: every contour cell is rendered in the browser,
# so the original h=0.02 grid would be too slow for a Plotly graph.
res_factor = 8  # 1d resolution decrease (res_factor**2 is the 2d decrease)
h = h*res_factor  # step size in the Contour mesh (scales the h defined above)
N = 100  # sample size (original, scatter pts do not slow down plotting)
# Get classifiers and datasets
names, classifiers = make_classifiers()
datasets = make_datasets(N)
Next, generate a figure object with the desired subplot layout.
This figure object will package all other graph objects.
# Build the subplot grid: one row per dataset, one column per classifier
# plus a leading column for the raw dataset scatter
figure = tls.get_subplots(
    rows=len(datasets),
    columns=len(classifiers)+1,
    horizontal_spacing=0.01,  # gap between columns (figure fraction)
    vertical_spacing=0.05,    # gap between rows (figure fraction)
    print_grid=True)          # print the subplot index grid below
This is the format of your plot grid! [21] [22] [23] [24] [25] [26] [27] [28] [29] [30] [11] [12] [13] [14] [15] [16] [17] [18] [19] [20] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10]
The subplot indices in Plotly start at 1 and increase from left to right, bottom to top.
Next, add a few layout features:
my_width=1472  # width and
my_height=490  # height of the matplotlib figure
figure['layout'].update(showlegend=False,      # don't show legend
                        hovermode='closest',   # show info about closest point on hover
                        autosize=False,        # manual sizing
                        width=my_width,        # set figure's width and
                        height=my_height)      # height
title = 'Machine Learning classifier comparison, {}x smaller res.'.format(res_factor**2)
figure['layout'].update(title=title,  # add title
                        font= Font(family="Open Sans, sans-serif"))  # set font
# init. 'annotations' key so score/name annotations can be appended below
figure['layout']['annotations'] = Annotations([])
Now, loop through the datasets and the classifiers to fill in the figure object:
i = 1  # subplot counter (Plotly subplot indices start at 1)
# Iterate over datasets in reverse order: Plotly numbers subplots from the
# bottom row up, so reversing reproduces the original top-to-bottom layout.
for ds in datasets[::-1]:
    # preprocess dataset, split into training and test part
    X, _y = ds
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, _y, test_size=0.4)
    # mesh-grid limits: data extent plus a .5 margin on every side
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    x = np.arange(x_min, x_max, h)
    y = np.arange(y_min, y_max, h)
    xx, yy = np.meshgrid(x, y)
    # Make color list (one item per scatter pt, keyed by class label)
    cm_train = [cm_bright[yy_train] for yy_train in y_train]
    cm_test = [cm_bright[yy_test] for yy_test in y_test]
    # Append 'data' with Scatter objects (left column: the raw dataset)
    figure['data'] += [make_Scatter(i, X_train[:, 0], X_train[:, 1],
                                    'Training', cm_train, 1)]
    figure['data'] += [make_Scatter(i, X_test[:, 0], X_test[:, 1],
                                    'Test', cm_test, 0.6)]
    # Format subplot i's axes.
    # BUG FIX: the y-axis must be built with make_YAxis — the original
    # called make_XAxis(y) here, storing an XAxis object under a yaxis key
    # (compare the classifier loop below, which uses make_YAxis correctly).
    figure['layout'].update({'xaxis{}'.format(i): make_XAxis(x)})
    figure['layout'].update({'yaxis{}'.format(i): make_YAxis(y)})
    i += 1  # increment subplot counter
    # iterate over classifiers
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        # Plot the decision boundary. For that, we will assign a color to
        # each point in the mesh [x_min, x_max]x[y_min, y_max].
        if hasattr(clf, "decision_function"):
            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        else:
            # classifiers without decision_function: use P(class 1) instead
            Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
        # Put the result into a color plot, append 'data' with Contour object
        Z = Z.reshape(xx.shape)
        figure['data'] += [make_Contour(i, x, y, Z)]
        # Plot also the training points and testing points, append 'data'
        figure['data'] += [make_Scatter(i, X_train[:, 0], X_train[:, 1],
                                        'Training', cm_train, 1)]
        figure['data'] += [make_Scatter(i, X_test[:, 0], X_test[:, 1],
                                        'Test', cm_test, 0.6)]
        # Format subplot i's axes
        figure['layout'].update({'xaxis{}'.format(i): make_XAxis(x)})
        figure['layout'].update({'yaxis{}'.format(i): make_YAxis(y)})
        # Add score annotation to 'layout'
        figure['layout']['annotations'] += [make_score_anno(i, x, y, score)]
        # Add the classifier-name annotation if on the top-most row.
        # The top row starts at (rows-1)*cols + 1; its first subplot is the
        # dataset scatter (no name), hence the strict '>'. With 3 datasets
        # and 9 classifiers this reproduces the original hard-coded i >= 22.
        if i > (len(datasets) - 1) * (len(classifiers) + 1) + 1:
            figure['layout']['annotations'] += [make_sbplt_anno(i, x, y, name)]
        i += 1  # increment subplot counter
The only thing left to do is to send the figure object to Plotly:
# Type in a filename and send figure object to Plotly
filename = 'ml-classifier-comp_r{}'.format(res_factor**2)
# py.iplot uploads the figure to the Plotly servers and embeds the
# resulting interactive graph in this notebook
py.iplot(figure, filename=filename,
         width=my_width, height=my_height)  # adjust notebook display width and height
Plotly graphs reside online.
To save a static copy of the above graph, run:
# Request a static image copy of the figure from the Plotly servers
py.image.save_as(figure, filename)
# Display the saved .png inside the notebook
from IPython.display import Image
Image('ml-classifier-comp_r64.png')
About Plotly
Big thanks to
# CSS styling within IPython notebook
from IPython.core.display import HTML
import urllib2
def css_styling():
    """Fetch the Plotly user-guide stylesheet and return it wrapped in an
    HTML display object, restyling this notebook when evaluated."""
    css_url = 'https://raw.githubusercontent.com/plotly/python-user-guide/master/custom.css'
    response = urllib2.urlopen(css_url)   # NOTE: Python 2 only (urllib2)
    return HTML(response.read())
css_styling()