Notebook

In [1]:

!date

Wed Jan 15 22:45:24 EST 2014

In [2]:

from pykalman.classifier import GenerativeBayes

import pandas as pd
import numpy as np
rnorm = np.random.normal

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.qda import QDA
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GMM
from sklearn.cross_validation import train_test_split

%matplotlib inline
import matplotlib.pylab as plt

Comparing Machine Learning Techniques¶

After seeing this post I wanted to perform the same analysis using scikit-learn. The classifiers used are listed below with their parameters. I do not have a neural network like the one shown in the post, but I added a few extra classifiers. The GenerativeBayes classifier is based on commits for this issue; it trains a Gaussian Mixture Model per class. One thing this notebook shows that I did not see in the post is an out-of-sample score for each classifier, which is printed on each plot.

In [3]:

# Each display name sits next to the estimator it describes; the two parallel
# lists used by the rest of the notebook are derived from this single pairing
# so they cannot drift out of step.
_named_classifiers = [
    ('Decision Tree', DecisionTreeClassifier()),
    ('K Neighbors', KNeighborsClassifier(3)),
    ('Logistic Regression', LogisticRegression()),
    ('SVC (Gaussian)', SVC(kernel="rbf")),
    ('Ada Boost', AdaBoostClassifier()),
    ('GMM (1 component)',
     GenerativeBayes(GMM(n_components=1, covariance_type='full',
                         init_params='wc', n_iter=20))),
    ('GMM (2 component)',
     GenerativeBayes(GMM(n_components=2, covariance_type='full',
                         init_params='wc', n_iter=20))),
    ('QDA', QDA()),
    ('Random Forest', RandomForestClassifier()),
]

classifiers = [estimator for _, estimator in _named_classifiers]

clf_names = [title for title, _ in _named_classifiers]

In [4]:

def plot_results(classifiers, df, names=None):
    """Fit each classifier on a random split of ``df`` and plot its decision regions.

    One subplot is drawn per classifier: filled contours of the predicted
    class over a grid spanning the data, every sample in ``df`` overlaid as a
    scatter coloured by label, and the held-out accuracy printed in the
    lower-right corner.

    Parameters
    ----------
    classifiers : sequence of estimators
        Objects exposing ``fit``, ``score`` and ``predict``.  They are fitted
        in place.
    df : pandas.DataFrame
        Must contain the columns ``'x'``, ``'y'`` and ``'label'``.  The
        scatter overlay looks up labels ``0 .. n_classes - 1`` and has
        colours for at most four classes.
    names : sequence of str, optional
        Subplot titles, one per classifier.  Defaults to the notebook-level
        ``clf_names`` list.
    """
    classifiers = list(classifiers)
    if names is None:
        names = clf_names

    plt.figure(figsize=(14, 14))

    X = df[['x', 'y']]
    Y = df['label']
    # 60/40 train/test split; the split is random, so scores vary between runs.
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.4)
    n_classes = len(Y.unique())
    plot_colors = "brym"
    plot_step = 0.02

    # Positional column access through .iloc; the older .ix indexer is
    # deprecated and no longer exists in current pandas.
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    # Keep the original 4x3 layout for up to twelve classifiers and add rows
    # beyond that instead of failing on an out-of-range subplot index.
    n_cols = 3
    n_rows = max(4, -(-len(classifiers) // n_cols))

    for j, clf in enumerate(classifiers):
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        ax = plt.subplot(n_rows, n_cols, j + 1)
        Z = clf.predict(grid_points)
        Z = Z.reshape(xx.shape)
        plt.contourf(xx, yy, Z, cmap=plt.cm.Paired)

        # Overlay every sample (train and test alike), one colour per class.
        # No cmap here: it has no effect when an explicit colour is given.
        for i, color in zip(range(n_classes), plot_colors):
            members = X[Y == i]
            plt.scatter(members.x, members.y, c=color, label=i)

        # Out-of-sample accuracy, leading zero stripped (e.g. ".93").
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
        plt.title(names[j])

Linearly Separable Classes (Simple)¶

In [5]:

# Two tight Gaussian blobs (one per class) that a straight line can separate
N = 50
blobs = []
for label, (cx, cy) in [(0, (2.0, 2.0)), (1, (1.0, 1.0))]:
    # x and y are drawn with separate calls, x first
    xs = rnorm(loc=cx, scale=0.5, size=(N, 1))
    ys = rnorm(loc=cy, scale=0.5, size=(N, 1))
    blob = pd.DataFrame(np.hstack((xs, ys)), columns=['x', 'y'])
    blob['label'] = label
    blobs.append(blob)
p1, p2 = blobs
df = pd.concat(blobs)

plot_results(classifiers, df)

3 Class Linearly Separable Classes (Simple)¶

In [6]:

# Three tight Gaussian blobs along the diagonal, one class each
N = 50
blobs = []
for label, (cx, cy) in [(0, (3.0, 3.0)), (1, (2.0, 2.0)), (2, (1.0, 1.0))]:
    # x and y are drawn with separate calls, x first
    xs = rnorm(loc=cx, scale=0.5, size=(N, 1))
    ys = rnorm(loc=cy, scale=0.5, size=(N, 1))
    blob = pd.DataFrame(np.hstack((xs, ys)), columns=['x', 'y'])
    blob['label'] = label
    blobs.append(blob)
p1, p2, p3 = blobs
df = pd.concat(blobs)

plot_results(classifiers, df)

XOR Pattern, 2 Class¶

In [7]:

# XOR layout with tight blobs: opposite quadrants share a label
N = 50
blobs = []
for label, (cx, cy) in [(0, (1.0, 1.0)),
                        (1, (-1.0, 1.0)),
                        (0, (-1.0, -1.0)),
                        (1, (1.0, -1.0))]:
    # x and y are drawn with separate calls, x first
    xs = rnorm(loc=cx, scale=0.5, size=(N, 1))
    ys = rnorm(loc=cy, scale=0.5, size=(N, 1))
    blob = pd.DataFrame(np.hstack((xs, ys)), columns=['x', 'y'])
    blob['label'] = label
    blobs.append(blob)
p1, p2, p3, p4 = blobs
df = pd.concat(blobs)

plot_results(classifiers, df)

XOR, 2 Class, Increase Spread¶

In [8]:

# XOR layout again, but with unit-variance blobs so the quadrants overlap
N = 50
blobs = []
for label, (cx, cy) in [(0, (1.0, 1.0)),
                        (1, (-1.0, 1.0)),
                        (0, (-1.0, -1.0)),
                        (1, (1.0, -1.0))]:
    # x and y are drawn with separate calls, x first
    xs = rnorm(loc=cx, scale=1.0, size=(N, 1))
    ys = rnorm(loc=cy, scale=1.0, size=(N, 1))
    blob = pd.DataFrame(np.hstack((xs, ys)), columns=['x', 'y'])
    blob['label'] = label
    blobs.append(blob)
p1, p2, p3, p4 = blobs
df = pd.concat(blobs)

plot_results(classifiers, df)

Complex 4 Class Example¶

In [9]:

# Four overlapping unit-variance blobs, one per quadrant, each its own class
N = 100
blobs = []
for label, (cx, cy) in enumerate([(1.0, 1.0),
                                  (-1.0, 1.0),
                                  (-1.0, -1.0),
                                  (1.0, -1.0)]):
    # x and y are drawn with separate calls, x first
    xs = rnorm(loc=cx, scale=1.0, size=(N, 1))
    ys = rnorm(loc=cy, scale=1.0, size=(N, 1))
    blob = pd.DataFrame(np.hstack((xs, ys)), columns=['x', 'y'])
    blob['label'] = label
    blobs.append(blob)
p1, p2, p3, p4 = blobs
df = pd.concat(blobs)

plot_results(classifiers, df)

In [9]: