In [19]:

from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.qda import QDA
import pylab as pl
from sklearn.datasets import load_iris
import cPickle
import numpy as np

%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib

In [7]:

def plot_data(X, y):
    """Draw one figure per distinct label, contrasting it with all other labels.

    For every unique value found in `y`, a new figure is opened. Rows of `X`
    whose label differs are plotted with the 'y.' format, rows carrying the
    label with the 'r+' format. Columns 0 and 1 of `X` are used as the plot
    coordinates, and the figure title names the label.
    """
    for current in np.unique(y):
        in_class = y == current
        rest = y != current
        pl.figure()
        # Plot the remaining classes first, then the highlighted class.
        pl.plot(X[rest, 0], X[rest, 1], 'y.')
        pl.plot(X[in_class, 0], X[in_class, 1], 'r+')
        pl.title('label=%i' % current)

In [13]:

def plot_data_classes(X, y, title=''):
    """Plot the first two columns of `X` in a single figure, one series per label.

    Parameters
    ----------
    X : array indexed as ``X[mask, column]``; columns 0 and 1 are plotted.
    y : array of labels, compared element-wise against each unique value.
    title : str, optional
        Title of the figure (defaults to an empty string).
    """
    pl.figure()
    pl.title(title)
    labels = np.unique(y)
    for label in labels:
        mask = y == label
        pl.plot(X[mask, 0], X[mask, 1], '+', label='label=%i' % label)
    # Build the legend once, after every series exists, instead of rebuilding
    # it on each iteration; skip it entirely when nothing was plotted.
    if len(labels):
        pl.legend(loc='best')

In [3]:

## Iris data
# Load the iris dataset shipped with scikit-learn and keep the feature matrix
# and the target vector under separate names.
iris = load_iris()
iris_X = iris.data
iris_y = iris.target

In [21]:

pca = PCA(n_components=2)
%time iris_pca_X = pca.fit_transform(iris_X)
lda = LDA(n_components=2)
%time iris_lda_X = lda.fit_transform(iris_X, iris_y)
print iris_pca_X.shape, iris_lda_X.shape

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.05 ms
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.17 ms
(150, 2) (150, 2)

In [22]:

# One figure per projection method, in the order PCA then LDA.
for method_name, embedding in (('PCA', iris_pca_X), ('LDA', iris_lda_X)):
    plot_data_classes(embedding, iris_y, method_name)

In [32]:

## ON BLACKBOX data
from sklearn.preprocessing import normalize
black_X, black_y = cPickle.load(open('data/blackbox.pkl'))
black_X = normalize(black_X)
print black_X.shape

(1000, 1875)

In [27]:

pca = PCA(n_components=2)
%time black_pca_X = pca.fit_transform(black_X)
lda = LDA(n_components=2)
%time black_lda_X = lda.fit_transform(black_X, black_y)
print black_pca_X.shape, black_lda_X.shape

CPU times: user 2.23 s, sys: 1.01 s, total: 3.24 s
Wall time: 1.66 s
CPU times: user 2.32 s, sys: 1.17 s, total: 3.49 s
Wall time: 1.75 s
(1000, 2) (1000, 2)

In [28]:

# One figure per projection method, in the order PCA then LDA.
for method_name, embedding in (('PCA', black_pca_X), ('LDA', black_lda_X)):
    plot_data_classes(embedding, black_y, method_name)

In [33]:

## lda with linear model
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_score
lda = LDA(n_components=150, )
lda_black_X = lda.fit_transform(black_X, black_y)
print lda_black_X.shape
sgd = SGDClassifier(n_jobs=-1, shuffle=True, n_iter=5)
cross_val_score(sgd, lda_black_X, black_y, cv = 5)

(1000, 9)

Out[33]:

array([ 0.945,  0.96 ,  0.93 ,  0.97 ,  0.925])

In [34]:

from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(black_X, black_y, test_size = 0.2)
print train_X.shape, test_X.shape

(800, 1875) (200, 1875)

In [35]:

from sklearn.pipeline import Pipeline

# Chain the LDA projection and the SGD classifier into one estimator and fit
# it on the training split only.
lda = LDA(n_components=9)
sgd = SGDClassifier(shuffle=True, n_iter=50)
clf = Pipeline(steps=[
    ('lda', lda),
    ('sgd', sgd),
])
clf.fit(train_X, train_y)

Out[35]:

Pipeline(steps=[('lda', LDA(n_components=9, priors=None)), ('sgd', SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0,
       fit_intercept=True, l1_ratio=0.15, learning_rate='optimal',
       loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5,
       random_state=None, rho=None, shuffle=True, verbose=0,
       warm_start=False))])

In [36]:

# Evaluate the fitted LDA + SGD pipeline on the held-out test split.
clf.score(test_X, test_y)

Out[36]:

0.14999999999999999

In [37]:

# Cross-validate the whole pipeline on the full dataset. The estimator passed
# in includes the LDA step, unlike the earlier cell that applied LDA to all
# of the data before cross-validating the classifier alone.
cross_val_score(clf, black_X, black_y, cv = 3)

Out[37]:

array([ 0.14371257,  0.13513514,  0.14414414])

Supervised LDA feature learning can EASILY overfit a small dataset

In [ ]: