from sklearn.decomposition import PCA
from sklearn.lda import LDA
from sklearn.qda import QDA
import pylab as pl
from sklearn.datasets import load_iris
import cPickle
import numpy as np
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
def plot_data(X, y):
    """Draw one figure per class, contrasting that class with the rest.

    For each distinct value found in ``y`` a new figure is opened. The
    samples that do NOT carry that label are drawn first with the ``'y.'``
    format, then the samples that do carry it with ``'r+'``. Only the
    first two columns of ``X`` are plotted, and the figure is titled
    with the label.
    """
    for label in np.unique(y):
        # Split the rows once per label; columns 0 and 1 are the plot axes.
        rest = X[y != label]
        members = X[y == label]

        pl.figure()
        pl.plot(rest[:, 0], rest[:, 1], 'y.')
        pl.plot(members[:, 0], members[:, 1], 'r+')
        pl.title('label=%i' % label)
def plot_data_classes(X, y, title=''):
    """Plot every class of ``y`` in a single figure.

    A new figure is opened and given ``title``. For each distinct value
    in ``y`` the matching rows of ``X`` are drawn with ``'+'`` markers
    (first column against second column) under a ``label=<value>``
    legend entry. The legend is placed with ``loc='best'``.
    """
    pl.figure()
    pl.title(title)
    for label in np.unique(y):
        members = X[y == label]
        legend_text = 'label=%i' % label
        pl.plot(members[:, 0], members[:, 1], '+', label=legend_text)
    # Drawn once, after every class has registered its legend entry.
    pl.legend(loc='best')
## ON IRIS data
# Load the iris dataset and split it into the data matrix and class targets.
iris = load_iris()
iris_X, iris_y = iris.data, iris.target
# PCA projection to 2 components; fit on the data only (no labels passed).
pca = PCA(n_components=2)
%time iris_pca_X = pca.fit_transform(iris_X)
# LDA projection to 2 components; unlike PCA it is fit with the labels.
lda = LDA(n_components=2)
%time iris_lda_X = lda.fit_transform(iris_X, iris_y)
# Python 2 print statement: both projected shapes on one line.
print iris_pca_X.shape, iris_lda_X.shape
CPU times: user 0 ns, sys: 0 ns, total: 0 ns Wall time: 1.05 ms CPU times: user 4 ms, sys: 0 ns, total: 4 ms Wall time: 2.17 ms (150, 2) (150, 2)
# One figure per projection, with the points grouped by class label.
plot_data_classes(iris_pca_X, iris_y, 'PCA')
plot_data_classes(iris_lda_X, iris_y, 'LDA')
## ON BLACKBOX data
from sklearn.preprocessing import normalize
black_X, black_y = cPickle.load(open('data/blackbox.pkl'))
black_X = normalize(black_X)
print black_X.shape
(1000, 1875)
# 2-component PCA (data only) and LDA (data + labels) projections of the
# normalized blackbox data; each fit is timed with the %time magic.
pca = PCA(n_components=2)
%time black_pca_X = pca.fit_transform(black_X)
lda = LDA(n_components=2)
%time black_lda_X = lda.fit_transform(black_X, black_y)
# Python 2 print statement: both projected shapes on one line.
print black_pca_X.shape, black_lda_X.shape
CPU times: user 2.23 s, sys: 1.01 s, total: 3.24 s Wall time: 1.66 s CPU times: user 2.32 s, sys: 1.17 s, total: 3.49 s Wall time: 1.75 s (1000, 2) (1000, 2)
# One figure per projection, with the points grouped by class label.
plot_data_classes(black_pca_X, black_y, 'PCA')
plot_data_classes(black_lda_X, black_y, 'LDA')
## lda with linear model
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_score
# 150 components are requested, but the shape recorded for this cell is
# (1000, 9) -- presumably LDA caps its output at n_classes - 1; confirm.
lda = LDA(n_components=150, )
# NOTE(review): LDA is fit here on ALL samples and ALL labels, before
# cross-validation starts. Every CV test fold has therefore already
# influenced the features, so the scores from this cell are likely
# optimistic. Passing an LDA+SGD pipeline to cross_val_score avoids this.
lda_black_X = lda.fit_transform(black_X, black_y)
print lda_black_X.shape
sgd = SGDClassifier(n_jobs=-1, shuffle=True, n_iter=5)
# Cross-validate (cv=5) the SGD classifier on the pre-projected features.
cross_val_score(sgd, lda_black_X, black_y, cv = 5)
(1000, 9)
array([ 0.945, 0.96 , 0.93 , 0.97 , 0.925])
from sklearn.cross_validation import train_test_split
# Hold out 20% of the (untransformed) blackbox samples as a test set.
# NOTE(review): no random_state is passed, so the split is presumably
# different on every run -- pin it if the scores must be reproducible.
train_X, test_X, train_y, test_y = train_test_split(black_X, black_y, test_size = 0.2)
print train_X.shape, test_X.shape
(800, 1875) (200, 1875)
from sklearn.pipeline import Pipeline
# Chain LDA (9 components) and an SGD classifier into one estimator, so
# that fitting it on train_X/train_y fits the LDA step on the training
# samples only; the held-out test samples are not passed to fit.
lda = LDA(n_components=9)
sgd = SGDClassifier(shuffle = True, n_iter = 50)
clf = Pipeline(steps = [('lda', lda),
('sgd', sgd)])
clf.fit(train_X, train_y)
Pipeline(steps=[('lda', LDA(n_components=9, priors=None)), ('sgd', SGDClassifier(alpha=0.0001, class_weight=None, epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15, learning_rate='optimal', loss='hinge', n_iter=50, n_jobs=1, penalty='l2', power_t=0.5, random_state=None, rho=None, shuffle=True, verbose=0, warm_start=False))])
# Score the fitted pipeline on the held-out split, which was not used in fit.
clf.score(test_X, test_y)
0.14999999999999999
# Cross-validate (cv=3) the whole pipeline on the raw blackbox data: the
# estimator passed in includes the LDA step, not pre-projected features.
cross_val_score(clf, black_X, black_y, cv = 3)
array([ 0.14371257, 0.13513514, 0.14414414])