from sklearn.ensemble import ExtraTreesClassifier
import autodiff
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
Using gpu device 0: Quadro 4000
import cPickle
from sklearn.utils import shuffle
black_X, black_y = cPickle.load(open('../ml-practice/data/blackbox.pkl'))
black_X, black_y = shuffle(black_X, black_y)
def soft_absolute(u):
    # smooth approximation of |u| with a well-defined gradient everywhere
    return np.sqrt(u**2 + 1e-5)
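A quick toy check (mine, not from the original run) showing that soft_absolute tracks |u| away from zero while staying smooth at zero:
u = np.array([-2., -0.5, 0., 0.5, 2.])
print soft_absolute(u)   # roughly [2.0, 0.5, 0.0032, 0.5, 2.0]
print np.abs(u)          # [2.0, 0.5, 0.0, 0.5, 2.0]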
class SparseFilter(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=100, activation=soft_absolute):
        self.n_components = n_components
        self.activation = activation

    def get_objective(self, X):
        def _objective(W):
            Y = self.activation(np.dot(X, W))
            # normalize each feature (column) across samples, then each sample (row) across features
            YY = Y / np.sqrt(np.sum(Y**2, axis=0) + 1e-5)
            YYY = YY / np.sqrt(np.sum(YY**2, axis=1) + 1e-5)[:, np.newaxis]
            # L1 penalty on the doubly normalized (nonnegative) features
            cost = np.sum(YYY)
            return cost
        return _objective

    def fit(self, X, y=None):
        n_feats = X.shape[1]
        W0 = np.random.uniform(low=-4. * np.sqrt(6. / (n_feats + self.n_components)),
                               high=4. * np.sqrt(6. / (n_feats + self.n_components)),
                               size=(n_feats, self.n_components))
        fn = self.get_objective(X)
        # minimize the sparse-filtering objective with L-BFGS (autodiff differentiates _objective)
        self.W_ = autodiff.optimize.fmin_l_bfgs_b(fn, W0,
                                                  maxfun=800, iprint=1)
        return self

    def transform(self, X):
        Y = self.activation(np.dot(X, self.W_))
        YY = Y / np.sqrt(np.sum(Y**2, axis=0) + 1e-5)
        YYY = YY / np.sqrt(np.sum(YY**2, axis=1) + 1e-5)[:, np.newaxis]
        return YYY
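As a sanity check of transform (my own sketch; it sets W_ by hand so it runs without the autodiff fit), the output features are nonnegative and each row comes out approximately L2-normalized:
# hypothetical check: bypass fit() by assigning W_ directly
rng = np.random.RandomState(0)
toy_X = rng.randn(20, 5)
sf_demo = SparseFilter(n_components=3)
sf_demo.W_ = rng.randn(5, 3)
toy_feats = sf_demo.transform(toy_X)
print toy_feats.min() >= 0                 # True: soft_absolute keeps features nonnegative
print np.sum(toy_feats**2, axis=1)[:3]     # each row's squared sum is roughly 1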
## feature learning
sf = SparseFilter(n_components = 1000)
feats_X = sf.fit_transform(black_X)
## feature selection
trees = ExtraTreesClassifier(n_estimators=600, max_features=50)
%time trees.fit(feats_X, black_y)
CPU times: user 9.06 s, sys: 32 ms, total: 9.09 s
Wall time: 9.09 s
ExtraTreesClassifier(bootstrap=False, compute_importances=None, criterion='gini',
           max_depth=None, max_features=50, min_density=None,
           min_samples_leaf=1, min_samples_split=2, n_estimators=600,
           n_jobs=1, oob_score=False, random_state=None, verbose=0)
feature_importances = np.mean(np.asarray([m.feature_importances_ for m in trees.estimators_]),
                              axis=0)
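As far as I know, the ensemble's own feature_importances_ property computes essentially this same average over estimators_, so the two should agree up to a constant normalization (a quick check of mine, assuming the trees object fitted above):
print np.abs(feature_importances - trees.feature_importances_).max()   # expected to be ~0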
%pylab inline
bar(range(feature_importances.shape[0]), feature_importances)
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['shuffle']
`%pylab --no-import-all` prevents importing * from pylab and numpy
<Container object of 1000 artists>
important_features = feature_importances.argsort()[-150:]
print feature_importances[important_features]
[ 0.00110069  0.0011031   0.00110832 ...  0.0050033   0.00617199  0.00704302]
selected_feats = feats_X[:, important_features]
print selected_feats.shape
(1000, 150)
from sklearn.decomposition import PCA
pca_feats = PCA(n_components = 2).fit_transform(selected_feats)
n_classes = len(np.unique(black_y))
fig, axes = subplots(nrows=n_classes, ncols=1, figsize=(12, 12 * n_classes))
for i, cls in enumerate(np.unique(black_y)):
    in_cls = (black_y == cls)
    out_cls = (1 - (black_y == cls)).astype(np.bool)
    axes[i].plot(pca_feats[in_cls, 0], pca_feats[in_cls, 1], 'ro',
                 label='in_' + str(cls))
    axes[i].plot(pca_feats[out_cls, 0], pca_feats[out_cls, 1], 'b+',
                 label='out_' + str(cls))
    axes[i].legend(loc='best')
from sklearn.preprocessing import normalize
normalized_feats = normalize(selected_feats)
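normalize rescales each sample (row) to unit L2 norm. This matters here because, after keeping only 150 of the 1000 learned features, the rows of selected_feats are no longer unit-length even though the SparseFilter transform normalized them; a quick check of mine on the arrays above:
print np.sqrt(np.sum(selected_feats**2, axis=1))[:3]    # below 1 after dropping columns
print np.sqrt(np.sum(normalized_feats**2, axis=1))[:3]  # back to ~1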
svc = SVC(kernel='rbf')
params = {'C': [0.1, 1, 100, 1000, 10000],
          'gamma': [1e-3, 1e-2, 1e-1, 0., 1.]}  # gamma=0. falls back to 1/n_features in this scikit-learn version
gs = GridSearchCV(svc, params, cv = 3, n_jobs=-1)
%time gs.fit(normalized_feats, black_y)
print gs.best_params_
print gs.best_score_
print gs.best_estimator_.score(normalized_feats, black_y)
CPU times: user 560 ms, sys: 352 ms, total: 912 ms
Wall time: 1.98 s
{'C': 100, 'gamma': 1.0}
0.59
1.0
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
params = {'n_estimators': [100, 300, 500, 600],
          'max_features': [10, 50, 70, 100]}
gs = GridSearchCV(rf, params, cv = 3, n_jobs=-1)
%time gs.fit(normalized_feats, black_y)
print gs.best_params_
print gs.best_score_
CPU times: user 14.1 s, sys: 161 ms, total: 14.3 s
Wall time: 3min 38s
{'max_features': 50, 'n_estimators': 300}
0.546
Based on the winner's solution: sparse filtering for feature learning, randomized tree ensembles (ExtraTrees here) for feature selection, and an SVM as the final classifier (see the pipeline sketch at the end).
It is surprising that the SVC performs better than RandomForestClassifier here, probably because the number of samples is NOT large enough.
Sparse filtering is seldom stacked very deep (e.g. with further sparse-filtering layers or auto-encoders). soft_absolute works better as the sparse-filtering activation than a sigmoid.
It is useful to select fewer features for the SVM, and to normalize the data before feeding it in.
SVM is sometimes better than randomized trees when the number of samples is small.
0.82999999999999996
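For reference, the recipe from the first note above can be wired into a single scikit-learn Pipeline. This is only a sketch under the notebook's assumptions (the SparseFilter class defined here, the old grid_search-era API); TopKImportance is a hypothetical helper that mimics the manual top-150 selection done above:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

class TopKImportance(BaseEstimator, TransformerMixin):
    # hypothetical helper: keep the k features an ExtraTrees model ranks highest
    def __init__(self, k=150, n_estimators=600, max_features=50):
        self.k = k
        self.n_estimators = n_estimators
        self.max_features = max_features
    def fit(self, X, y):
        trees = ExtraTreesClassifier(n_estimators=self.n_estimators,
                                     max_features=self.max_features)
        trees.fit(X, y)
        self.support_ = trees.feature_importances_.argsort()[-self.k:]
        return self
    def transform(self, X):
        return X[:, self.support_]

pipeline = Pipeline([
    ('sparse_filter', SparseFilter(n_components=1000)),  # feature learning
    ('select', TopKImportance(k=150)),                   # tree-based feature selection
    ('normalize', Normalizer()),                         # unit-norm rows for the SVM
    ('svc', SVC(kernel='rbf', C=100, gamma=1.0)),        # final classifier (best grid-search params)
])
# pipeline.fit(black_X, black_y)   # would rerun the full (slow) sparse-filtering fit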