from sklearn.ensemble import ExtraTreesClassifier
import autodiff
import numpy as np
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
Using gpu device 0: Quadro 4000
import cPickle
from sklearn.utils import shuffle
black_X, black_y = cPickle.load(open('../ml-practice/data/blackbox.pkl'))
black_X, black_y = shuffle(black_X, black_y)
def soft_absolute(u):
    # smooth approximation of |u| with a well-defined gradient everywhere
    return np.sqrt(u**2 + 1e-5)
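A quick toy check (mine, not from the original run) showing that soft_absolute tracks |u| away from zero while staying smooth at zero:
u = np.array([-2., -0.5, 0., 0.5, 2.])
print soft_absolute(u)   # roughly [2.0, 0.5, 0.0032, 0.5, 2.0]
print np.abs(u)          # [2.0, 0.5, 0.0, 0.5, 2.0]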
class SparseFilter(BaseEstimator, TransformerMixin):
    def __init__(self, n_components=100, activation=soft_absolute):
        self.n_components = n_components
        self.activation = activation

    def get_objective(self, X):
        def _objective(W):
            Y = self.activation(np.dot(X, W))
            # normalize each feature (column) across samples, then each sample (row) across features
            YY = Y / np.sqrt(np.sum(Y**2, axis=0) + 1e-5)
            YYY = YY / np.sqrt(np.sum(YY**2, axis=1) + 1e-5)[:, np.newaxis]
            # L1 penalty on the doubly normalized (nonnegative) features
            cost = np.sum(YYY)
            return cost
        return _objective

    def fit(self, X, y=None):
        n_feats = X.shape[1]
        W0 = np.random.uniform(low=-4. * np.sqrt(6. / (n_feats + self.n_components)),
                               high=4. * np.sqrt(6. / (n_feats + self.n_components)),
                               size=(n_feats, self.n_components))
        fn = self.get_objective(X)
        # minimize the sparse-filtering objective with L-BFGS (autodiff differentiates _objective)
        self.W_ = autodiff.optimize.fmin_l_bfgs_b(fn, W0,
                                                  maxfun=800, iprint=1)
        return self

    def transform(self, X):
        Y = self.activation(np.dot(X, self.W_))
        YY = Y / np.sqrt(np.sum(Y**2, axis=0) + 1e-5)
        YYY = YY / np.sqrt(np.sum(YY**2, axis=1) + 1e-5)[:, np.newaxis]
        return YYY
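As a sanity check of transform (my own sketch; it sets W_ by hand so it runs without the autodiff fit), the output features are nonnegative and each row comes out approximately L2-normalized:
# hypothetical check: bypass fit() by assigning W_ directly
rng = np.random.RandomState(0)
toy_X = rng.randn(20, 5)
sf_demo = SparseFilter(n_components=3)
sf_demo.W_ = rng.randn(5, 3)
toy_feats = sf_demo.transform(toy_X)
print toy_feats.min() >= 0                 # True: soft_absolute keeps features nonnegative
print np.sum(toy_feats**2, axis=1)[:3]     # each row's squared sum is roughly 1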
## feature learning
sf = SparseFilter(n_components = 1000)
feats_X = sf.fit_transform(black_X)
## feature selection
trees = ExtraTreesClassifier(n_estimators=600, max_features=50)
%time trees.fit(feats_X, black_y)
CPU times: user 9.06 s, sys: 32 ms, total: 9.09 s
Wall time: 9.09 s
ExtraTreesClassifier(bootstrap=False, compute_importances=None, criterion='gini',
           max_depth=None, max_features=50, min_density=None,
           min_samples_leaf=1, min_samples_split=2, n_estimators=600,
           n_jobs=1, oob_score=False, random_state=None, verbose=0)
feature_importances = np.mean(np.asarray([m.feature_importances_ for m in trees.estimators_]),
                              axis=0)
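As far as I know, the ensemble's own feature_importances_ property computes essentially this same average over estimators_, so the two should agree up to a constant normalization (a quick check of mine, assuming the trees object fitted above):
print np.abs(feature_importances - trees.feature_importances_).max()   # expected to be ~0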
%pylab inline
bar(range(feature_importances.shape[0]), feature_importances)
Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['shuffle']
`%pylab --no-import-all` prevents importing * from pylab and numpy
<Container object of 1000 artists>
important_features = feature_importances.argsort()[-150:]
print feature_importances[important_features]
[ 0.00110069  0.0011031   0.00110832 ...  0.0050033   0.00617199  0.00704302]
selected_feats = feats_X[:, important_features]
print selected_feats.shape
(1000, 150)
from sklearn.decomposition import PCA
pca_feats = PCA(n_components = 2).fit_transform(selected_feats)
n_classes = len(np.unique(black_y))
fig, axes = subplots(nrows=n_classes, ncols=1, figsize=(12, 12 * n_classes))
for i, cls in enumerate(np.unique(black_y)):
    in_cls = (black_y == cls)
    out_cls = (1 - (black_y == cls)).astype(np.bool)
    axes[i].plot(pca_feats[in_cls, 0], pca_feats[in_cls, 1], 'ro',
                 label='in_' + str(cls))
    axes[i].plot(pca_feats[out_cls, 0], pca_feats[out_cls, 1], 'b+',
                 label='out_' + str(cls))
    axes[i].legend(loc='best')
from sklearn.preprocessing import normalize
normalized_feats = normalize(selected_feats)
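normalize rescales each sample (row) to unit L2 norm. This matters here because, after keeping only 150 of the 1000 learned features, the rows of selected_feats are no longer unit-length even though the SparseFilter transform normalized them; a quick check of mine on the arrays above:
print np.sqrt(np.sum(selected_feats**2, axis=1))[:3]    # below 1 after dropping columns
print np.sqrt(np.sum(normalized_feats**2, axis=1))[:3]  # back to ~1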
svc = SVC(kernel='rbf')
params = {'C': [0.1, 1, 100, 1000, 10000],
          'gamma': [1e-3, 1e-2, 1e-1, 0., 1.]}  # gamma=0. falls back to 1/n_features in this scikit-learn version
gs = GridSearchCV(svc, params, cv = 3, n_jobs=-1)
%time gs.fit(normalized_feats, black_y)
print gs.best_params_
print gs.best_score_
print gs.best_estimator_.score(normalized_feats, black_y)
CPU times: user 560 ms, sys: 352 ms, total: 912 ms
Wall time: 1.98 s
{'C': 100, 'gamma': 1.0}
0.59
1.0
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
params = {'n_estimators': [100, 300, 500, 600],
          'max_features': [10, 50, 70, 100]}
gs = GridSearchCV(rf, params, cv = 3, n_jobs=-1)
%time gs.fit(normalized_feats, black_y)
print gs.best_params_
print gs.best_score_
CPU times: user 14.1 s, sys: 161 ms, total: 14.3 s
Wall time: 3min 38s
{'max_features': 50, 'n_estimators': 300}
0.546
Based on the winner's solution: sparse filtering for feature learning, randomized tree ensembles (ExtraTrees here) for feature selection, and an SVM as the final classifier (see the pipeline sketch at the end).
It is surprising that the SVC performs better than RandomForestClassifier here, probably because the number of samples is NOT large enough.
Sparse filtering is seldom stacked very deep (e.g. with further sparse-filtering layers or auto-encoders). soft_absolute works better as the sparse-filtering activation than a sigmoid.
It is useful to select fewer features for the SVM, and to normalize the data before feeding it in.
SVM is sometimes better than randomized trees when the number of samples is small.
0.82999999999999996
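For reference, the recipe from the first note above can be wired into a single scikit-learn Pipeline. This is only a sketch under the notebook's assumptions (the SparseFilter class defined here, the old grid_search-era API); TopKImportance is a hypothetical helper that mimics the manual top-150 selection done above:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

class TopKImportance(BaseEstimator, TransformerMixin):
    # hypothetical helper: keep the k features an ExtraTrees model ranks highest
    def __init__(self, k=150, n_estimators=600, max_features=50):
        self.k = k
        self.n_estimators = n_estimators
        self.max_features = max_features
    def fit(self, X, y):
        trees = ExtraTreesClassifier(n_estimators=self.n_estimators,
                                     max_features=self.max_features)
        trees.fit(X, y)
        self.support_ = trees.feature_importances_.argsort()[-self.k:]
        return self
    def transform(self, X):
        return X[:, self.support_]

pipeline = Pipeline([
    ('sparse_filter', SparseFilter(n_components=1000)),  # feature learning
    ('select', TopKImportance(k=150)),                   # tree-based feature selection
    ('normalize', Normalizer()),                         # unit-norm rows for the SVM
    ('svc', SVC(kernel='rbf', C=100, gamma=1.0)),        # final classifier (best grid-search params)
])
# pipeline.fit(black_X, black_y)   # would rerun the full (slow) sparse-filtering fit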