Try different feature extraction strategy for binary classification problems¶

data from sklearn-london workshop hosted in Kaggle
Things to try:
- discretization of features - by decision tree and log odd coding
- sparse filtering & forests feature selection

In [2]:

import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA


from autodiff import optimize
from scipy import stats
import pylab as pl

%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib

In [7]:

## load data
train_data = pd.read_csv('../data/sklearn-london/train.csv', header=None)
train_labels = pd.read_csv('../data/sklearn-london/trainLabels.csv', header=None)

X = np.asarray(train_data)
y = np.asarray(train_labels).ravel()

print X.shape, y.shape
print X.dtype, y.dtype, np.unique(y)

## shuffle data
X, y = shuffle(X, y)
print X.shape, y.shape

(1000, 40) (1000,)
float64 int64 [0 1]
(1000, 40) (1000,)

In [8]:

train_index, test_index = train_test_split(np.arange(X.shape[0]), test_size = 0.2)
print train_index.shape, test_index.shape

(800,) (200,)

base case - on raw features¶

In [9]:

## analzye min/max, center, shape of different features
def statistics(feat):
    return (feat.min(), feat.max(), feat.std(), stats.skew(feat))
mins, maxs, stds, skewness = zip(*[statistics(X[:, i]) for i in xrange(X.shape[1])])
print np.std(mins), np.min(mins), np.max(mins)
print np.std(maxs), np.min(maxs), np.max(maxs)
print np.std(stds), np.min(stds), np.max(stds)
print np.std(skewness), np.min(skewness), np.max(skewness)

3.52909765249 -16.4219014729 -2.6956019378
3.29453132536 2.54650651811 17.5653445056
0.967315549334 0.966798559976 4.53656367456
0.135242790356 -0.412842893417 0.333415441978

In [10]:

ss = StandardScaler()
train_X = ss.fit_transform(X[train_index, :])
test_X = ss.transform(X[test_index, :])
train_y = y[train_index]
test_y = y[test_index]

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 0.03}
0.86125
0.9

Base case - on PCA¶

In [11]:

pca = PCA(whiten=True)
pca.fit(X)
feature_importances = pd.DataFrame(pca.explained_variance_ratio_)
feature_importances.plot(kind = 'bar')

Out[11]:

<matplotlib.axes.AxesSubplot at 0x9346c10>

In [12]:

pca = PCA(n_components=15, whiten=True)
pca_X = pca.fit_transform(X)
train_X = pca_X[train_index, :]
test_X = pca_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]
print train_X.shape, test_X.shape
print train_y.shape, test_y.shape

(800, 15) (200, 15)
(800,) (200,)

In [13]:

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 0.1}
0.91625
0.93

benchmark 1 - sparse filtering (feature extraction) + decision forest (feature selection)¶

In [10]:

def soft_absolute(u):
    epsilon = 1e-8
    return np.sqrt(u*u + epsilon)

def logistic(u):
    return 1. / (1. + np.exp(-u))

class SparseFilter(BaseEstimator, TransformerMixin):
    def __init__(self, n_features = 200, n_iterations = 300, activate=soft_absolute):
        self.epsilon = 1e-8
        self.n_features = n_features
        self.n_iterations = n_iterations
        self.activate = activate
    def fit(self, X, y = None):
        n_samples, n_dim = X.shape
        W = np.random.randn(n_dim, self.n_features)
        b = np.random.randn(self.n_features)
        obj_fn = self.get_objective_fn(X)
        self.W_, self.b_ = optimize.fmin_l_bfgs_b(obj_fn, (W, b), 
                                iprint = 1, 
                                maxfun = self.n_iterations)
        return self
    def get_objective_fn(self, X):
        def _objective_fn(W, b):
            Y = self.activate(np.dot(X, W) + b)
            Y = Y / np.sqrt(np.sum(Y*Y, axis = 0) + self.epsilon)
            Y = Y / np.sqrt(np.sum(Y*Y, axis = 1)[:, np.newaxis] + self.epsilon)
            return np.sum(Y)
        return _objective_fn
    def transform(self, X):
        Y = self.activate(np.dot(X, self.W_) + self.b_)
        Y = Y / np.sqrt(np.sum(Y*Y, axis=0) + self.epsilon)
        Y = Y / np.sqrt(np.sum(Y*Y, axis=1)[:, np.newaxis] + self.epsilon)
        return Y
    

In [11]:

sf = SparseFilter(n_features=50, n_iterations=5000)
sf_X = sf.fit_transform(X)
sf_X = ss.fit_transform(sf_X)

In [12]:

train_X = sf_X[train_index, :]
train_y = y[train_index, :]
test_X = sf_X[test_index, :]
test_y = y[test_index, :]
print train_X.shape, test_X.shape
print train_y.shape, test_y.shape

(800, 50) (200, 50)
(800,) (200,)

In [13]:

trees = ExtraTreesClassifier(n_estimators=100, max_features=50, n_jobs=-1, random_state=0)
trees.fit(train_X, train_y)
feature_importances = pd.DataFrame(trees.feature_importances_)
feature_importances.plot(kind='bar')

Out[13]:

<matplotlib.axes.AxesSubplot at 0x7de04d0>

In [14]:

selected_features = np.where(feature_importances >= 0.01)[0]
train_X = train_X[:, selected_features]
test_X = test_X[:, selected_features]
print train_X.shape, test_X.shape

(800, 15) (200, 15)

In [15]:

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 0.1}
0.93375
0.905

In [16]:

gbt = GradientBoostingClassifier(random_state=0)
learning_rates = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(gbt, {'learning_rate': learning_rates}, 
                  n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)

print gs.best_params_
print gs.best_score_

gbt = GradientBoostingClassifier(**gs.best_params_)
gbt.fit(train_X, train_y)
print gbt.score(test_X, test_y)

{'learning_rate': 0.1}
0.93625
0.88

In [17]:

from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, )
max_features_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
gs = GridSearchCV(forest, {'max_features': max_features_choices}, 
                  n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

forest = RandomForestClassifier(n_estimators=100, **gs.best_params_)
forest.fit(train_X, train_y)
print forest.score(test_X, test_y)

{'max_features': 0.3}
0.935
0.885

benchmark 2 - single variable decision tree as discretizer¶

In [18]:

from sklearn.tree import DecisionTreeClassifier
class TreeTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.trees_ = []
        for ifeat in xrange(n_features):
            ## IT IS VERY IMPORTANT TO SET max_depth = small value to avoid overfitting
            self.trees_.append(DecisionTreeClassifier(max_depth=1))
            self.trees_[ifeat].fit(X[:, ifeat].reshape((-1, 1)), y)
        return self
    def transform(self, X):
        n_samples, n_features = X.shape
        assert n_features == len(self.trees_)
        return np.concatenate([self.trees_[ifeat].predict_proba(X[:, ifeat].reshape((-1, 1)))[:, :-1]
                       for ifeat in xrange(n_features)], 
                    axis = 1)

In [19]:

tt = TreeTransformer()
train_y = y[train_index, :]
train_X = tt.fit_transform(X[train_index, :], train_y)
#train_X = ss.fit_transform(train_X)
test_y = y[test_index, :]
test_X = tt.transform(X[test_index, :])
#test_X = ss.transform(test_X)
print train_X.shape, train_y.shape
print test_X.shape, test_y.shape

(800, 40) (800,)
(200, 40) (200,)

In [20]:

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 3.0}
0.84125
0.79

benchmark 3 - extra embedding tree (unsupervised)¶

In [14]:

from sklearn.ensemble import RandomTreesEmbedding
rte = RandomTreesEmbedding(n_estimators=20, max_depth=2, n_jobs=-1)
embedded_X = rte.fit_transform(X)
print embedded_X.shape

(1000, 79)

In [15]:

embedded_X = embedded_X.toarray()
train_X = embedded_X[train_index, :]
test_X = embedded_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]

print train_X.shape, test_X.shape
print train_y.shape, test_y.shape

(800, 79) (200, 79)
(800,) (200,)

In [16]:

## feature selection
trees = ExtraTreesClassifier(n_estimators=50, max_features=0.1, random_state=0, n_jobs=-1)
trees.fit(train_X, train_y)
pd.DataFrame(trees.feature_importances_).plot(kind = 'bar')

Out[16]:

<matplotlib.axes.AxesSubplot at 0x90f1cd0>

In [17]:

selected_features = np.where(trees.feature_importances_ >= 0.001)[0]
train_X = ss.fit_transform(train_X[:, selected_features])
test_X = ss.transform(test_X[:, selected_features])

In [18]:

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 0.01}
0.6275
0.685

benchmark 4 - extra embedding tree + PCA¶

In [30]:

from sklearn.ensemble import RandomTreesEmbedding
rte = RandomTreesEmbedding(n_estimators=100, max_depth=3, n_jobs=-1)
embedded_X = rte.fit_transform(X)
print embedded_X.shape

(1000, 735)

In [31]:

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(embedded_X.toarray())
pd.DataFrame(pca.explained_variance_ratio_).plot(kind = 'bar')

Out[31]:

<matplotlib.axes.AxesSubplot at 0x9d13b10>

In [34]:

pca = PCA(n_components=30)
pca_embedded_X = pca.fit_transform(embedded_X.toarray())
pca_embedded_X = ss.fit_transform(pca_embedded_X)


train_pca_embedded_X = pca_embedded_X[train_index, :]
test_pca_embedded_X = pca_embedded_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]

print train_pca_embedded_X.shape, test_pca_embedded_X.shape
print train_y.shape, test_y.shape

(800, 30) (200, 30)
(800,) (200,)

In [35]:

svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 0.01}
0.6275
0.685

Visualization¶

In [26]:

for ylabel, c in zip(np.unique(y), ['r.', 'b+']):
    pl.plot(sf_X[y==ylabel, 17], sf_X[y==ylabel, 12], c)

Putting together¶

Use X, y, train_index, test_index

In [27]:

## UNSUPERVISED FEATURE LEARNING

## train a sparse filter on the whole data
sf = SparseFilter(n_features=50, n_iterations=1000)
sf_X = sf.fit_transform(X)
## train a PCA on the whole data
pca = PCA(n_components=15)
pca_X = pca.fit_transform(X)
## combine both featues
sf_pca_X = np.c_[sf_X, pca_X]
## standarize the features
ss = StandardScaler()
norm_sf_pca_X = ss.fit_transform(sf_pca_X)
print norm_sf_pca_X.shape

(1000, 65)

In [28]:

## train and test split

train_X = norm_sf_pca_X[train_index, :]
test_X = norm_sf_pca_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]

print train_X.shape, test_X.shape
print train_y.shape, test_y.shape

(800, 65) (200, 65)
(800,) (200,)

In [29]:

## SVC model


svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(probability=True, **gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)

{'gamma': 0.03}
0.9275
0.92

In [30]:

## Extra Random Forest

forest = ExtraTreesClassifier(n_estimators=100, )
max_features_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
gs = GridSearchCV(forest, {'max_features': max_features_choices}, 
                  n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_

forest = ExtraTreesClassifier(n_estimators=100, **gs.best_params_)
forest.fit(train_X, train_y)
print forest.score(test_X, test_y)

{'max_features': 0.1}
0.92625
0.935

In [31]:

pd.DataFrame(forest.feature_importances_).plot(kind = 'bar')

Out[31]:

<matplotlib.axes.AxesSubplot at 0x8a0ca90>

In [32]:

yhat_prob = (svc.predict_proba(test_X)[:, 0] + forest.predict_proba(test_X)[:, 0]) / 2.
yhat = (yhat_prob < 0.5).astype(np.int)
print yhat.shape
print np.mean(yhat==test_y)

(200,)
0.93

In [33]:

selected_features = np.where(forest.feature_importances_ >= 0.01)[0]
selected_train_X = train_X[:, selected_features]
selected_test_X = test_X[:, selected_features]

In [36]:

## SVC model


svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(selected_train_X, train_y)
print gs.best_params_
print gs.best_score_

svc = SVC(probability = True, **gs.best_params_)
svc.fit(selected_train_X, train_y)
print svc.score(selected_test_X, test_y)

{'gamma': 0.03}
0.93375
0.945

In [38]:

yhat_prob = (svc.predict_proba(selected_test_X)[:, 0] + forest.predict_proba(test_X)[:, 0]) / 2.
yhat = (yhat_prob < 0.5).astype(np.int)
print yhat.shape
print np.mean(yhat==test_y)

(200,)
0.93

In [ ]: