import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from autodiff import optimize
from scipy import stats
import pylab as pl
%pylab inline --no-import-all
Populating the interactive namespace from numpy and matplotlib
## load data
train_data = pd.read_csv('../data/sklearn-london/train.csv', header=None)
train_labels = pd.read_csv('../data/sklearn-london/trainLabels.csv', header=None)
X = np.asarray(train_data)
y = np.asarray(train_labels).ravel()
print X.shape, y.shape
print X.dtype, y.dtype, np.unique(y)
## shuffle data
X, y = shuffle(X, y)
print X.shape, y.shape
(1000, 40) (1000,) float64 int64 [0 1] (1000, 40) (1000,)
train_index, test_index = train_test_split(np.arange(X.shape[0]), test_size = 0.2)
print train_index.shape, test_index.shape
(800,) (200,)
## analzye min/max, center, shape of different features
def statistics(feat):
return (feat.min(), feat.max(), feat.std(), stats.skew(feat))
mins, maxs, stds, skewness = zip(*[statistics(X[:, i]) for i in xrange(X.shape[1])])
print np.std(mins), np.min(mins), np.max(mins)
print np.std(maxs), np.min(maxs), np.max(maxs)
print np.std(stds), np.min(stds), np.max(stds)
print np.std(skewness), np.min(skewness), np.max(skewness)
3.52909765249 -16.4219014729 -2.6956019378 3.29453132536 2.54650651811 17.5653445056 0.967315549334 0.966798559976 4.53656367456 0.135242790356 -0.412842893417 0.333415441978
ss = StandardScaler()
train_X = ss.fit_transform(X[train_index, :])
test_X = ss.transform(X[test_index, :])
train_y = y[train_index]
test_y = y[test_index]
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 0.03} 0.86125 0.9
pca = PCA(whiten=True)
pca.fit(X)
feature_importances = pd.DataFrame(pca.explained_variance_ratio_)
feature_importances.plot(kind = 'bar')
<matplotlib.axes.AxesSubplot at 0x9346c10>
pca = PCA(n_components=15, whiten=True)
pca_X = pca.fit_transform(X)
train_X = pca_X[train_index, :]
test_X = pca_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]
print train_X.shape, test_X.shape
print train_y.shape, test_y.shape
(800, 15) (200, 15) (800,) (200,)
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 0.1} 0.91625 0.93
def soft_absolute(u):
epsilon = 1e-8
return np.sqrt(u*u + epsilon)
def logistic(u):
return 1. / (1. + np.exp(-u))
class SparseFilter(BaseEstimator, TransformerMixin):
def __init__(self, n_features = 200, n_iterations = 300, activate=soft_absolute):
self.epsilon = 1e-8
self.n_features = n_features
self.n_iterations = n_iterations
self.activate = activate
def fit(self, X, y = None):
n_samples, n_dim = X.shape
W = np.random.randn(n_dim, self.n_features)
b = np.random.randn(self.n_features)
obj_fn = self.get_objective_fn(X)
self.W_, self.b_ = optimize.fmin_l_bfgs_b(obj_fn, (W, b),
iprint = 1,
maxfun = self.n_iterations)
return self
def get_objective_fn(self, X):
def _objective_fn(W, b):
Y = self.activate(np.dot(X, W) + b)
Y = Y / np.sqrt(np.sum(Y*Y, axis = 0) + self.epsilon)
Y = Y / np.sqrt(np.sum(Y*Y, axis = 1)[:, np.newaxis] + self.epsilon)
return np.sum(Y)
return _objective_fn
def transform(self, X):
Y = self.activate(np.dot(X, self.W_) + self.b_)
Y = Y / np.sqrt(np.sum(Y*Y, axis=0) + self.epsilon)
Y = Y / np.sqrt(np.sum(Y*Y, axis=1)[:, np.newaxis] + self.epsilon)
return Y
sf = SparseFilter(n_features=50, n_iterations=5000)
sf_X = sf.fit_transform(X)
sf_X = ss.fit_transform(sf_X)
train_X = sf_X[train_index, :]
train_y = y[train_index, :]
test_X = sf_X[test_index, :]
test_y = y[test_index, :]
print train_X.shape, test_X.shape
print train_y.shape, test_y.shape
(800, 50) (200, 50) (800,) (200,)
trees = ExtraTreesClassifier(n_estimators=100, max_features=50, n_jobs=-1, random_state=0)
trees.fit(train_X, train_y)
feature_importances = pd.DataFrame(trees.feature_importances_)
feature_importances.plot(kind='bar')
<matplotlib.axes.AxesSubplot at 0x7de04d0>
selected_features = np.where(feature_importances >= 0.01)[0]
train_X = train_X[:, selected_features]
test_X = test_X[:, selected_features]
print train_X.shape, test_X.shape
(800, 15) (200, 15)
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 0.1} 0.93375 0.905
gbt = GradientBoostingClassifier(random_state=0)
learning_rates = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(gbt, {'learning_rate': learning_rates},
n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
gbt = GradientBoostingClassifier(**gs.best_params_)
gbt.fit(train_X, train_y)
print gbt.score(test_X, test_y)
{'learning_rate': 0.1} 0.93625 0.88
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(n_estimators=100, )
max_features_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
gs = GridSearchCV(forest, {'max_features': max_features_choices},
n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
forest = RandomForestClassifier(n_estimators=100, **gs.best_params_)
forest.fit(train_X, train_y)
print forest.score(test_X, test_y)
{'max_features': 0.3} 0.935 0.885
from sklearn.tree import DecisionTreeClassifier
class TreeTransformer(BaseEstimator, TransformerMixin):
def __init__(self):
pass
def fit(self, X, y):
n_samples, n_features = X.shape
self.trees_ = []
for ifeat in xrange(n_features):
## IT IS VERY IMPORTANT TO SET max_depth = small value to avoid overfitting
self.trees_.append(DecisionTreeClassifier(max_depth=1))
self.trees_[ifeat].fit(X[:, ifeat].reshape((-1, 1)), y)
return self
def transform(self, X):
n_samples, n_features = X.shape
assert n_features == len(self.trees_)
return np.concatenate([self.trees_[ifeat].predict_proba(X[:, ifeat].reshape((-1, 1)))[:, :-1]
for ifeat in xrange(n_features)],
axis = 1)
tt = TreeTransformer()
train_y = y[train_index, :]
train_X = tt.fit_transform(X[train_index, :], train_y)
#train_X = ss.fit_transform(train_X)
test_y = y[test_index, :]
test_X = tt.transform(X[test_index, :])
#test_X = ss.transform(test_X)
print train_X.shape, train_y.shape
print test_X.shape, test_y.shape
(800, 40) (800,) (200, 40) (200,)
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 3.0} 0.84125 0.79
from sklearn.ensemble import RandomTreesEmbedding
rte = RandomTreesEmbedding(n_estimators=20, max_depth=2, n_jobs=-1)
embedded_X = rte.fit_transform(X)
print embedded_X.shape
(1000, 79)
embedded_X = embedded_X.toarray()
train_X = embedded_X[train_index, :]
test_X = embedded_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]
print train_X.shape, test_X.shape
print train_y.shape, test_y.shape
(800, 79) (200, 79) (800,) (200,)
## feature selection
trees = ExtraTreesClassifier(n_estimators=50, max_features=0.1, random_state=0, n_jobs=-1)
trees.fit(train_X, train_y)
pd.DataFrame(trees.feature_importances_).plot(kind = 'bar')
<matplotlib.axes.AxesSubplot at 0x90f1cd0>
selected_features = np.where(trees.feature_importances_ >= 0.001)[0]
train_X = ss.fit_transform(train_X[:, selected_features])
test_X = ss.transform(test_X[:, selected_features])
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 0.01} 0.6275 0.685
from sklearn.ensemble import RandomTreesEmbedding
rte = RandomTreesEmbedding(n_estimators=100, max_depth=3, n_jobs=-1)
embedded_X = rte.fit_transform(X)
print embedded_X.shape
(1000, 735)
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(embedded_X.toarray())
pd.DataFrame(pca.explained_variance_ratio_).plot(kind = 'bar')
<matplotlib.axes.AxesSubplot at 0x9d13b10>
pca = PCA(n_components=30)
pca_embedded_X = pca.fit_transform(embedded_X.toarray())
pca_embedded_X = ss.fit_transform(pca_embedded_X)
train_pca_embedded_X = pca_embedded_X[train_index, :]
test_pca_embedded_X = pca_embedded_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]
print train_pca_embedded_X.shape, test_pca_embedded_X.shape
print train_y.shape, test_y.shape
(800, 30) (200, 30) (800,) (200,)
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(**gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 0.01} 0.6275 0.685
for ylabel, c in zip(np.unique(y), ['r.', 'b+']):
pl.plot(sf_X[y==ylabel, 17], sf_X[y==ylabel, 12], c)
## UNSUPERVISED FEATURE LEARNING
## train a sparse filter on the whole data
sf = SparseFilter(n_features=50, n_iterations=1000)
sf_X = sf.fit_transform(X)
## train a PCA on the whole data
pca = PCA(n_components=15)
pca_X = pca.fit_transform(X)
## combine both featues
sf_pca_X = np.c_[sf_X, pca_X]
## standarize the features
ss = StandardScaler()
norm_sf_pca_X = ss.fit_transform(sf_pca_X)
print norm_sf_pca_X.shape
(1000, 65)
## train and test split
train_X = norm_sf_pca_X[train_index, :]
test_X = norm_sf_pca_X[test_index, :]
train_y = y[train_index]
test_y = y[test_index]
print train_X.shape, test_X.shape
print train_y.shape, test_y.shape
(800, 65) (200, 65) (800,) (200,)
## SVC model
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(probability=True, **gs.best_params_)
svc.fit(train_X, train_y)
print svc.score(test_X, test_y)
{'gamma': 0.03} 0.9275 0.92
## Extra Random Forest
forest = ExtraTreesClassifier(n_estimators=100, )
max_features_choices = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
gs = GridSearchCV(forest, {'max_features': max_features_choices},
n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(train_X, train_y)
print gs.best_params_
print gs.best_score_
forest = ExtraTreesClassifier(n_estimators=100, **gs.best_params_)
forest.fit(train_X, train_y)
print forest.score(test_X, test_y)
{'max_features': 0.1} 0.92625 0.935
pd.DataFrame(forest.feature_importances_).plot(kind = 'bar')
<matplotlib.axes.AxesSubplot at 0x8a0ca90>
yhat_prob = (svc.predict_proba(test_X)[:, 0] + forest.predict_proba(test_X)[:, 0]) / 2.
yhat = (yhat_prob < 0.5).astype(np.int)
print yhat.shape
print np.mean(yhat==test_y)
(200,) 0.93
selected_features = np.where(forest.feature_importances_ >= 0.01)[0]
selected_train_X = train_X[:, selected_features]
selected_test_X = test_X[:, selected_features]
## SVC model
svc = SVC()
gammas = [1e-4, 3e-4, 1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1, 1., 3., 10.]
gs = GridSearchCV(svc, {'gamma': gammas}, n_jobs=-1, cv = 5, scoring='accuracy')
gs.fit(selected_train_X, train_y)
print gs.best_params_
print gs.best_score_
svc = SVC(probability = True, **gs.best_params_)
svc.fit(selected_train_X, train_y)
print svc.score(selected_test_X, test_y)
{'gamma': 0.03} 0.93375 0.945
yhat_prob = (svc.predict_proba(selected_test_X)[:, 0] + forest.predict_proba(test_X)[:, 0]) / 2.
yhat = (yhat_prob < 0.5).astype(np.int)
print yhat.shape
print np.mean(yhat==test_y)
(200,) 0.93