## IPython (Python 2) session: farm many small models over the "blackbox"
## dataset and combine them with a greedy ensemble, using IPython.parallel.
import ensemble
import features
## pick up any local edits to the project modules without restarting the kernel
reload(ensemble)
reload(features)
from itertools import cycle
import numpy as np
import random
from IPython.parallel import Client
## connect to the running IPython.parallel cluster of worker engines
client = Client()
## number of available engines -- NOTE(review): assumes a cluster was started
## beforehand (e.g. via ipcluster); confirm before running
print len(client)
4
import cPickle
X, y = cPickle.load(open('data/blackbox.pkl', 'rb'))
print X.shape, y.shape
print np.unique(y)
(1000, 1875) (1000,) [1 2 3 4 5 6 7 8 9]
## clear and create ensemble
## IPython shell magic: wipe any previous run's ensemble directory
!rm -fR tmp/blackbox_ensemble_iii/
## create a fresh on-disk ensemble store and remember its absolute path
ensemble_path = ensemble.new_ensemble('blackbox_ensemble_iii', 'tmp/')
print ensemble_path
/Users/lima/workspace/tutorials/ml-tutorials/tmp/blackbox_ensemble_iii
*FEATURE and DATA ENGINEERING*
## split data into train and validation
## hold out 20% of the row indices as a validation set; everything downstream
## works off these shared index lists
from sklearn.cross_validation import train_test_split
n_samples, n_features = X.shape
train_index, test_index = train_test_split(range(n_samples), test_size = 0.2)
## accumulators shared by the feature-engineering sections below:
## data_records collects [name, (X, y), meta] entries to write to the store,
## data_names collects (train_name, test_name) pairs linking the two splits
data_records, data_names = [], []
## kernel approximation subpatches of original features:
## draw 100 random 100-column patches of the raw features, map each through a
## degree-2 polynomial kernel approximation (computed in parallel on the
## cluster), and register a train/test dataset pair per approximated patch
feature_patches = features.bootstrap_seqs(range(n_features), n_iter = 100, subsize = 100)
data_name_prefix = 'ka_%d_%d'
Xs = [features.patch(X, None, feats) for feats in feature_patches]
ka_Xs = features.kernel_approximation(Xs, client, kernel = 'polynomial', degree = 2, n_components=100)
## the label vectors do not depend on the feature patch -- compute them once
## instead of once per loop iteration as the original did
train_y = features.patch(y, train_index)
test_y = features.patch(y, test_index)
for feats, ka_X in zip(feature_patches, ka_Xs):
    train_X = features.patch(ka_X, train_index, None)
    test_X = features.patch(ka_X, test_index, None)
    ## dataset names encode the patch's first/last feature index, e.g. ka_12_987
    train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])
    test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])
    data_records.append([train_name, (train_X, train_y), {}])
    data_records.append([test_name, (test_X, test_y), {}])
    data_names.append((train_name, test_name))
## bootstrapped subpatches of original features: 500 random 15-column patches
## taken directly from the raw feature matrix, one train/test pair per patch
feature_patches = features.bootstrap_seqs(range(n_features), n_iter = 500, subsize = 15)
data_name_prefix = 'bs_%d_%d'
## labels are patch-invariant -- hoisted out of the loop (the original
## recomputed them on every iteration)
train_y = features.patch(y, train_index)
test_y = features.patch(y, test_index)
for feats in feature_patches:
    train_X = features.patch(X, train_index, feats)
    test_X = features.patch(X, test_index, feats)
    ## names encode the patch's first/last feature index, e.g. bs_254_412
    train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])
    test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])
    data_records.append([train_name, (train_X, train_y), {}])
    data_records.append([test_name, (test_X, test_y), {}])
    data_names.append((train_name, test_name))
## tri-kmeans features: cluster many random 20-column patches (20 centroids
## each, fit in parallel) and use the transformed cluster features, sliced
## into overlapping 30-column windows with stride 15
reload(features)
tri_kmeans = features.TriKmeansFeatures(n_clusters = 20,
                                        feat_patches = features.bootstrap_seqs(range(n_features),
                                                                               n_iter = 500, subsize=20),
                                        client = client)
tri_X = tri_kmeans.fit_transform(X)
feature_patches = features.strided_seqs(range(tri_X.shape[1]), stride = 15, subsize=30)
data_name_prefix = 'tri_%d_%d'
## labels are patch-invariant -- compute them once outside the loop
train_y = features.patch(y, train_index)
test_y = features.patch(y, test_index)
for feats in feature_patches:
    train_X = features.patch(tri_X, train_index, feats)
    test_X = features.patch(tri_X, test_index, feats)
    ## names encode the window's first/last tri-feature index, e.g. tri_4080_4109
    train_name = 'train_' + data_name_prefix % (feats[0], feats[-1])
    test_name = 'test_' + data_name_prefix % (feats[0], feats[-1])
    data_records.append([train_name, (train_X, train_y), {}])
    data_records.append([test_name, (test_X, test_y), {}])
    data_names.append((train_name, test_name))
24/24 tasks finished after 14 s done
## 2nd order features
## TODO
## Write data in batch
## persist every accumulated [name, (X, y), meta] record into the on-disk store
ensemble.batch_write_data(ensemble_path, data_records)
## each data_names entry is one (train_name, test_name) dataset pair
print len(data_names)
1266
*FARM MODELS*
## different model configurations
from sklearn import svm
from sklearn import linear_model
from sklearn import tree
from sklearn.grid_search import IterGrid
## tree models: one decision tree per (criterion, max_depth) combination,
## named e.g. tree_gini_12; models holds (name, unfitted estimator) pairs
tree_params = IterGrid({'criterion': ['gini', 'entropy'], 'max_depth': range(5, 16)})
models = [('tree_%s_%d' % (param['criterion'], param['max_depth']),
           tree.DecisionTreeClassifier(**param))
          for param in tree_params]
"""
## svc models - slow to train and evaluate, no obvious accuracy improvement
svc_params = IterGrid({'C': np.logspace(1, 5, 5),
'gamma': np.logspace(-5, 2, 8),
'probability': [True]})
for param in svc_params:
model_name = 'svc_%g_%g' % (param['C'], param['gamma'])
models.append((model_name, svm.SVC(**param)))
"""
## linear models
## NOTE(review): disabled via a bare string literal (no-op); the session echo
## of this string's repr appears in the output line below it
"""
sgd_params = IterGrid({'penalty': ['l1', 'l2', 'elasticnet'],
'alpha': np.logspace(-5, 2, 8)})
for param in sgd_params:
model_name = 'sgd_%s_%g' % (param['penalty'], param['alpha'])
models.append((model_name, linear_model.SGDClassifier(**param)))
"""
"\nsgd_params = IterGrid({'penalty': ['l1', 'l2', 'elasticnet'], \n 'alpha': np.logspace(-5, 2, 8)})\nfor param in sgd_params:\n model_name = 'sgd_%s_%g' % (param['penalty'], param['alpha'])\n models.append((model_name, linear_model.SGDClassifier(**param)))\n"
## link models with data: pair every model configuration with every
## (train, validation) dataset pair -- each pairing becomes one candidate
## ensemble member, named model__traindata__validationdata
model_names = []
model_records = []
for model_name, model in models:
    for train_data, validation_data in data_names:
        meta = {'is_probabilistic': False,
                'train_data': train_data,
                'validation_data': validation_data,
                'test_data': None}
        combined_name = '__'.join([model_name, train_data, validation_data])
        model_names.append(combined_name)
        ## NOTE: the same estimator object is shared across its pairings
        model_records.append([combined_name, model, meta])
## write models into ensemble
print len(model_names)
selected_models = range(len(model_names))
from random import shuffle
shuffle(selected_models)
selected_models = selected_models[:10000]
model_names = np.array(model_names)[selected_models]
model_records = np.array(model_records)[selected_models]
print len(model_names)
ensemble.batch_write_model(ensemble_path, model_records)
27852 10000
## train models in parallel: every selected model is trained on its
## 'train_data' split, fanned out across the cluster engines
model_data_pairs = [(name, 'train_data') for name in model_names]
ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)
24/24 tasks finished after 186 s done
## construct the greedy ensemble
reload(ensemble)
## DISCRETE CLASSIFICATION
## members vote with hard labels and are scored on label accuracy; fit() adds
## models one at a time, keeping each only if the validation score improves
from sklearn.metrics import accuracy_score
ge = ensemble.GreedyEnsemble(ensemble_path,
scorefn = ensemble.GreedyEnsemble.score_label_classification,
votefn = ensemble.GreedyEnsemble.vote_major_class,
client = client)
"""
## PROBALISTIC CLASSIFICATION
ge = ensemble.GreedyEnsemble(ensemble_path,
scorefn = ensemble.GreedyEnsemble.score_label_prob_classification,
votefn = ensemble.GreedyEnsemble.vote_average,
client = client)
"""
ge.fit(model_names, verbose = True)
24/24 tasks finished after 266 s done checking model tree_gini_12__train_ka_1404_956__test_ka_1404_956 improvement from 0.0 to 0.28 checking model tree_gini_12__train_bs_254_412__test_bs_254_412 improvement from 0.28 to 0.32 checking model tree_entropy_10__train_tri_4875_4904__test_tri_4875_4904 improvement from 0.32 to 0.34 checking model tree_entropy_10__train_bs_676_867__test_bs_676_867 improvement from 0.34 to 0.355 checking model tree_entropy_14__train_tri_5415_5444__test_tri_5415_5444 improvement from 0.355 to 0.375 checking model tree_gini_10__train_bs_1597_1597__test_bs_1597_1597 improvement from 0.375 to 0.38 checking model tree_gini_13__train_ka_829_336__test_ka_829_336 improvement from 0.38 to 0.395 checking model tree_gini_11__train_bs_328_285__test_bs_328_285 improvement from 0.395 to 0.405 checking model tree_entropy_8__train_bs_1161_617__test_bs_1161_617 improvement from 0.405 to 0.42 checking model tree_gini_9__train_bs_1766_1870__test_bs_1766_1870 improvement from 0.42 to 0.44 checking model tree_entropy_14__train_ka_80_324__test_ka_80_324 improvement from 0.44 to 0.445 checking model tree_entropy_11__train_bs_335_560__test_bs_335_560 improvement from 0.445 to 0.445 checking model tree_gini_15__train_ka_1542_1170__test_ka_1542_1170 improvement from 0.445 to 0.45 checking model tree_gini_12__train_bs_780_780__test_bs_780_780 improvement from 0.45 to 0.45 checking model tree_entropy_11__train_bs_1099_1186__test_bs_1099_1186 improvement from 0.45 to 0.455 checking model tree_gini_13__train_ka_1762_1451__test_ka_1762_1451 improvement from 0.455 to 0.455 checking model tree_entropy_12__train_bs_371_599__test_bs_371_599 improvement from 0.455 to 0.47 checking model tree_entropy_7__train_bs_549_321__test_bs_549_321 improvement from 0.47 to 0.475 checking model tree_entropy_13__train_tri_4080_4109__test_tri_4080_4109 improvement from 0.475 to 0.475 checking model tree_gini_13__train_ka_72_1838__test_ka_72_1838 improvement from 0.475 to 0.48 checking model 
tree_gini_11__train_ka_355_110__test_ka_355_110 improvement from 0.48 to 0.495 checking model tree_gini_15__train_bs_994_1245__test_bs_994_1245 NO improvement from 0.495 to 0.49
GreedyEnsemble(client=<IPython.parallel.client.client.Client object at 0xb1a37390>, ensemble_path='/home/ce/mali/tutorials/ml-tutorials/tmp/blackbox_ensemble_iii', random_seed=0, scorefn=<function score_label_classification at 0x6a3a2a8>, votefn=<function vote_major_class at 0x6a3a1b8>)
## performance checking on training data
print ge.score(data_type = 'train_data')
21/21 tasks finished after 0 s done 0.9925
## performance checking on validation_data
## NOTE(review): the large train/validation gap in the echoed scores
## suggests heavy overfitting of the selected ensemble
print ge.score(data_type = 'validation_data')
21/21 tasks finished after 0 s done 0.495
## write out ensemble
## OVERWRITE the previous solution - the results showed that
## merging several ensembles in a naive way is NOT really useful practice
reload(ensemble)
optimal_ensemble_path = 'tmp/blackbox_ensemble_iii_optimal'
## copy only the selected members (ge.ensemble_) and their data to a new store
ensemble.copy_ensemble(ensemble_path, optimal_ensemble_path, ge.ensemble_, overwrite=True)
## IPython shell magic: show the copied ensemble's directory layout
!ls tmp/blackbox_ensemble_iii_optimal/
data data.json models models.json
print 'current number of models in the optimal ensemble',len(ensemble.all_model_names(optimal_ensemble_path))
print ensemble.all_model_names(optimal_ensemble_path)
for model_name in ensemble.all_model_names(optimal_ensemble_path):
ensemble.update_model_record(optimal_ensemble_path, model_name, {'is_probabilistic': False})
current number of models in the optimal ensemble 21 [u'tree_gini_13__train_ka_1762_1451__test_ka_1762_1451', u'tree_gini_12__train_ka_1404_956__test_ka_1404_956', u'tree_gini_15__train_ka_1542_1170__test_ka_1542_1170', u'tree_entropy_11__train_bs_1099_1186__test_bs_1099_1186', u'tree_entropy_10__train_tri_4875_4904__test_tri_4875_4904', u'tree_entropy_13__train_tri_4080_4109__test_tri_4080_4109', u'tree_gini_11__train_ka_355_110__test_ka_355_110', u'tree_gini_10__train_bs_1597_1597__test_bs_1597_1597', u'tree_gini_13__train_ka_72_1838__test_ka_72_1838', u'tree_gini_13__train_ka_829_336__test_ka_829_336', u'tree_gini_12__train_bs_780_780__test_bs_780_780', u'tree_gini_12__train_bs_254_412__test_bs_254_412', u'tree_entropy_8__train_bs_1161_617__test_bs_1161_617', u'tree_entropy_14__train_tri_5415_5444__test_tri_5415_5444', u'tree_entropy_11__train_bs_335_560__test_bs_335_560', u'tree_gini_9__train_bs_1766_1870__test_bs_1766_1870', u'tree_entropy_12__train_bs_371_599__test_bs_371_599', u'tree_entropy_10__train_bs_676_867__test_bs_676_867', u'tree_entropy_14__train_ka_80_324__test_ka_80_324', u'tree_entropy_7__train_bs_549_321__test_bs_549_321', u'tree_gini_11__train_bs_328_285__test_bs_328_285']
## reconstruct the ensemble model from the copied-out optimal store;
## membership is taken as-is (every model in the store) instead of re-fitting
gee = ensemble.GreedyEnsemble(optimal_ensemble_path,
scorefn=accuracy_score,
votefn=ensemble.GreedyEnsemble.vote_major_class,
client = client)
## re-fitting is deliberately skipped; the stored member list IS the ensemble
#gee.fit(ensemble.all_model_names(optimal_ensemble_path), verbose = True)
gee.ensemble_ = ensemble.all_model_names(optimal_ensemble_path)
## scores match the original ensemble's, confirming the copy is faithful
print gee.score('train_data')
21/21 tasks finished after 0 s done 0.9925
print gee.score('validation_data')
21/21 tasks finished after 0 s done 0.495