## Project-local modules: ensembling helpers and feature extractors.
import ensemble
import features
from itertools import cycle
# IPython.parallel (pre-IPython-4; later split out as ipyparallel) supplies
# the cluster client used for parallel model training below.
from IPython.parallel import Client
client = Client()
# Python 2 print statement; len(client) is the number of connected engines.
print len(client)
4
import cPickle
# NOTE(review): unpickling is only safe for trusted input; this loads a
# locally generated dataset, so it is acceptable here.
X, y = cPickle.load(open('data/blackbox.pkl', 'rb'))
print X.shape, y.shape
(1000, 1875) (1000,)
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
## split data to train, validation
from sklearn.cross_validation import train_test_split
# 80/20 random split of the raw feature matrix.
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.2)
print train_X.shape, validation_X.shape
print train_y.shape, validation_y.shape
(800, 1875) (200, 1875) (800,) (200,)
## make new ensemble
# Remove any leftover folder from a previous run (IPython shell escape),
# then create a fresh ensemble directory under tmp/.
!rm -fR tmp/blackbox_raw_ensemble
ensemble_path = ensemble.new_ensemble('blackbox_raw_ensemble', 'tmp/')
print ensemble_path
/Users/lima/workspace/tutorials/ml-tutorials/tmp/blackbox_raw_ensemble
## persist data
# write_data(path, name, (X, y), meta) persists a named dataset plus metadata.
ensemble.write_data(ensemble_path, 'train_blackbox', (train_X, train_y), {'description': 'blackbox data training'})
ensemble.write_data(ensemble_path, 'validation_blackbox', (validation_X, validation_y), {'description': 'blackbox data validation'})
## confirm data writing
!ls tmp/blackbox_raw_ensemble/data
train_blackbox.pkl validation_blackbox.pkl train_blackbox.pkl_01.npy validation_blackbox.pkl_01.npy train_blackbox.pkl_02.npy validation_blackbox.pkl_02.npy
## config different models
from sklearn import svm
from sklearn import linear_model
from sklearn import tree

# Candidate models, keyed '<algorithm>_<hyper-parameter value>'.
models = {
    'sgd_0.1': linear_model.SGDClassifier(alpha=0.1),
    'svc_0.001': svm.SVC(probability=True, gamma=0.001),
    'svc_0.01': svm.SVC(probability=True, gamma=0.01),
    'svc_0.1': svm.SVC(probability=True, gamma=0.1),
    'sgd_0.0001': linear_model.SGDClassifier(alpha=0.0001),
    'sgd_0.001': linear_model.SGDClassifier(alpha=0.001),
    'sgd_0.01': linear_model.SGDClassifier(alpha=0.01),  # SUPER model for digits
    'sgd_0.05': linear_model.SGDClassifier(alpha=0.05),
    # BUGFIX: was alpha=0.05 — a copy-paste duplicate of 'sgd_0.05'; the key
    # promises alpha=0.15.
    'sgd_0.15': linear_model.SGDClassifier(alpha=0.15),
    'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0),
    'pac_0.1': linear_model.PassiveAggressiveClassifier(C=0.1),
    'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01),
    'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001),
    'tree_5': tree.DecisionTreeClassifier(max_depth=5),
    'tree_3': tree.DecisionTreeClassifier(max_depth=3),
    'tree_1': tree.DecisionTreeClassifier(max_depth=1),
}
## write models
# Metadata shared by every model in the raw-feature ensemble.
common_model_meta = dict(
    is_probabilistic=False,
    train_data='train_blackbox',
    validation_data='validation_blackbox',
)
# Persist each configured (but untrained) model under its name.
for name in models:
    ensemble.write_model(ensemble_path, name, models[name], model_meta=common_model_meta)
## confirm model writing
!ls tmp/blackbox_raw_ensemble/models
pac_0.001.pkl sgd_0.0001.pkl sgd_0.1.pkl svc_0.1.pkl pac_0.01.pkl sgd_0.001.pkl sgd_0.15.pkl tree_1.pkl pac_0.1.pkl sgd_0.01.pkl svc_0.001.pkl tree_3.pkl pac_1.0.pkl sgd_0.05.pkl svc_0.01.pkl tree_5.pkl
## train models in parallel
# cycle(['train_data']) pairs every model name with the literal meta key
# 'train_data', i.e. each model trains on its configured training set.
model_data_pairs = zip(models.keys(), cycle(['train_data']))
ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)
4/4 tasks finished after 16 s done
## construct ensemble model
from sklearn import metrics
# Greedy forward selection over the trained models: repeatedly add whichever
# model most improves the accuracy of the majority-class vote.
ge = ensemble.GreedyEnsemble(ensemble_path,
scorefn = metrics.accuracy_score,
votefn = ensemble.GreedyEnsemble.vote_major_class,
client = client)
ge.fit(models.keys(), verbose = True)
4/4 tasks finished after 0 s done checking model svc_0.01 improvement from 0.0 to 0.25 checking model tree_3 improvement from 0.25 to 0.25 checking model sgd_0.1 improvement from 0.25 to 0.255 checking model tree_5 improvement from 0.255 to 0.255 checking model sgd_0.01 improvement from 0.255 to 0.255 checking model pac_1.0 NO improvement from 0.255 to 0.25
GreedyEnsemble(client=<IPython.parallel.client.client.Client object at 0x113af81d0>, ensemble_path='/Users/lima/workspace/tutorials/ml-tutorials/tmp/blackbox_raw_ensemble', random_seed=0, scorefn=<function accuracy_score at 0x113af0488>, votefn=<function vote_major_class at 0x113ba6f50>)
## model performance
# Evaluate the fitted ensemble on both splits registered in the model metas.
print ge.score(data_type = 'train_data')
print ge.score(data_type = 'validation_data')
4/4 tasks finished after 0 s done 0.255
reload(ensemble)
reload(features)
<module 'features' from 'features.pyc'>
## create new ensemble folder
# Second experiment: ensemble over many derived (mixed) feature sub-datasets.
!rm -fR tmp/blackbox_mixed_ensemble/
ensemble_path = ensemble.new_ensemble('blackbox_mixed_ensemble', 'tmp/')
print ensemble_path
/home/ce/mali/tutorials/ml-tutorials/tmp/blackbox_mixed_ensemble
## from sklearn.cross_validation import train_test_split
n_samples, n_features = X.shape
## same train and test index for each sub dataset
# Split row *indices* rather than the data itself, so every derived
# sub-dataset below shares exactly the same train/test partition of rows.
train_index, test_index = train_test_split(range(n_samples), test_size = 0.2)
data_names = []
data_infor = []
## HEAVY IO ENGAGEMENT
## create different data from raw features
## data set of all rows with different columns (strided)
# Column subsets: disjoint strided windows, overlapping strided windows,
# and bootstrap-sampled feature subsets.
sub_features = features.strided_seqs(range(n_features), stride = 25, subsize = 25)
sub_features += features.strided_seqs(range(n_features), stride = 10, subsize = 50)
sub_features += features.bootstrap_seqs(range(n_features), n_iter = 500, subsize = 30)
# Name each subset by its first and last feature index.
data_name_index = [('subdata_%d_%d' % (sub_feature[0], sub_feature[-1]), sub_feature)
for sub_feature in sub_features]
for data_name, feat_index in data_name_index:
# features.patch selects the given rows (and optionally columns) — presumably
# a fancy-indexing helper; TODO confirm against features module.
train_X = features.patch(X, train_index, feat_index)
train_y = features.patch(y, train_index)
test_X = features.patch(X, test_index, feat_index)
test_y = features.patch(y, test_index)
train_name = 'train_' + data_name
test_name = 'test_' + data_name
data_names.append((train_name, test_name))
data_infor.append([train_name, (train_X, train_y), {}])
data_infor.append([test_name, (test_X, test_y), {}])
## create MORE features from tri-kmeans result
reload(features)
# Build triangle-kmeans features (10 clusters per bootstrap feature patch,
# computed in parallel on the cluster), then slice the transformed matrix
# into overlapping strided column subsets like the raw data above.
feat_patches = features.bootstrap_seqs(range(n_features), n_iter = 500, subsize = 30)
tri_kmeans = features.TriKmeansFeatures(n_clusters = 10, feat_patches = feat_patches, client = client)
tri_X = tri_kmeans.fit_transform(X)
sub_features = features.strided_seqs(range(tri_X.shape[1]), stride = 15, subsize=30)
data_name_index = [('tridata_%d_%d' % (sub_feature[0], sub_feature[-1]), sub_feature)
for sub_feature in sub_features]
for data_name, feat_index in data_name_index:
train_X = features.patch(tri_X, train_index, feat_index)
train_y = features.patch(y, train_index)
test_X = features.patch(tri_X, test_index, feat_index)
test_y = features.patch(y, test_index)
train_name = 'train_' + data_name
test_name = 'test_' + data_name
data_names.append((train_name, test_name))
data_infor.append([train_name, (train_X, train_y), {}])
data_infor.append([test_name, (test_X, test_y), {}])
24/24 tasks finished after 12 s done
# Persist all accumulated sub-datasets (raw + tri-kmeans) in one batch.
ensemble.batch_write_data(ensemble_path, data_infor)
print len(data_names)
1095
## configure different models
from sklearn import svm
from sklearn import linear_model
from sklearn import tree
# Candidates for the mixed ensemble: one SGD setting plus a sweep of tree
# depths. The SVC/PAC entries from the raw run are kept commented out
# (too slow / not useful here).
# NOTE(review): the commented 'sgd_0.15' line below carries over the
# alpha=0.05 copy-paste typo from the raw-ensemble model dict.
models = {
'sgd_0.1': linear_model.SGDClassifier(alpha = 0.1)
#, 'svc_0.001': svm.SVC(probability=True, gamma=0.001)
#, 'svc_0.01': svm.SVC(probability=True, gamma = 0.01)
#, 'svc_0.1': svm.SVC(probability=True, gamma=0.1)
#, 'sgd_0.0001': linear_model.SGDClassifier(alpha = 0.0001)
#, 'sgd_0.001': linear_model.SGDClassifier(alpha = 0.001)
#, 'sgd_0.01': linear_model.SGDClassifier(alpha = 0.01) # SUPER model for digits
#, 'sgd_0.05': linear_model.SGDClassifier(alpha = 0.05)
#, 'sgd_0.15': linear_model.SGDClassifier(alpha = 0.05)
#, 'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0)
#, 'pac_0.1': linear_model.PassiveAggressiveClassifier(C=0.1)
#, 'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01)
#, 'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001)
, 'tree_11': tree.DecisionTreeClassifier(max_depth=11)
, 'tree_10': tree.DecisionTreeClassifier(max_depth=10)
, 'tree_9': tree.DecisionTreeClassifier(max_depth=9)
, 'tree_8': tree.DecisionTreeClassifier(max_depth=8)
, 'tree_7': tree.DecisionTreeClassifier(max_depth=7)
, 'tree_6': tree.DecisionTreeClassifier(max_depth=6)
, 'tree_5': tree.DecisionTreeClassifier(max_depth=5)
, 'tree_4': tree.DecisionTreeClassifier(max_depth=4)
, 'tree_3': tree.DecisionTreeClassifier(max_depth=3)
, 'tree_2': tree.DecisionTreeClassifier(max_depth=2)
, 'tree_1': tree.DecisionTreeClassifier(max_depth=1)
}
# One metadata dict per derived dataset pair.
# NOTE(review): each sub-dataset's 'test_' split is registered here as
# 'validation_data' (greedy selection optimizes on it); there is no separate
# held-out test set ('test_data': None).
model_metas = [{
'is_probabilistic': False
, 'train_data': train_name
, 'validation_data': validation_name
, 'test_data': None
} for (train_name, validation_name) in data_names]
print len(model_metas)
1095
n_candidates = len(models)
from random import shuffle
## write model configurations
model_names = []
model_infor = []
for model_meta in model_metas:
candidates = models.items()
shuffle(candidates)
# NOTE(review): n_candidates == len(models), so this slice keeps EVERY model
# and shuffle only randomizes their order — despite the "randomly select"
# comment, nothing is subsampled. Lower n_candidates to truly subsample.
candidates = candidates[:n_candidates]
## randomly select
for (template_name, model) in candidates:
# Model name encodes template + the train/validation dataset pair it targets.
model_name = template_name + "_" + model_meta["train_data"] + "_" + model_meta['validation_data']
model_names.append(model_name)
# NOTE(review): the same estimator instance is shared across many entries —
# presumably batch_write_model pickles a copy per entry; TODO confirm.
model_infor.append([model_name, model, model_meta])
ensemble.batch_write_model(ensemble_path, model_infor)
## train models in parallel
# As before: every model trains on its own configured 'train_data' split.
model_data_pairs = zip(model_names, cycle(['train_data']))
ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)
24/24 tasks finished after 238 s done
## construct ensemble model
# Rebind ensemble_path (relative form) before building the second ensemble.
ensemble_path = 'tmp/blackbox_mixed_ensemble/'
from sklearn.metrics import accuracy_score
ge = ensemble.GreedyEnsemble(ensemble_path,
scorefn = accuracy_score,
votefn = ensemble.GreedyEnsemble.vote_major_class,
client = client)
ge.fit(model_names, verbose=True)
24/24 tasks finished after 444 s done checking model tree_6_train_subdata_632_1701_test_subdata_632_1701 improvement from 0.0 to 0.27 checking model tree_10_train_subdata_860_909_test_subdata_860_909 improvement from 0.27 to 0.295 checking model tree_9_train_subdata_1180_1229_test_subdata_1180_1229 improvement from 0.295 to 0.335 checking model tree_10_train_subdata_1690_1739_test_subdata_1690_1739 improvement from 0.335 to 0.34 checking model tree_9_train_subdata_1840_14_test_subdata_1840_14 improvement from 0.34 to 0.37 checking model tree_8_train_subdata_1720_1769_test_subdata_1720_1769 improvement from 0.37 to 0.39 checking model tree_10_train_subdata_1790_1839_test_subdata_1790_1839 improvement from 0.39 to 0.41 checking model tree_11_train_subdata_347_1620_test_subdata_347_1620 improvement from 0.41 to 0.415 checking model sgd_0.1_train_tridata_3630_3659_test_tridata_3630_3659 improvement from 0.415 to 0.42 checking model sgd_0.1_train_tridata_3390_3419_test_tridata_3390_3419 NO improvement from 0.42 to 0.415
GreedyEnsemble(client=<IPython.parallel.client.client.Client object at 0x4a48ed0>, ensemble_path='tmp/blackbox_mixed_ensemble/', random_seed=0, scorefn=<function accuracy_score at 0x58269b0>, votefn=<function vote_major_class at 0x8e3d0578>)
## performance
# Training accuracy of the selected mixed ensemble.
print ge.score(data_type = 'train_data')
9/9 tasks finished after 0 s done 0.84875
print ge.score(data_type = 'validation_data')
9/9 tasks finished after 0 s done 0.42
reload(ensemble)
# Enumerate every dataset persisted in the mixed-ensemble folder.
data_names = ensemble.all_data_names(ensemble_path)
print len(data_names)
1656
reload(ensemble)
# Enumerate every model configuration persisted in the mixed-ensemble folder.
model_names = ensemble.all_model_names(ensemble_path)
print len(model_names)
9936