## TEST greedy ensemble with datasets such as digits and blackbox
import ensemble
from itertools import cycle
# Re-execute the already-imported ensemble module so that edits made to its
# source since the first import are picked up in this interactive session.
reload(ensemble)
<module 'ensemble' from 'ensemble.pyc'>
from IPython.parallel import Client
# Connect to the IPython.parallel cluster using the client's default settings.
client = Client()
# Sanity check on the connection; presumably this is the number of engines
# available to the client -- TODO confirm.
print len(client)
16
## load data
import cPickle
X, y = cPickle.load(open('data/digits.pkl', 'rb'))
print X.shape, y.shape
(42000, 784) (42000,)
## split train and test
from sklearn.cross_validation import train_test_split
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size = 0.2)
train_X, test_X, train_y, test_y = train_test_split(train_X, train_y, test_size = 0.3)
print train_X.shape, validation_X.shape, test_X.shape
print train_y.shape, validation_y.shape, test_y.shape
(23520, 784) (8400, 784) (10080, 784) (23520,) (8400,) (10080,)
## make new ensemble
# IPython shell escape: delete any ensemble directory left from a previous run.
!rm -fR tmp/digits_ensemble/
# Set up the ensemble named 'digits_ensemble' under tmp/ (presumably this
# creates the directory -- see ensemble.new_ensemble).  The returned path is
# passed to every later ensemble.* call.
ensemble_path = ensemble.new_ensemble('digits_ensemble', 'tmp/')
print ensemble_path
/home/ce/mali/tutorials/ml-tutorials/tmp/digits_ensemble
## persist data
# Persist each split under its own name together with a short description.
# The names used here are the ones referenced later by the model metadata.
splits_to_store = [
    ('train_digits', (train_X, train_y), 'training for digits'),
    ('validation_digits', (validation_X, validation_y), 'validation for digits'),
    ('test_digits', (test_X, test_y), 'testing for digits'),
]
for data_name, data_pair, description in splits_to_store:
    ensemble.write_data(ensemble_path, data_name, data_pair,
                        {'description': description})
## confirm writing of data
# IPython shell escape: list the ensemble's data directory.  Each dataset is
# expected to show up as a .pkl file plus companion .npy files.
!ls tmp/digits_ensemble/data
test_digits.pkl train_digits.pkl validation_digits.pkl test_digits.pkl_01.npy train_digits.pkl_01.npy validation_digits.pkl_01.npy test_digits.pkl_02.npy train_digits.pkl_02.npy validation_digits.pkl_02.npy
## config different models
from sklearn import svm
from sklearn import linear_model
from sklearn import tree
# Candidate base models for the ensemble, keyed by the name each one is stored
# under.  By convention the key encodes the model family and its main
# hyper-parameter: alpha for SGD, C for passive-aggressive, max_depth for
# decision trees (gamma for the disabled SVC entries).
# Commented-out entries are alternatives that are currently switched off.
models = {
    'sgd_0.1': linear_model.SGDClassifier(alpha=0.1),
    # 'svc_0.001': svm.SVC(probability=True, gamma=0.001),
    # 'svc_0.01': svm.SVC(probability=True, gamma=0.01),
    # 'svc_0.1': svm.SVC(probability=True, gamma=0.1),
    # 'sgd_0.0001': linear_model.SGDClassifier(alpha=0.0001),
    'sgd_0.001': linear_model.SGDClassifier(alpha=0.001),
    # 'sgd_0.01': linear_model.SGDClassifier(alpha=0.01),  # SUPER model for digits
    'sgd_0.05': linear_model.SGDClassifier(alpha=0.05),
    # Fixed: this entry was configured with alpha=0.05, which made it an exact
    # duplicate of 'sgd_0.05' despite its name.
    'sgd_0.15': linear_model.SGDClassifier(alpha=0.15),
    'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0),
    'pac_0.1': linear_model.PassiveAggressiveClassifier(C=0.1),
    'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01),
    'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001),
    'tree_5': tree.DecisionTreeClassifier(max_depth=5),
    'tree_3': tree.DecisionTreeClassifier(max_depth=3),
    'tree_1': tree.DecisionTreeClassifier(max_depth=1),
}
# Metadata stored alongside every model: whether it is flagged as
# probabilistic, and the names of the persisted datasets to use for each role.
model_meta = {
    'is_probabilistic': False,
    'train_data': 'train_digits',
    'validation_data': 'validation_digits',
    'test_data': 'test_digits',
}
# Store every configured estimator in the ensemble under its dictionary key,
# attaching the shared metadata to each one.
for model_name, model in models.iteritems():
    ensemble.write_model(ensemble_path, model_name, model,
                         model_meta=model_meta)
## check if models are written
# IPython shell escape: list the model files on disk ...
!ls tmp/digits_ensemble/models
# ... and print the configured model names, to compare against that listing.
print models.keys()
pac_0.001.pkl pac_0.1.pkl sgd_0.001.pkl sgd_0.15.pkl tree_1.pkl tree_5.pkl pac_0.01.pkl pac_1.0.pkl sgd_0.05.pkl sgd_0.1.pkl tree_3.pkl ['tree_3', 'tree_1', 'tree_5', 'sgd_0.1', 'pac_1.0', 'pac_0.001', 'pac_0.1', 'sgd_0.15', 'sgd_0.05', 'pac_0.01', 'sgd_0.001']
## train models in sequential mode
model_names = models.keys()#['svc_0.1']#
print model_names
import time
for model_name in model_names:
print 'training model', model_name
tic = time.time()
ensemble.train_model(ensemble_path, model_name, data_type='train_data')
print 'time elapsed', time.time() - tic, 'seconds'
['tree_3', 'tree_1', 'tree_5', 'sgd_0.1', 'pac_1.0', 'pac_0.001', 'pac_0.1', 'sgd_0.15', 'sgd_0.05', 'pac_0.01', 'sgd_0.001'] training model tree_3 time elapsed 3.20642900467 seconds training model tree_1 time elapsed 1.70039081573 seconds training model tree_5 time elapsed 6.68565106392 seconds training model sgd_0.1 time elapsed 1.97066402435 seconds training model pac_1.0 time elapsed 3.16873908043 seconds training model pac_0.001 time elapsed 3.16739988327 seconds training model pac_0.1 time elapsed 3.16902804375 seconds training model sgd_0.15 time elapsed 1.94980216026 seconds training model sgd_0.05 time elapsed 1.94951891899 seconds training model pac_0.01 time elapsed 3.17199516296 seconds training model sgd_0.001 time elapsed 1.94905114174 seconds
/usr/lib/python2.7/dist-packages/numpy/lib/utils.py:1132: DeprecationWarning: The compiler package is deprecated and removed in Python 3.x. import compiler
## parallel train models on data
# Build one (model name, data type) pair per model, all using 'train_data'.
# To try just a few models, slice the names first, e.g. models.keys()[:3].
model_data_pairs = zip(model_names, ['train_data'] * len(model_names))
# Hand the pairs to the ensemble module together with the parallel client.
ensemble.paralle_train_models(ensemble_path, model_data_pairs, client)
## construct ensemble model
from sklearn import metrics
# Build the greedy ensemble: candidates are scored with accuracy and member
# predictions are combined with the vote_major_class vote function.
ge = ensemble.GreedyEnsemble(
    ensemble_path,
    client=client,
    scorefn=metrics.accuracy_score,
    votefn=ensemble.GreedyEnsemble.vote_major_class,
)
# Select ensemble members from the trained models; verbose=True reports the
# score change as each candidate is checked.
ge.fit(model_names, verbose=True)
checking model sgd_0.15 improvement from 0.0 to 0.866071428571 checking model sgd_0.05 improvement from 0.866071428571 to 0.866071428571 checking model tree_3 improvement from 0.866071428571 to 0.866071428571 checking model sgd_0.001 NO improvement from 0.866071428571 to 0.865357142857
GreedyEnsemble(client=<IPython.parallel.client.client.Client object at 0x33a3c50>, ensemble_path='/home/ce/mali/tutorials/ml-tutorials/tmp/digits_ensemble', random_seed=0, scorefn=<function accuracy_score at 0x393a320>, votefn=<function vote_major_class at 0x33aa5f0>)
print ge.score(data_type='train_data')
print ge.score(data_type='validation_data')
print ge.score(data_type='test_data')
print ge.ensemble_
0.88231292517 0.866071428571 0.866369047619 ['sgd_0.15', 'sgd_0.05', 'tree_3']
## try adding all models to the ensemble and see the result
ge.ensemble_ = model_names
print ge.score(data_type='train_data')
print ge.score(data_type='validation_data')
print ge.score(data_type='test_data')
print ge.ensemble_
0.883843537415 0.86369047619 0.863194444444 ['tree_3', 'tree_1', 'tree_5', 'sgd_0.1', 'pac_1.0', 'pac_0.001', 'pac_0.1', 'sgd_0.15', 'sgd_0.05', 'pac_0.01', 'sgd_0.001']
## try individual models and see the results
for model_name in model_names:
print 'for model', model_name
ge.ensemble_ = [model_name]
print ge.score(data_type='train_data')
print ge.score(data_type='validation_data')
print ge.score(data_type='test_data')
print '--------------------------'
print ''
for model tree_3 0.462414965986 0.455595238095 0.453273809524 -------------------------- for model tree_1 0.193239795918 0.194761904762 0.193055555556 -------------------------- for model tree_5 0.690476190476 0.687619047619 0.677281746032 -------------------------- for model sgd_0.1 0.878571428571 0.858571428571 0.858630952381 -------------------------- for model pac_1.0 0.874149659864 0.853928571429 0.852777777778 -------------------------- for model pac_0.001 0.874149659864 0.853928571429 0.852777777778 -------------------------- for model pac_0.1 0.874149659864 0.853928571429 0.852777777778 -------------------------- for model sgd_0.15 0.88231292517 0.866071428571 0.866369047619 -------------------------- for model sgd_0.05 0.88231292517 0.866071428571 0.866369047619 -------------------------- for model pac_0.01 0.874149659864 0.853928571429 0.852777777778 -------------------------- for model sgd_0.001 0.86994047619 0.854404761905 0.850396825397 --------------------------