import ensemble
import features
from itertools import cycle
from IPython.parallel import Client

client = Client()
print len(client)

import cPickle
X, y = cPickle.load(open('data/blackbox.pkl', 'rb'))
print X.shape, y.shape

reload(ensemble)

## split data into train and validation sets
from sklearn.cross_validation import train_test_split
train_X, validation_X, train_y, validation_y = train_test_split(X, y, test_size=0.2)
print train_X.shape, validation_X.shape
print train_y.shape, validation_y.shape

## make a new ensemble
!rm -fR tmp/blackbox_raw_ensemble
ensemble_path = ensemble.new_ensemble('blackbox_raw_ensemble', 'tmp/')
print ensemble_path

## persist data
ensemble.write_data(ensemble_path, 'train_blackbox', (train_X, train_y),
                    {'description': 'blackbox data training'})
ensemble.write_data(ensemble_path, 'validation_blackbox', (validation_X, validation_y),
                    {'description': 'blackbox data validation'})

## confirm data writing
!ls tmp/blackbox_raw_ensemble/data

## configure different models
from sklearn import svm
from sklearn import linear_model
from sklearn import tree

models = {
      'sgd_0.1': linear_model.SGDClassifier(alpha=0.1)
    , 'svc_0.001': svm.SVC(probability=True, gamma=0.001)
    , 'svc_0.01': svm.SVC(probability=True, gamma=0.01)
    , 'svc_0.1': svm.SVC(probability=True, gamma=0.1)
    , 'sgd_0.0001': linear_model.SGDClassifier(alpha=0.0001)
    , 'sgd_0.001': linear_model.SGDClassifier(alpha=0.001)
    , 'sgd_0.01': linear_model.SGDClassifier(alpha=0.01)  # SUPER model for digits
    , 'sgd_0.05': linear_model.SGDClassifier(alpha=0.05)
    , 'sgd_0.15': linear_model.SGDClassifier(alpha=0.15)
    , 'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0)
    , 'pac_0.1': linear_model.PassiveAggressiveClassifier(C=0.1)
    , 'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01)
    , 'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001)
    , 'tree_5': tree.DecisionTreeClassifier(max_depth=5)
    , 'tree_3': tree.DecisionTreeClassifier(max_depth=3)
    , 'tree_1': tree.DecisionTreeClassifier(max_depth=1)
}

## write models
common_model_meta = {
      'is_probabilistic': False
    , 'train_data': 'train_blackbox'
    , 'validation_data': 'validation_blackbox'
}
for (model_name, model) in models.items():
    ensemble.write_model(ensemble_path, model_name, model, model_meta=common_model_meta)

## confirm model writing
!ls tmp/blackbox_raw_ensemble/models

## train models in parallel
model_data_pairs = zip(models.keys(), cycle(['train_data']))
ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)

## construct the ensemble model
from sklearn import metrics
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn=metrics.accuracy_score,
                             votefn=ensemble.GreedyEnsemble.vote_major_class,
                             client=client)
ge.fit(models.keys(), verbose=True)

## model performance
print ge.score(data_type='train_data')
print ge.score(data_type='validation_data')

## try adding all models to the ensemble and compare
ge.ensemble_ = models.keys()
print ge.score(data_type='train_data')
print ge.score(data_type='validation_data')
print ge.ensemble_

## try individual models and inspect the results
for model_name in models.keys():
    print 'for model', model_name
    ge.ensemble_ = [model_name]
    print ge.score(data_type='train_data')
    print ge.score(data_type='validation_data')
    print '--------------------------'
    print ''
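## ASIDE (added sketch, not from the original notebook): GreedyEnsemble and its
## vote_major_class voting rule come from the local `ensemble` module, whose source is
## not shown here. The standalone helpers below only illustrate the general idea such a
## greedy, majority-vote ensemble presumably follows: add, one model at a time, the
## model whose inclusion most improves the validation score. All names and details here
## are assumptions, not the module's actual API.
import numpy as np
from scipy.stats import mode
from sklearn.metrics import accuracy_score

def majority_vote(prediction_rows):
    # prediction_rows: (n_models, n_samples) array of predicted class labels
    return mode(np.asarray(prediction_rows), axis=0)[0].ravel()

def greedy_select(fitted_models, X_val, y_val):
    # fitted_models: dict of name -> already-fitted classifier
    preds = dict((name, clf.predict(X_val)) for name, clf in fitted_models.items())
    selected, best_score = [], 0.0
    improved = True
    while improved:
        improved = False
        for name in fitted_models:
            if name in selected:
                continue
            score = accuracy_score(y_val, majority_vote([preds[n] for n in selected + [name]]))
            if score > best_score:
                best_score, best_name, improved = score, name, True
        if improved:
            selected.append(best_name)
    return selected, best_score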
reload(ensemble)
reload(features)

## create a new ensemble folder
!rm -fR tmp/blackbox_mixed_ensemble/
ensemble_path = ensemble.new_ensemble('blackbox_mixed_ensemble', 'tmp/')
print ensemble_path

from sklearn.cross_validation import train_test_split
n_samples, n_features = X.shape

## use the same train and test indices for every sub-dataset
train_index, test_index = train_test_split(range(n_samples), test_size=0.2)

data_names = []
data_infor = []

## HEAVY IO AHEAD
## create different datasets from the raw features:
## datasets with all rows but different (strided) columns
sub_features = features.strided_seqs(range(n_features), stride=25, subsize=25)
sub_features += features.strided_seqs(range(n_features), stride=10, subsize=50)
sub_features += features.bootstrap_seqs(range(n_features), n_iter=500, subsize=30)

data_name_index = [('subdata_%d_%d' % (sub_feature[0], sub_feature[-1]), sub_feature)
                   for sub_feature in sub_features]
for data_name, feat_index in data_name_index:
    train_X = features.patch(X, train_index, feat_index)
    train_y = features.patch(y, train_index)
    test_X = features.patch(X, test_index, feat_index)
    test_y = features.patch(y, test_index)
    train_name = 'train_' + data_name
    test_name = 'test_' + data_name
    data_names.append((train_name, test_name))
    data_infor.append([train_name, (train_X, train_y), {}])
    data_infor.append([test_name, (test_X, test_y), {}])

## create MORE features from the tri-kmeans result
reload(features)
feat_patches = features.bootstrap_seqs(range(n_features), n_iter=500, subsize=30)
tri_kmeans = features.TriKmeansFeatures(n_clusters=10, feat_patches=feat_patches, client=client)
tri_X = tri_kmeans.fit_transform(X)
sub_features = features.strided_seqs(range(tri_X.shape[1]), stride=15, subsize=30)
data_name_index = [('tridata_%d_%d' % (sub_feature[0], sub_feature[-1]), sub_feature)
                   for sub_feature in sub_features]
for data_name, feat_index in data_name_index:
    train_X = features.patch(tri_X, train_index, feat_index)
    train_y = features.patch(y, train_index)
    test_X = features.patch(tri_X, test_index, feat_index)
    test_y = features.patch(y, test_index)
    train_name = 'train_' + data_name
    test_name = 'test_' + data_name
    data_names.append((train_name, test_name))
    data_infor.append([train_name, (train_X, train_y), {}])
    data_infor.append([test_name, (test_X, test_y), {}])

ensemble.batch_write_data(ensemble_path, data_infor)
print len(data_names)
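## ASIDE (added sketch, not from the original notebook): strided_seqs, bootstrap_seqs and
## patch come from the local `features` module, which is not shown here. The hypothetical
## helpers below sketch the behaviour they are assumed to have above -- overlapping index
## windows, bootstrap-sampled index subsets, and row/column selection from an array. The
## real module may differ.
import numpy as np

def strided_index_windows(indices, stride, subsize):
    # windows of `subsize` consecutive indices, one starting every `stride` positions
    return [indices[start:start + subsize]
            for start in range(0, len(indices) - subsize + 1, stride)]

def bootstrap_index_subsets(indices, n_iter, subsize, seed=0):
    # n_iter random subsets of `subsize` indices, sampled with replacement
    rng = np.random.RandomState(seed)
    return [list(rng.choice(indices, size=subsize, replace=True)) for _ in range(n_iter)]

def patch_rows_cols(data, row_index, col_index=None):
    # select the given rows (and, for 2-d data, the given columns)
    data = np.asarray(data)
    if col_index is None:
        return data[row_index]
    return data[np.ix_(row_index, col_index)]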
## configure different models
from sklearn import svm
from sklearn import linear_model
from sklearn import tree

models = {
      'sgd_0.1': linear_model.SGDClassifier(alpha=0.1)
    #, 'svc_0.001': svm.SVC(probability=True, gamma=0.001)
    #, 'svc_0.01': svm.SVC(probability=True, gamma=0.01)
    #, 'svc_0.1': svm.SVC(probability=True, gamma=0.1)
    #, 'sgd_0.0001': linear_model.SGDClassifier(alpha=0.0001)
    #, 'sgd_0.001': linear_model.SGDClassifier(alpha=0.001)
    #, 'sgd_0.01': linear_model.SGDClassifier(alpha=0.01)  # SUPER model for digits
    #, 'sgd_0.05': linear_model.SGDClassifier(alpha=0.05)
    #, 'sgd_0.15': linear_model.SGDClassifier(alpha=0.15)
    #, 'pac_1.0': linear_model.PassiveAggressiveClassifier(C=1.0)
    #, 'pac_0.1': linear_model.PassiveAggressiveClassifier(C=0.1)
    #, 'pac_0.01': linear_model.PassiveAggressiveClassifier(C=0.01)
    #, 'pac_0.001': linear_model.PassiveAggressiveClassifier(C=0.001)
    , 'tree_11': tree.DecisionTreeClassifier(max_depth=11)
    , 'tree_10': tree.DecisionTreeClassifier(max_depth=10)
    , 'tree_9': tree.DecisionTreeClassifier(max_depth=9)
    , 'tree_8': tree.DecisionTreeClassifier(max_depth=8)
    , 'tree_7': tree.DecisionTreeClassifier(max_depth=7)
    , 'tree_6': tree.DecisionTreeClassifier(max_depth=6)
    , 'tree_5': tree.DecisionTreeClassifier(max_depth=5)
    , 'tree_4': tree.DecisionTreeClassifier(max_depth=4)
    , 'tree_3': tree.DecisionTreeClassifier(max_depth=3)
    , 'tree_2': tree.DecisionTreeClassifier(max_depth=2)
    , 'tree_1': tree.DecisionTreeClassifier(max_depth=1)
}

model_metas = [{
      'is_probabilistic': False
    , 'train_data': train_name
    , 'validation_data': validation_name
    , 'test_data': None
} for (train_name, validation_name) in data_names]
print len(model_metas)

n_candidates = len(models)
from random import shuffle

## write model configurations
model_names = []
model_infor = []
for model_meta in model_metas:
    candidates = models.items()
    shuffle(candidates)
    candidates = candidates[:n_candidates]  ## randomly select
    for (template_name, model) in candidates:
        model_name = template_name + "_" + model_meta["train_data"] + "_" + model_meta['validation_data']
        model_names.append(model_name)
        model_infor.append([model_name, model, model_meta])
ensemble.batch_write_model(ensemble_path, model_infor)

## train models in parallel
model_data_pairs = zip(model_names, cycle(['train_data']))
ensemble.parallel_train_models(ensemble_path, model_data_pairs, client)

## construct the ensemble model
ensemble_path = 'tmp/blackbox_mixed_ensemble/'
from sklearn.metrics import accuracy_score
ge = ensemble.GreedyEnsemble(ensemble_path, scorefn=accuracy_score,
                             votefn=ensemble.GreedyEnsemble.vote_major_class,
                             client=client)
ge.fit(model_names, verbose=True)

## performance
print ge.score(data_type='train_data')
print ge.score(data_type='validation_data')

reload(ensemble)
data_names = ensemble.all_data_names(ensemble_path)
print len(data_names)

reload(ensemble)
model_names = ensemble.all_model_names(ensemble_path)
print len(model_names)
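## ASIDE (added sanity check, not part of the original workflow): it can be useful to
## compare the ensemble's validation score against a single model trained on the full
## raw feature matrix. The snippet below reuses the train_index / test_index split
## created above and relies only on scikit-learn; max_depth=9 is an arbitrary choice.
from sklearn import tree
from sklearn.metrics import accuracy_score

baseline = tree.DecisionTreeClassifier(max_depth=9)
baseline.fit(X[train_index], y[train_index])
baseline_pred = baseline.predict(X[test_index])
print 'single-tree baseline accuracy:', accuracy_score(y[test_index], baseline_pred)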