In [4]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
In [5]:
sys.path.append('./lib')
In [6]:
import data_io
In [7]:
from data_manager import DataManager 
In [8]:
input_dir = '/Users/ogrisel/data/automl/round1/'
dataset = 'christine'
D = DataManager(dataset, input_dir,
                replace_missing=True,
                filter_features=True,
                verbose=10)
Info file found : /Users/ogrisel/data/automl/round1/christine/christine_public.info
========= Reading /Users/ogrisel/data/automl/round1/christine/christine_feat.type
[+] Success in  0.00 sec
========= Reading /Users/ogrisel/data/automl/round1/christine/christine_train.data
[+] Success in  1.89 sec
========= Reading /Users/ogrisel/data/automl/round1/christine/christine_train.solution
[+] Success in  0.04 sec
========= Reading /Users/ogrisel/data/automl/round1/christine/christine_valid.data
[+] Success in  0.29 sec
========= Reading /Users/ogrisel/data/automl/round1/christine/christine_test.data
[+] Success in  0.72 sec
In [9]:
print(D)
DataManager : christine
info:
	task = binary.classification
	name = christine
	feat_type = Numerical
	format = dense
	is_sparse = 0
	metric = bac_metric
	target_type = Binary
	test_num = 2084
	label_num = 2
	target_num = 1
	valid_num = 834
	has_categorical = 0
	usage = AutoML challenge 2015
	feat_num = 1636
	time_budget = 1200
	train_num = 5418
	has_missing = 0
data:
	X_train = array(5418, 1636)
	Y_train = array(5418,)
	X_valid = array(834, 1636)
	X_test = array(2084, 1636)
feat_type:	array(1636,)
feat_idx:	array(1636,)

In [10]:
X_train = D.data['X_train']
Y_train = D.data['Y_train']
In [11]:
X_train
Out[11]:
array([[ 443.,  375.,  109., ...,  388.,  197.,  331.],
       [ 486.,  716.,  136., ...,  222.,    0.,  380.],
       [ 277.,  424.,   75., ...,  329.,    0.,  299.],
       ..., 
       [ 208.,  384.,   87., ...,    0.,    0.,  248.],
       [ 514.,  394.,  132., ...,  460.,    0.,  304.],
       [ 382.,  317.,  208., ...,  415.,    0.,  234.]])
In [12]:
pd.DataFrame(X_train).describe()
Out[12]:
0 1 2 3 4 5 6 7 8 9 ... 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635
count 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 ... 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000 5418.000000
mean 266.457918 480.771687 134.844038 246.080472 545.022702 263.512366 102.437062 578.476375 166.745847 643.209302 ... 287.611849 178.392396 31.440015 239.989849 164.960871 173.133813 407.397195 302.219823 48.287929 353.227575
std 109.652430 110.545910 48.277235 111.408996 100.527975 77.407330 99.016102 85.524281 58.868650 86.100403 ... 77.764002 149.928859 27.969916 60.959969 52.703243 62.146435 122.723735 91.582356 89.119348 70.699755
min 0.000000 59.000000 18.000000 17.000000 85.000000 52.000000 0.000000 149.000000 29.000000 14.000000 ... 63.000000 0.000000 0.000000 44.000000 53.000000 34.000000 0.000000 0.000000 0.000000 167.000000
25% 192.000000 399.000000 101.000000 177.000000 482.000000 209.000000 3.000000 524.000000 126.000000 596.000000 ... 230.250000 0.000000 17.000000 199.000000 129.000000 130.000000 349.000000 261.000000 0.000000 303.000000
50% 260.000000 469.000000 127.000000 244.000000 550.000000 253.000000 101.000000 574.000000 158.000000 652.000000 ... 287.000000 188.000000 28.000000 233.000000 156.000000 162.000000 414.000000 302.000000 0.000000 343.000000
75% 336.000000 550.000000 159.000000 315.000000 613.000000 305.000000 141.000000 631.000000 197.000000 703.000000 ... 343.000000 300.000000 42.000000 270.000000 191.000000 205.000000 481.750000 353.000000 69.000000 391.000000
max 986.000000 941.000000 695.000000 715.000000 939.000000 971.000000 874.000000 912.000000 767.000000 879.000000 ... 571.000000 697.000000 999.000000 603.000000 740.000000 736.000000 885.000000 999.000000 904.000000 839.000000

8 rows × 1636 columns

In [13]:
Y_train
Out[13]:
array([ 0.,  1.,  1., ...,  1.,  1.,  1.])
In [14]:
np.mean(Y_train)
Out[14]:
0.5
In [15]:
from sklearn.decomposition import RandomizedPCA

# Project onto the two leading principal components for a quick 2D view.
pca = RandomizedPCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
In [16]:
# Split the projected points by class to overlay them in two colors.
X_train_pca_0 = X_train_pca[Y_train == 0]
X_train_pca_1 = X_train_pca[Y_train == 1]

plt.scatter(X_train_pca_0[:, 0], X_train_pca_0[:, 1],
            alpha=0.1, color='b');
plt.scatter(X_train_pca_1[:, 0], X_train_pca_1[:, 1],
            alpha=0.1, color='g');
In [18]:
# Per-class histograms of a single feature, with bins shared across both
# classes and spanning the global value range.
bins = np.linspace(X_train.min(), X_train.max(), 30)
i = 72
_ = plt.hist(X_train[Y_train == 0, i], bins=bins, alpha=0.3)
_ = plt.hist(X_train[Y_train == 1, i], bins=bins, alpha=0.3)
In [19]:
for i in range(10):
    plt.figure()
    _ = plt.hist(X_train[Y_train == 0, i], bins=bins, alpha=0.3)
    _ = plt.hist(X_train[Y_train == 1, i], bins=bins, alpha=0.3)
In [21]:
%%time
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Baseline: a small random forest, scored with the default (accuracy).
rf = RandomForestClassifier(n_estimators=10, n_jobs=4)
scores = cross_val_score(rf, X_train, Y_train, cv=5)
print(pd.Series(scores).describe())
count    5.000000
mean     0.687155
std      0.007609
min      0.678044
25%      0.680812
50%      0.689114
75%      0.691312
max      0.696494
dtype: float64
CPU times: user 10 s, sys: 354 ms, total: 10.4 s
Wall time: 5.83 s
In [22]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import GenericUnivariateSelect, f_classif
from sklearn.preprocessing import StandardScaler
In [23]:
%%time
pipeline = make_pipeline(
    # Keep the 300 features with the highest ANOVA F-scores.
    GenericUnivariateSelect(f_classif, 'k_best', 300),
#     StandardScaler(),
#     RandomizedPCA(n_components=50),
    RandomForestClassifier(n_estimators=10, n_jobs=4),
)
scores = cross_val_score(pipeline, X_train, Y_train, cv=5)
print(pd.Series(scores).describe())
/Users/ogrisel/venvs/py27/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:111: UserWarning: Features [  72  155  213  244  256  261  477  545  574  583  646  779  832  856  934
 1004 1009 1047 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  UserWarning)
/Users/ogrisel/venvs/py27/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:111: UserWarning: Features [  72  155  213  244  256  261  477  545  574  583  646  832  856  934 1004
 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  UserWarning)
/Users/ogrisel/venvs/py27/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:111: UserWarning: Features [  72  155  213  238  244  256  261  477  545  574  583  646  687  832  856
  934 1004 1009 1186 1228 1238 1239 1255 1259 1355 1486 1585] are constant.
  UserWarning)
/Users/ogrisel/venvs/py27/lib/python2.7/site-packages/sklearn/feature_selection/univariate_selection.py:111: UserWarning: Features [  72  155  213  244  256  261  477  545  574  583  646  832  856  934 1004
 1009 1186 1228 1238 1239 1255 1259 1300 1355 1486 1585] are constant.
  UserWarning)
count    5.000000
mean     0.706906
std      0.014259
min      0.683579
25%      0.706642
50%      0.708487
75%      0.714418
max      0.721402
dtype: float64
CPU times: user 5.25 s, sys: 510 ms, total: 5.76 s
Wall time: 4.71 s
In [57]:
%%time
from sklearn.svm import SVC

pipeline = make_pipeline(
    GenericUnivariateSelect(f_classif, 'k_best', 300),
    StandardScaler(),
    SVC(kernel='rbf', C=1, gamma=1e-4),
)
scores = cross_val_score(pipeline, X_train, Y_train, cv=5)
print(pd.Series(scores).describe())
count    5.000000
mean     0.715585
std      0.014213
min      0.702952
25%      0.704797
50%      0.709410
75%      0.725092
max      0.735675
dtype: float64
In [24]:
from sklearn.kernel_approximation import Nystroem
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegressionCV

pipeline = make_pipeline(
    GenericUnivariateSelect(f_classif, 'k_best', 300),
    StandardScaler(),
    # Nystroem builds a 300-component approximation of the RBF kernel map,
    # so a linear model with built-in CV over C can act as a kernel machine.
    Nystroem(n_components=300, gamma=1e-3),
    LogisticRegressionCV(),
)
In [81]:
%%time

scores = cross_val_score(pipeline, X_train, Y_train, cv=5)
print(pd.Series(scores).describe())
count    5.000000
mean     0.750278
std      0.003715
min      0.745387
25%      0.748155
50%      0.750000
75%      0.753690
max      0.754159
dtype: float64
CPU times: user 24.2 s, sys: 1.67 s, total: 25.9 s
Wall time: 19.7 s
In [82]:
%%time
pipeline.fit(X_train, Y_train)
CPU times: user 6.58 s, sys: 345 ms, total: 6.92 s
Wall time: 5.08 s
Out[82]:
Pipeline(steps=[('genericunivariateselect', GenericUnivariateSelect(mode='k_best', param=300,
            score_func=<function f_classif at 0x13dd4e488>)), ('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('nystroem', Nystroem(coef0=1, degree=3, gamma=0.001, kernel='rbf', kernel_..._jobs=1, penalty='l2', refit=True,
           scoring=None, solver='lbfgs', tol=0.0001, verbose=0))])
In [86]:
lrcv = pipeline.named_steps['logisticregressioncv']
In [88]:
lrcv.Cs
Out[88]:
10
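
Cs here is just the constructor argument: the number of candidate regularization values. The fitted grid and the value the internal cross-validation selected live in the trailing-underscore attributes (names per scikit-learn's LogisticRegressionCV):

lrcv.Cs_  # the 10 candidate C values tried by the internal cross-validation
lrcv.C_   # the C value(s) finally selected, one entry per class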
In [25]:
%%time
from sklearn.grid_search import RandomizedSearchCV

params = {
    # 'param' is the k of the 'k_best' selection mode.
    'genericunivariateselect__param': [10, 100, 200, 300, 500, 1000],
    'nystroem__gamma': np.logspace(-6, 0, 7),
}

rscv = RandomizedSearchCV(pipeline, params, n_iter=10,
                          cv=5, verbose=10, n_jobs=1,
                          scoring='roc_auc')
_ = rscv.fit(X_train, Y_train)
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done  50 jobs       | elapsed:  2.9min
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  2.9min finished
CPU times: user 3min 37s, sys: 16.2 s, total: 3min 53s
Wall time: 3min
In [33]:
sorted(rscv.grid_scores_,
       key=lambda x: x.mean_validation_score,
       reverse=True)
Out[33]:
[mean: 0.74991, std: 0.01489, params: {'genericunivariateselect__param': 200, 'nystroem__gamma': 0.0001},
 mean: 0.74954, std: 0.00722, params: {'genericunivariateselect__param': 300, 'nystroem__gamma': 0.001},
 mean: 0.74031, std: 0.01191, params: {'genericunivariateselect__param': 300, 'nystroem__gamma': 1.0000000000000001e-05},
 mean: 0.73662, std: 0.00355, params: {'genericunivariateselect__param': 100, 'nystroem__gamma': 0.01},
 mean: 0.72868, std: 0.00627, params: {'genericunivariateselect__param': 1000, 'nystroem__gamma': 0.001},
 mean: 0.72628, std: 0.01139, params: {'genericunivariateselect__param': 200, 'nystroem__gamma': 0.01},
 mean: 0.68143, std: 0.00722, params: {'genericunivariateselect__param': 100, 'nystroem__gamma': 0.10000000000000001},
 mean: 0.67940, std: 0.00953, params: {'genericunivariateselect__param': 10, 'nystroem__gamma': 1.0},
 mean: 0.55408, std: 0.03445, params: {'genericunivariateselect__param': 300, 'nystroem__gamma': 0.10000000000000001},
 mean: 0.50351, std: 0.00244, params: {'genericunivariateselect__param': 500, 'nystroem__gamma': 0.10000000000000001}]
In [29]:
first_score = rscv.grid_scores_[0]  # first candidate in sampling order
first_score.mean_validation_score
Out[29]:
0.68143226282761171
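
Rather than sorting grid_scores_ by hand, the fitted search also exposes the winner directly (attributes per scikit-learn's RandomizedSearchCV):

rscv.best_score_   # 0.74991..., the best mean validation ROC AUC
rscv.best_params_  # {'genericunivariateselect__param': 200, 'nystroem__gamma': 0.0001}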

Next step: wrap the official AutoML metric (bac_metric, defined in libscores.py) with scikit-learn's make_scorer function, so that cross-validation and the randomized search optimize the challenge metric instead of the default accuracy or the ROC AUC used above:

http://scikit-learn.org/stable/modules/model_evaluation.html
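
A minimal sketch of that wrapping, assuming libscores.bac_metric follows the challenge convention of a metric(solution, prediction) function where higher is better:

from sklearn.metrics import make_scorer
from libscores import bac_metric  # AutoML starting kit, on sys.path via ./lib

# make_scorer turns a metric(y_true, y_pred) function into a scorer that
# cross_val_score and RandomizedSearchCV accept through `scoring=`.
bac_scorer = make_scorer(bac_metric, greater_is_better=True)

scores = cross_val_score(pipeline, X_train, Y_train, cv=5,
                         scoring=bac_scorer)

Depending on how libscores expects its inputs, the labels and predictions may need to be reshaped into the challenge's column format before scoring.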