# Notebook setup: enable inline matplotlib rendering via the pylab magic.
%pylab inline
import pylab as pl
import numpy as np
# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
# Grayscale default colormap (suits the digits images used below).
pl.gray()
# -- captured cell output --
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
<matplotlib.figure.Figure at 0x10da00fd0>
# Connect to the IPython.parallel cluster (requires a running `ipcluster`).
from IPython.parallel import Client
client = Client()
# Number of engines available in the cluster.
len(client)
2
# %px executes a statement on all engines in parallel.
%px print("Hello from the cluster engines!")
[stdout:0] Hello from the cluster engines! [stdout:1] Hello from the cluster engines!
def where_am_i():
    """Return a string identifying the pid and hostname of the calling process.

    Handy for checking on which engine / host a parallel task actually ran.
    """
    import os
    import socket
    pid = os.getpid()
    hostname = socket.gethostname()
    return "In process with pid {0} on host: '{1}'".format(pid, hostname)
# Run locally first: reports the notebook client's own pid/host.
where_am_i()
"In process with pid 60954 on host: 'iamapc.local'"
# A DirectView targets all engines explicitly, in parallel.
direct_view = client.direct_view()
where_am_i_direct_results = direct_view.apply(where_am_i)
# apply() is asynchronous: an AsyncResult is returned immediately.
where_am_i_direct_results
<AsyncResult: where_am_i>
# Block until completion; one result per engine.
where_am_i_direct_results.get()
["In process with pid 56581 on host: 'iamapc.local'", "In process with pid 56582 on host: 'iamapc.local'"]
# Same results, keyed by engine id.
where_am_i_direct_results.get_dict()
{0: "In process with pid 56581 on host: 'iamapc.local'", 1: "In process with pid 56582 on host: 'iamapc.local'"}
# A LoadBalancedView dispatches each task to a single available engine.
lb_view = client.load_balanced_view()
where_am_i_lb_result = lb_view.apply(where_am_i)
where_am_i_lb_result
<AsyncResult: where_am_i>
# Only one engine executed this task.
where_am_i_lb_result.get()
"In process with pid 56581 on host: 'iamapc.local'"
# Project-local helpers: memory-mapped CV splits and parallel model selection.
import mmap_utils, model_selection
# reload() is the Python 2 builtin — re-import after editing the modules.
_ = reload(mmap_utils), reload(model_selection)
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler
digits = load_digits()
# Scale all features to [0, 1] before fitting the linear SVM.
X = MinMaxScaler().fit_transform(digits.data)
y = digits.target
# Persist 10 CV splits to disk so the engines can memory-map them
# instead of receiving the data over the network.
digits_cv_split_filenames = mmap_utils.persist_cv_splits('digits_10', X, y, 10)
digits_cv_split_filenames
['/Users/ogrisel/coding/dsl-demo/digits_10_cv_000.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_001.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_002.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_003.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_004.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_005.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_006.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_007.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_008.pkl', '/Users/ogrisel/coding/dsl-demo/digits_10_cv_009.pkl']
# Pre-load the split files into the OS page cache on every engine host.
mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)
from sklearn.svm import LinearSVC
from collections import OrderedDict
import numpy as np
# Search space: regularization strength C on a log scale.
linear_svc_params = OrderedDict((
('C', np.logspace(-2, 2, 5)),
))
linear_svc = LinearSVC()
# NOTE(review): 'RandomizedGridSeach' looks like a typo for 'Search', but it
# must match the class name defined in model_selection — confirm there
# before renaming anything.
linear_svc_search = model_selection.RandomizedGridSeach(lb_view)
# Queues one asynchronous task per (parameter combination, CV split) pair.
linear_svc_search.launch_for_splits(linear_svc, linear_svc_params, digits_cv_split_filenames)
Progress: 00% (000/050)
# The repr of the search object reports progress and top-ranked parameters.
linear_svc_search
Progress: 72% (036/050) Rank 1: validation: 0.96267 (+/-0.00307) train: 0.99154 (+/-0.00047): {'C': 1.0} Rank 2: validation: 0.96089 (+/-0.00302) train: 0.97669 (+/-0.00104): {'C': 0.10000000000000001} Rank 3: validation: 0.94667 (+/-0.00344) train: 0.99765 (+/-0.00035): {'C': 10.0} Rank 4: validation: 0.94200 (+/-0.00295) train: 0.95442 (+/-0.00092): {'C': 0.01}
# Box plot of validation scores per parameter value (training scores hidden).
linear_svc_search.boxplot_parameters(display_train=False)
n_samples
x = np.linspace(0, int(1e3), 100)
# Plot a cubic cost model for SMO convergence time vs. training set size.
pl.plot(x, x ** 3 / 1e9)
pl.xlabel("Number of training samples")
pl.ylabel("Estimated Convergence Time of SMO (in seconds)")
<matplotlib.text.Text at 0x1108ea1d0>
# The same cubic cost extrapolated to 1e6 samples, expressed in years.
1e6 ** 3 / 1e9 / 60 / 60 / 24 / 365
31.709791983764582
# Approximate the kernel with an explicit Nystroem feature map so that a
# linear model can replace the (cubic-cost) kernel SVM.
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline
nystroem_pipeline = Pipeline([
('nystroem', Nystroem()),
('clf', LinearSVC()),
])
# Joint grid: feature-map size and width, plus the SVM regularization.
nystroem_pipeline_params = OrderedDict((
('nystroem__n_components', [50, 100, 200]),
('nystroem__gamma', np.logspace(-2, 2, 5)),
('clf__C', np.logspace(-2, 2, 5)),
))
# NOTE(review): spelling must match the class defined in model_selection.
nystroem_search = model_selection.RandomizedGridSeach(lb_view)
nystroem_search.launch_for_splits(nystroem_pipeline, nystroem_pipeline_params, digits_cv_split_filenames)
Progress: 00% (000/750)
nystroem_search
Progress: 32% (240/750) Rank 1: validation: 0.98556 (+/-0.00142) train: 0.99933 (+/-0.00021): {'nystroem__n_components': 200, 'clf__C': 10.0, 'nystroem__gamma': 0.10000000000000001} Rank 2: validation: 0.98156 (+/-0.00159) train: 0.99852 (+/-0.00027): {'nystroem__n_components': 100, 'clf__C': 10.0, 'nystroem__gamma': 0.10000000000000001} Rank 3: validation: 0.97178 (+/-0.00175) train: 0.98827 (+/-0.00044): {'nystroem__n_components': 100, 'clf__C': 1.0, 'nystroem__gamma': 0.10000000000000001} Rank 4: validation: 0.96578 (+/-0.00263) train: 0.99436 (+/-0.00039): {'nystroem__n_components': 50, 'clf__C': 100.0, 'nystroem__gamma': 0.01} Rank 5: validation: 0.96578 (+/-0.00137) train: 0.98337 (+/-0.00169): {'nystroem__n_components': 200, 'clf__C': 100.0, 'nystroem__gamma': 1.0}
# Box plots, this time including training scores.
nystroem_search.boxplot_parameters()
# Cancel the tasks still queued on the cluster.
client.abort()
In this example we used LinearSVC, which does not provide a partial_fit
method and hence requires putting the Nystroem expansion of the complete
dataset in memory. Furthermore the Pipeline object does not optimize the
memory usage.
To make this example really scalable we would need to:
- add a partial_fit method to sklearn.pipeline.Pipeline, and
- use the partial_fit method with small minibatches in the inner model
  evaluation function.