In [1]:
%pylab inline
import pylab as pl
import numpy as np

# Some nice default configuration for plots
pl.rcParams['figure.figsize'] = 10, 7.5
pl.rcParams['axes.grid'] = True
pl.gray()
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.
<matplotlib.figure.Figure at 0x10da00fd0>

IPython.parallel
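The examples below assume a local cluster of IPython engines is already running, e.g. started from the command line with ipcluster start -n 2.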

In [2]:
from IPython.parallel import Client
client = Client()
In [3]:
len(client)
Out[3]:
2
In [4]:
%px print("Hello from the cluster engines!")
[stdout:0] Hello from the cluster engines!
[stdout:1] Hello from the cluster engines!
In [5]:
def where_am_i():
    import os
    import socket
    
    return "In process with pid {0} on host: '{1}'".format(
        os.getpid(), socket.gethostname())
In [6]:
where_am_i()
Out[6]:
"In process with pid 60954 on host: 'iamapc.local'"

Direct View

In [7]:
direct_view = client.direct_view()
In [8]:
where_am_i_direct_results = direct_view.apply(where_am_i)
where_am_i_direct_results
Out[8]:
<AsyncResult: where_am_i>
In [9]:
where_am_i_direct_results.get()
Out[9]:
["In process with pid 56581 on host: 'iamapc.local'",
 "In process with pid 56582 on host: 'iamapc.local'"]
In [10]:
where_am_i_direct_results.get_dict()
Out[10]:
{0: "In process with pid 56581 on host: 'iamapc.local'",
 1: "In process with pid 56582 on host: 'iamapc.local'"}

Load Balanced View

In [11]:
lb_view = client.load_balanced_view()
In [12]:
where_am_i_lb_result = lb_view.apply(where_am_i)
where_am_i_lb_result
Out[12]:
<AsyncResult: where_am_i>
In [13]:
where_am_i_lb_result.get()
Out[13]:
"In process with pid 56581 on host: 'iamapc.local'"

Distributed Grid Search for a Linear Support Vector Machine

In [14]:
import mmap_utils, model_selection
# Reload to pick up any local edits to the demo's companion modules
_ = reload(mmap_utils), reload(model_selection)
In [15]:
from sklearn.datasets import load_digits
from sklearn.preprocessing import MinMaxScaler

digits = load_digits()

X = MinMaxScaler().fit_transform(digits.data)
y = digits.target

digits_cv_split_filenames = mmap_utils.persist_cv_splits('digits_10', X, y, 10)
digits_cv_split_filenames
Out[15]:
['/Users/ogrisel/coding/dsl-demo/digits_10_cv_000.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_001.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_002.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_003.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_004.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_005.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_006.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_007.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_008.pkl',
 '/Users/ogrisel/coding/dsl-demo/digits_10_cv_009.pkl']
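
persist_cv_splits comes from the companion mmap_utils module of this demo, not from scikit-learn. As a rough, hypothetical sketch of the idea (the real implementation may differ), each cross-validation fold is pickled to its own file so that the engines can later memory-map the arrays instead of receiving them over the network:

from sklearn.model_selection import ShuffleSplit
from joblib import dump

def persist_cv_splits_sketch(name, X, y, n_splits):
    # Dump each (train, test) fold to its own pickle file and return the paths.
    cv = ShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=0)
    filenames = []
    for i, (train, test) in enumerate(cv.split(X)):
        filename = '{0}_cv_{1:03d}.pkl'.format(name, i)
        dump((X[train], y[train], X[test], y[test]), filename)
        filenames.append(filename)
    return filenames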
In [16]:
mmap_utils.warm_mmap_on_cv_splits(client, digits_cv_split_filenames)
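warm_mmap_on_cv_splits is likewise a demo helper. It presumably has every engine load each dump once with memory mapping, so that the array pages end up in the OS page cache and are shared between the engine processes on the same host. A hypothetical sketch:

def warm_mmap_sketch(client, filenames):
    def load_once(filename):
        import joblib
        joblib.load(filename, mmap_mode='r')  # touch the data; pages stay cached
    view = client.direct_view()
    for filename in filenames:
        view.apply_sync(load_once, filename)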
In [17]:
from sklearn.svm import LinearSVC
from collections import OrderedDict
import numpy as np

linear_svc_params = OrderedDict((
    ('C', np.logspace(-2, 2, 5)),
))
linear_svc = LinearSVC()
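The five values of C (0.01 to 100) combined with the 10 CV splits give the 50 independent evaluation tasks reported below.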
In [18]:
linear_svc_search = model_selection.RandomizedGridSeach(lb_view)

linear_svc_search.launch_for_splits(linear_svc, linear_svc_params, digits_cv_split_filenames)
Out[18]:
Progress: 00% (000/050)
In [23]:
linear_svc_search
Out[23]:
Progress: 72% (036/050)

Rank 1: validation: 0.96267 (+/-0.00307) train: 0.99154 (+/-0.00047):
 {'C': 1.0}
Rank 2: validation: 0.96089 (+/-0.00302) train: 0.97669 (+/-0.00104):
 {'C': 0.10000000000000001}
Rank 3: validation: 0.94667 (+/-0.00344) train: 0.99765 (+/-0.00035):
 {'C': 10.0}
Rank 4: validation: 0.94200 (+/-0.00295) train: 0.95442 (+/-0.00092):
 {'C': 0.01}
In [44]:
linear_svc_search.boxplot_parameters(display_train=False)

Scaling Non-Linear SVM: Kernel Approximations

Motivation: a traditional kernel SVM such as SVC has almost cubic training time complexity w.r.t. n_samples

In [25]:
x = np.linspace(0, int(1e3), 100)

pl.plot(x, x ** 3 / 1e9)
pl.xlabel("Number of training samples")
pl.ylabel("Estimated Convergence Time of SMO (in seconds)")
Out[25]:
<matplotlib.text.Text at 0x1108ea1d0>
In [26]:
1e6 ** 3 / 1e9 / 60 / 60 / 24 / 365
Out[26]:
31.709791983764582
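
In other words, at a billion elementary operations per second, a solver with cubic complexity would need roughly 32 years to fit a single model on one million samples.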

Solution: Approximate the Kernel SVM with a Non-Linear Kernel Expansion on a Small Basis + LinearSVC

In [27]:
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import Pipeline

nystroem_pipeline = Pipeline([
    ('nystroem', Nystroem()),
    ('clf', LinearSVC()),
])
In [28]:
nystroem_pipeline_params = OrderedDict((
    ('nystroem__n_components', [50, 100, 200]),
    ('nystroem__gamma', np.logspace(-2, 2, 5)),
    ('clf__C', np.logspace(-2, 2, 5)),
))
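This grid has 3 × 5 × 5 = 75 parameter combinations; crossed with the 10 CV splits it yields the 750 independent tasks reported below.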
In [29]:
nystroem_search = model_selection.RandomizedGridSeach(lb_view)
nystroem_search.launch_for_splits(nystroem_pipeline, nystroem_pipeline_params, digits_cv_split_filenames)
Out[29]:
Progress: 00% (000/750)
In [41]:
nystroem_search
Out[41]:
Progress: 32% (240/750)

Rank 1: validation: 0.98556 (+/-0.00142) train: 0.99933 (+/-0.00021):
 {'nystroem__n_components': 200, 'clf__C': 10.0, 'nystroem__gamma': 0.10000000000000001}
Rank 2: validation: 0.98156 (+/-0.00159) train: 0.99852 (+/-0.00027):
 {'nystroem__n_components': 100, 'clf__C': 10.0, 'nystroem__gamma': 0.10000000000000001}
Rank 3: validation: 0.97178 (+/-0.00175) train: 0.98827 (+/-0.00044):
 {'nystroem__n_components': 100, 'clf__C': 1.0, 'nystroem__gamma': 0.10000000000000001}
Rank 4: validation: 0.96578 (+/-0.00263) train: 0.99436 (+/-0.00039):
 {'nystroem__n_components': 50, 'clf__C': 100.0, 'nystroem__gamma': 0.01}
Rank 5: validation: 0.96578 (+/-0.00137) train: 0.98337 (+/-0.00169):
 {'nystroem__n_components': 200, 'clf__C': 100.0, 'nystroem__gamma': 1.0}
In [42]:
nystroem_search.boxplot_parameters()
In [ ]:
client.abort()

A Word of Caution on the Scalability of this Nystroem Method Implementation

In this example we used LinearSVC, which does not provide a partial_fit method and hence requires the Nystroem expansion of the complete dataset to be held in memory at once. Furthermore, the Pipeline object does not optimize memory usage.

To make this example truly scalable we would need to:

  • Replace LinearSVC with an incremental linear model such as Perceptron, PassiveAggressiveClassifier or SGDClassifier (a sketch follows this list)
  • Add support for memory-efficient partial_fit to sklearn.pipeline.Pipeline
  • Change our IPython.parallel-based model evaluator to call partial_fit with small minibatches in the inner model evaluation function
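
Here is a minimal sketch of the first and third points combined, assuming SGDClassifier and a hand-rolled minibatch loop (hypothetical code, not part of the demo modules):

import numpy as np
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import SGDClassifier

def incremental_nystroem_fit(X, y, n_components=100, batch_size=500):
    # Fit the kernel expansion on a small basis only.
    nystroem = Nystroem(n_components=n_components).fit(X[:1000])
    clf = SGDClassifier()
    classes = np.unique(y)
    for start in range(0, X.shape[0], batch_size):
        batch = slice(start, start + batch_size)
        # Expand one minibatch at a time: peak memory stays O(batch_size)
        # instead of O(n_samples * n_components).
        clf.partial_fit(nystroem.transform(X[batch]), y[batch], classes=classes)
    return nystroem, clf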