from itertools import islice
import logging
import json
from bubbly.model import Model
from bubbly.dr1 import LocationGenerator, highest_quality_on_params
from bubbly.extractors import RingWaveletCompressionExtractor, enhance_contrast
from bubbly.util import summary
from bubbly.util import rfp_curve
import brewer2mpl
# Route bubbly's log messages to the console so training progress is visible.
logging.getLogger('bubbly').addHandler(logging.StreamHandler())
# IPython magic: pull matplotlib (plt) and numpy into the interactive namespace.
%pylab
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
/Users/beaumont/Library/Python/2.7/lib/python/site-packages/scikits/__init__.py:1: UserWarning: Module argparse was already imported from /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/argparse.pyc, but /Users/beaumont/Library/Python/2.7/lib/python/site-packages is being added to sys.path __import__('pkg_resources').declare_namespace(__name__)
# Feature extractor: ring-wavelet compression features (bubbly.extractors).
ex = RingWaveletCompressionExtractor()
# Training-example generator; draw positives only from the highest-quality
# catalog entries (bubbly.dr1.highest_quality_on_params).
l = LocationGenerator()
l.positive_generator = highest_quality_on_params
# Single-layer cascade; weak_learner_params (max_depth=1, n_estimators=200,
# subsample=.4) look like sklearn gradient-boosting settings — TODO confirm
# against bubbly.model.Model.
model = Model(ex, l,
weak_learner_params=dict(verbose=1, max_depth=1, n_estimators=200, subsample=.4),
cascade_params=dict(verbose=1, max_layers=1))
model.fit()
Locally scanning for 462 false positives Runtime for false_positives: 0.72 s Runtime for _make_xy: 38.11 s Fitting
........................................................................................................................................................................................................
Runtime for fit: 76.20 s
Cascade round 1. False pos rate: 0.000000e+00. Recall: 9.935065e-01 WARNING: Could not reduce false positive enough after 1 layers. False positive rate: 0.000000e+00. Recall: 9.935065e-01 [bubbly.cascade]
# Rebuild the layer-0 feature matrix/labels and summarize the fitted
# estimator on its own training data (FP rate, recall, AUC, accuracy —
# see the output line below).
x, y = model._make_xy(model.training_data[0]['pos'], model.training_data[0]['neg'])
summary(model.estimator, x, y)
Runtime for _make_xy: 19.61 s
False Positive: 0.000 Recall: 0.987 AUC: 0.994 Accuracy: 0.997
# Build an independent cross-validation set from LocationGenerator(1):
# all high-quality positives plus the first 10,000 negatives, then score
# the model on it. CV recall (0.764, below) is far under the training
# recall (0.987), suggesting overfitting.
cv_locator = LocationGenerator(1)
cv_locator.positive_generator = highest_quality_on_params
on2 = cv_locator.positives()
off2 = list(islice(cv_locator.negatives_iterator(), 10000))
x2, y2 = model._make_xy(on2, off2)
summary(model.estimator, x2, y2)
WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util] WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util] WARNING
Runtime for _make_xy: 200.90 s
: Non-finite values in feature vectors. Fixing [bubbly.model] False Positive: 0.002 Recall: 0.764 AUC: 0.881 Accuracy: 0.995
# Recall vs false-positive-rate curves: one purple curve per stage of the
# cascade on the CV set, then the training-set curve in red for comparison.
palette = list(reversed(brewer2mpl.get_map('Purples', 'sequential', 7).mpl_colors))
for stage, cv_scores in enumerate(model.estimator.staged_decision_function(x2)):
    rfp_curve(cv_scores, y2, label='CV %i' % stage, color=palette[stage])
train_scores = model.estimator.decision_function(x)
rfp_curve(train_scores, y, color='red', label='Training Data')
plt.ylim(0, 0.01)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x112dd3090>
# Mine hard negatives: scan in the cloud (50 workers, ~15 min per the
# runtime output below) for 1000 locations the current model wrongly
# accepts; keep the original layer-0 positives for retraining.
off3 = model.cloud_false_positives(1000, workers=50)
on3 = model.training_data[0]['pos']
Cloud scanning for 1000 false positives To re-fetch results, use cloud_false_positives(jobs=range(268580, 268630) Runtime for cloud_false_positives: 879.74 s
# Grow the cascade: train a second layer on the mined false positives.
model.add_layer(on3, off3)
Runtime for _make_xy: 64.94 s
........................................................................................................................................................................................................
# Re-plot the recall / false-positive curves now that the cascade has a
# second layer: training data in red, then one cyan/magenta curve per
# layer evaluated on the CV set.
train_scores = model.estimator.decision_function(x)
rfp_curve(train_scores, y, color='red', label='Training Data')
layer_colors = ['c', 'm']
for layer, cv_scores in enumerate(model.estimator.staged_decision_function(x2)):
    rfp_curve(cv_scores, y2, label='CV %i' % layer, color=layer_colors[layer])
plt.ylim(0, 0.01)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x113682a90>
# Roll back the layer just added by popping its bias and its estimator off
# the cascade's parallel lists (order matters: both must be removed to keep
# the lists in sync), then retrain the layer with a *balanced* negative set:
# 462 mined false positives, matching the 462 negatives used for layer 0
# (see the "Locally scanning for 462 false positives" output above).
model.estimator.bias_.pop()
model.estimator.estimators_.pop()
model.add_layer(on3, off3[:462])
Runtime for _make_xy: 33.79 s
........................................................................................................................................................................................................
No, but it seems that an imbalanced training set drastically reduces performance.
import random
# Fresh single-layer model with the same extractor, locator, and boosting
# parameters as before.
model2 = Model(ex, l,
weak_learner_params=dict(verbose=1, max_depth=1, n_estimators=200, subsample=.4),
cascade_params=dict(verbose=1, max_layers=1))
# Pool the cloud-mined false positives with the original negatives, then
# fit on a class-balanced random draw (as many negatives as positives).
off_all = off3 + model.training_data[0]['neg']
model2.fit(on3, random.sample(off_all, len(on3)))
Runtime for _make_xy: 42.55 s Fitting
........................................................................................................................................................................................................
Runtime for fit: 61.20 s
Cascade round 1. False pos rate: 0.000000e+00. Recall: 9.935065e-01
# Add a second layer, again using a balanced random draw of negatives.
model2.add_layer(on3, random.sample(off_all, len(on3)))
Runtime for _make_xy: 37.59 s
........................................................................................................................................................................................................
# Recall / false-positive curves for the balanced two-layer model (model2):
# training data in red, one curve per cascade layer on the CV set. Note the
# tighter y-axis (0.002 vs 0.01 in the earlier plots).
train_scores = model2.estimator.decision_function(x)
rfp_curve(train_scores, y, color='red', label='Training Data')
layer_colors = ['c', 'm', 'b']
for layer, cv_scores in enumerate(model2.estimator.staged_decision_function(x2)):
    rfp_curve(cv_scores, y2, label='CV %i' % layer, color=layer_colors[layer])
plt.ylim(0, 0.002)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x11b32e150>
# Persist the benchmark training and cross-validation example locations so
# later experiments can reload them instead of regenerating.
# NOTE(review): assumes the location entries are JSON-serializable (plain
# lists/numbers, not numpy types) — confirm against LocationGenerator output.
data = {'pos': on3, 'neg': off_all, 'cv_pos': on2, 'cv_neg': off2}
with open('../models/benchmark_training_data.json', 'w') as outfile:
    json.dump(data, outfile)