from itertools import islice
import logging
import json
from bubbly.model import Model
from bubbly.dr1 import LocationGenerator, highest_quality_on_params
from bubbly.extractors import RingWaveletCompressionExtractor, enhance_contrast
from bubbly.util import summary
from bubbly.util import rfp_curve
import brewer2mpl
# Route bubbly's log messages to the console so training progress is visible.
logging.getLogger('bubbly').addHandler(logging.StreamHandler())
# IPython magic: pull matplotlib (plt) and numpy into the interactive namespace.
%pylab
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.kernel.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
/Users/beaumont/Library/Python/2.7/lib/python/site-packages/scikits/__init__.py:1: UserWarning: Module argparse was already imported from /opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/argparse.pyc, but /Users/beaumont/Library/Python/2.7/lib/python/site-packages is being added to sys.path __import__('pkg_resources').declare_namespace(__name__)
# Feature extractor: ring-wavelet compression features (bubbly.extractors).
ex = RingWaveletCompressionExtractor()
# Training-example generator; draw positives only from the highest-quality
# catalog entries (bubbly.dr1.highest_quality_on_params).
l = LocationGenerator()
l.positive_generator = highest_quality_on_params
# Single-layer cascade; weak_learner_params (max_depth=1, n_estimators=200,
# subsample=.4) look like sklearn gradient-boosting settings — TODO confirm
# against bubbly.model.Model.
model = Model(ex, l,
weak_learner_params=dict(verbose=1, max_depth=1, n_estimators=200, subsample=.4),
cascade_params=dict(verbose=1, max_layers=1))
model.fit()
Locally scanning for 462 false positives Runtime for false_positives: 0.72 s Runtime for _make_xy: 38.11 s Fitting
........................................................................................................................................................................................................
Runtime for fit: 76.20 s
Cascade round 1. False pos rate: 0.000000e+00. Recall: 9.935065e-01 WARNING: Could not reduce false positive enough after 1 layers. False positive rate: 0.000000e+00. Recall: 9.935065e-01 [bubbly.cascade]
# Rebuild the layer-0 feature matrix/labels and summarize the fitted
# estimator on its own training data (FP rate, recall, AUC, accuracy —
# see the output line below).
x, y = model._make_xy(model.training_data[0]['pos'], model.training_data[0]['neg'])
summary(model.estimator, x, y)
Runtime for _make_xy: 19.61 s
False Positive: 0.000 Recall: 0.987 AUC: 0.994 Accuracy: 0.997
# Build an independent cross-validation set from LocationGenerator(1):
# all high-quality positives plus the first 10,000 negatives, then score
# the model on it. CV recall (0.764, below) is far under the training
# recall (0.987), suggesting overfitting.
cv_locator = LocationGenerator(1)
cv_locator.positive_generator = highest_quality_on_params
on2 = cv_locator.positives()
off2 = list(islice(cv_locator.negatives_iterator(), 10000))
x2, y2 = model._make_xy(on2, off2)
summary(model.estimator, x2, y2)
WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util] WARNING: RuntimeWarning: invalid value encountered in divide [bubbly.util] WARNING
Runtime for _make_xy: 200.90 s
: Non-finite values in feature vectors. Fixing [bubbly.model] False Positive: 0.002 Recall: 0.764 AUC: 0.881 Accuracy: 0.995
# Recall vs false-positive-rate curves: one purple curve per stage of the
# cascade on the CV set, then the training-set curve in red for comparison.
palette = list(reversed(brewer2mpl.get_map('Purples', 'sequential', 7).mpl_colors))
for stage, cv_scores in enumerate(model.estimator.staged_decision_function(x2)):
    rfp_curve(cv_scores, y2, label='CV %i' % stage, color=palette[stage])
train_scores = model.estimator.decision_function(x)
rfp_curve(train_scores, y, color='red', label='Training Data')
plt.ylim(0, 0.01)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x112dd3090>
# Mine hard negatives: scan in the cloud (50 workers, ~15 min per the
# runtime output below) for 1000 locations the current model wrongly
# accepts; keep the original layer-0 positives for retraining.
off3 = model.cloud_false_positives(1000, workers=50)
on3 = model.training_data[0]['pos']
Cloud scanning for 1000 false positives To re-fetch results, use cloud_false_positives(jobs=range(268580, 268630) Runtime for cloud_false_positives: 879.74 s
# Grow the cascade: train a second layer on the mined false positives.
model.add_layer(on3, off3)
Runtime for _make_xy: 64.94 s
........................................................................................................................................................................................................
# Re-plot the recall / false-positive curves now that the cascade has a
# second layer: training data in red, then one cyan/magenta curve per
# layer evaluated on the CV set.
train_scores = model.estimator.decision_function(x)
rfp_curve(train_scores, y, color='red', label='Training Data')
layer_colors = ['c', 'm']
for layer, cv_scores in enumerate(model.estimator.staged_decision_function(x2)):
    rfp_curve(cv_scores, y2, label='CV %i' % layer, color=layer_colors[layer])
plt.ylim(0, 0.01)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x113682a90>
# Roll back the layer just added by popping its bias and its estimator off
# the cascade's parallel lists (order matters: both must be removed to keep
# the lists in sync), then retrain the layer with a *balanced* negative set:
# 462 mined false positives, matching the 462 negatives used for layer 0
# (see the "Locally scanning for 462 false positives" output above).
model.estimator.bias_.pop()
model.estimator.estimators_.pop()
model.add_layer(on3, off3[:462])
Runtime for _make_xy: 33.79 s
........................................................................................................................................................................................................
No, but it seems that an imbalanced training set drastically reduces performance.
import random
# Fresh single-layer model with the same extractor, locator, and boosting
# parameters as before.
model2 = Model(ex, l,
weak_learner_params=dict(verbose=1, max_depth=1, n_estimators=200, subsample=.4),
cascade_params=dict(verbose=1, max_layers=1))
# Pool the cloud-mined false positives with the original negatives, then
# fit on a class-balanced random draw (as many negatives as positives).
off_all = off3 + model.training_data[0]['neg']
model2.fit(on3, random.sample(off_all, len(on3)))
Runtime for _make_xy: 42.55 s Fitting
........................................................................................................................................................................................................
Runtime for fit: 61.20 s
Cascade round 1. False pos rate: 0.000000e+00. Recall: 9.935065e-01
# Add a second layer, again using a balanced random draw of negatives.
model2.add_layer(on3, random.sample(off_all, len(on3)))
Runtime for _make_xy: 37.59 s
........................................................................................................................................................................................................
# Recall / false-positive curves for the balanced two-layer model (model2):
# training data in red, one curve per cascade layer on the CV set. Note the
# tighter y-axis (0.002 vs 0.01 in the earlier plots).
train_scores = model2.estimator.decision_function(x)
rfp_curve(train_scores, y, color='red', label='Training Data')
layer_colors = ['c', 'm', 'b']
for layer, cv_scores in enumerate(model2.estimator.staged_decision_function(x2)):
    rfp_curve(cv_scores, y2, label='CV %i' % layer, color=layer_colors[layer])
plt.ylim(0, 0.002)
plt.legend(loc='upper left')
<matplotlib.legend.Legend at 0x11b32e150>
# Persist the benchmark training and cross-validation example locations so
# later experiments can reload them instead of regenerating.
# NOTE(review): assumes the location entries are JSON-serializable (plain
# lists/numbers, not numpy types) — confirm against LocationGenerator output.
data = {'pos': on3, 'neg': off_all, 'cv_pos': on2, 'cv_neg': off2}
with open('../models/benchmark_training_data.json', 'w') as outfile:
    json.dump(data, outfile)