#!/usr/bin/env python
# coding: utf-8
# # Viva la Factory
#
# Factory is a way to train several different classifiers on the same dataset and compare the quality of their predictions.
#
# ### First, enable plotting
# In[1]:
get_ipython().run_line_magic('pylab', 'inline')
# # Loading data
# ### download particle identification Data Set from UCI
# In[2]:
get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')
# In[4]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s+', skiprows=[0], header=None, engine='python')
# the first line of the file holds two counts: the number of signal events and the number of background events
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.5)
# ## Variables needed for analysis
# In[5]:
train_variables = ["feature_new01: feature_0/feature_1", "feature_2", "feature_26", "feature_12", "feature_24",
"feature_25", "feature_16",]
plot_variables = train_variables + ['feature_3']
# # Factory of different models
#
# This class is an OrderedDict with an additional interface. Its main methods are:
#
# * `factory.add_classifier(name, classifier)`
#
# * `factory.fit(X, y, sample_weight=None, ipc_profile=None, features=None)`
#   trains all classifiers in the factory;
#   if `features` is not None, then all classifiers will be trained on these features;
#   you can pass the name of an IPython cluster via `ipc_profile` for parallel training
#
# * `factory.test_on_lds(lds)` tests all models on an lds (`rep.data.storage.LabeledDataStorage`)
#   and returns a report (`rep.report.classification.ClassificationReport`); a sketch follows this list
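# A minimal sketch of the `test_on_lds` route, assuming the factory has been fitted
# and the train/test split above is available (`factory.test_on(X, y)`, used later
# in this notebook, is the shortcut that skips the explicit storage):
#
# from rep.data.storage import LabeledDataStorage
# lds = LabeledDataStorage(test_data, test_labels)
# report = factory.test_on_lds(lds)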
# In[6]:
from rep.metaml import ClassifiersFactory
from rep.estimators import TMVAClassifier, SklearnClassifier, XGBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# ## Define classifiers (that will be compared)
# Note that we set a very small number of trees here, just to make this notebook run fast.
# Don't forget to tune the classifiers in real applications!
#
#
# In[7]:
factory = ClassifiersFactory()
# There are different ways to add classifiers to Factory:
factory.add_classifier('tmva', TMVAClassifier(NTrees=50, features=train_variables, Shrinkage=0.05))
factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))
factory['xgb'] = XGBoostClassifier(features=train_variables)
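# A scikit-learn model can also be wrapped explicitly in `SklearnClassifier`,
# which lets you attach REP-specific options such as `features`
# (a sketch; 'ada_with_features' is just an example name):
# factory.add_classifier('ada_with_features',
#                        SklearnClassifier(AdaBoostClassifier(n_estimators=10), features=train_variables))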
# ### Create a copy of the factory with all classifiers
# In[8]:
from copy import deepcopy
factory_copy = deepcopy(factory)
# ## Training
# __Pay attention:__
# for the first factory we specified the features to be used in training, and all classifiers will use them;
# for the second factory each classifier will use the features set in its constructor.
# In[9]:
get_ipython().run_line_magic('time', 'factory.fit(train_data, train_labels, features=train_variables)')
pass
# In[10]:
factory.predict_proba(train_data)
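# `predict_proba` returns an OrderedDict mapping each classifier's name to its
# array of per-class probabilities, so a single model is easy to inspect
# (a sketch; column 1 is assumed to correspond to the signal class):
# probabilities = factory.predict_proba(train_data)
# probabilities['xgb'][:, 1]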
# In[11]:
get_ipython().run_line_magic('time', 'factory_copy.fit(train_data, train_labels)')
pass
# # Everybody loves plots!
# ## Visualizing training results with the factory
# The `ClassificationReport` class provides a classification description that makes it possible to compare different models.
# Below you can find the available functions, which can help you analyze results on an arbitrary dataset.
#
# There are different plotting backends supported:
# * __matplotlib__ (the default; the de-facto standard plotting library; mpld3 can turn its output into interactive plots),
# * __plotly__ (proprietary package with interactive plots, information is kept on the server),
# * __ROOT__ (the library used by CERN people),
# * __bokeh__ (open-source package with interactive plots)
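# Once a report is created (see below), any plot object it returns can be rendered
# with any of these backends (a sketch; method names follow REP's plotting interface):
# canvas = report.feature_importance()
# canvas.plot()                      # matplotlib
# canvas.plot_plotly('importances')  # plotly (the first argument names the plot)
# canvas.plot_bokeh()                # bokeh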
# ### Get ClassificationReport object
# The report object has many useful methods
# In[12]:
report = factory.test_on(test_data, test_labels)
# ## Plot importances of features
# Only the features used in training are compared
# In[13]:
features_importances = report.feature_importance()
features_importances.plot()
# Plots are available not only in matplotlib but in the other backends too; for instance, with plotly:
# features_importances.plot_plotly('importances', figsize=(15, 6))
# ## Plot learning curves to see possible overfitting of trained classifier
# Learning curves are a powerful and simple tool for analyzing the behaviour of your model.
# In[14]:
from rep.report.metrics import RocAuc
# In[15]:
learning_curve = report.learning_curve(RocAuc(), metric_label='ROC AUC', steps=1)
learning_curve.plot(new_plot=True, figsize=(8, 4))
# with plotly
# learning_curve.plot_plotly(plotly_filename='learning curves', figsize=(18, 8))
# ## Plot correlation between features
# In[16]:
correlation_pairs = []
correlation_pairs.append((plot_variables[0], plot_variables[1]))
correlation_pairs.append((plot_variables[0], plot_variables[2]))
report.scatter(correlation_pairs, alpha=0.01).plot()
# ## Plot data information: features correlation matrix
# In[17]:
# plot correlations between variables for signal-like and background-like events
report.features_correlation_matrix(features=plot_variables).plot(new_plot=True, show_legend=False, figsize=(7, 5))
# In[18]:
report.features_correlation_matrix_by_class(features=plot_variables).plot(new_plot=True,
show_legend=False, figsize=(15, 5))
# compute the correlation matrix only for zero-class (background) data
# report.features_correlation_matrix_by_class(features=plot_variables[:4], labels_dict={0: 'background'}, grid_columns=1).plot()
# ## Plot distribution for each feature
# In[19]:
# use only the features common to all classifiers
report.features_pdf().plot()
# ## Plot predictions distributions
# In[20]:
report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))
# Plot the same for zero-class (background) data only
# report.prediction_pdf(labels_dict={0: 'background'}, size=5).plot()
# ## ROC curves (receiver operating characteristic)
# Plot the ROC curve on the test data (the same information as a Background Rejection vs Signal Efficiency plot)
# In[21]:
figure(figsize=(14, 5))
subplot(1, 2, 1)
report.roc().plot(xlim=(0.5, 1))
subplot(1, 2, 2)
report.roc(physics_notion=True).plot(ylim=(0.5, 1))
# ## Plot 'flatness' of classifier predictions
# (i.e., the dependence of the efficiency on the variables of the dataset)
# In[22]:
efficiencies = report.efficiencies(['feature_3'], ignored_sideband=0.01)
efficiencies_with_errors = report.efficiencies(['feature_3'], errors=True, bins=15, ignored_sideband=0.01)
# In[23]:
efficiencies.plot(figsize=(18, 15), fontsize=12, show_legend=False)
# plot efficiencies with error bars
# efficiencies_with_errors.plot(figsize=(18, 25), fontsize=12, show_legend=False)
# ## Quality on different metrics
# See how you can estimate the quality with your own binary metrics and search for the optimal threshold
# (here `significance` is s / sqrt(s + b) evaluated at each cut).
# In[24]:
from rep.report.metrics import significance
metrics = report.metrics_vs_cut(significance, metric_label='significance')
metrics.plot(new_plot=True, figsize=(10, 4))
# You can define your own metric and use it:
# def AMS(s, b):
#     b_reg = 0.01
#     radicand = 2 * ((s + b + b_reg) * numpy.log(1.0 + s / (b + b_reg)) - s)
#     return numpy.sqrt(radicand)
# metrics = report.metrics_vs_cut(AMS, metric_label='ams')
# # Exercises
# __Exercise 1__. Create weight column for test and train datasets. Use weights in `fit`ting and `test`ing of factory.
#
# __Exercise 2__. Train another classifiers, play with parameters and feature sets.
#
# __Exercise 3__. Set up you IPython cluster for distributed training of estimators.
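# A possible starting point for Exercise 1, as a sketch (uniform weights here just
# to show the mechanics; whether `test_on` also accepts `sample_weight` should be
# checked against your REP version):
# train_weights = numpy.ones(len(train_data))
# factory.fit(train_data, train_labels, sample_weight=train_weights, features=train_variables)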