#!/usr/bin/env python
# coding: utf-8
# # Viva la Factory
#
# Factory is a way to train several different classifiers on the same dataset and compare the quality of their predictions.
#
# ### First, enable plotting
# In[1]:
get_ipython().run_line_magic('pylab', 'inline')
# # Loading data
# ### download particle identification Data Set from UCI
# In[2]:
get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')
# In[4]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s+', skiprows=[0], header=None, engine='python')
# the first line of the file holds two counts: the number of signal events and the number of background events
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.5)
# ## Variables needed for analysis
# In[5]:
train_variables = ["feature_new01: feature_0/feature_1", "feature_2", "feature_26", "feature_12", "feature_24",
"feature_25", "feature_16",]
plot_variables = train_variables + ['feature_3']
# # Factory of different models
#
# This class is an OrderedDict with an additional interface. Its main methods are:
#
# * `factory.add_classifier(name, classifier)`
#
# * `factory.fit(X, y, sample_weight=None, ipc_profile=None, features=None)`
#   trains all classifiers in the factory;
#   if `features` is not None, then all classifiers will be trained on these features;
#   you can pass the name of an IPython cluster via `ipc_profile` for parallel training
#
# * `factory.test_on_lds(lds)` tests all models on an lds (`rep.data.storage.LabeledDataStorage`)
#   and returns a report (`rep.report.classification.ClassificationReport`); a sketch follows this list
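# A minimal sketch of the `test_on_lds` route, assuming the factory has been fitted
# and the train/test split above is available (`factory.test_on(X, y)`, used later
# in this notebook, is the shortcut that skips the explicit storage):
#
# from rep.data.storage import LabeledDataStorage
# lds = LabeledDataStorage(test_data, test_labels)
# report = factory.test_on_lds(lds)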
# In[6]:
from rep.metaml import ClassifiersFactory
from rep.estimators import TMVAClassifier, SklearnClassifier, XGBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
# ## Define classifiers (that will be compared)
# Note that we set a very small number of trees here, just to make this notebook run fast.
# Don't forget to tune the classifiers in real applications!
#
#
# In[7]:
factory = ClassifiersFactory()
# There are different ways to add classifiers to Factory:
factory.add_classifier('tmva', TMVAClassifier(NTrees=50, features=train_variables, Shrinkage=0.05))
factory.add_classifier('ada', AdaBoostClassifier(n_estimators=10))
factory['xgb'] = XGBoostClassifier(features=train_variables)
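# A scikit-learn model can also be wrapped explicitly in `SklearnClassifier`,
# which lets you attach REP-specific options such as `features`
# (a sketch; 'ada_with_features' is just an example name):
# factory.add_classifier('ada_with_features',
#                        SklearnClassifier(AdaBoostClassifier(n_estimators=10), features=train_variables))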
# ### Create a copy of the factory with all classifiers
# In[8]:
from copy import deepcopy
factory_copy = deepcopy(factory)
# ## Training
# __Pay attention:__
# for the first factory we specified the features to be used in training, and all classifiers will use them;
# for the second factory each classifier will use the features set in its constructor.
# In[9]:
get_ipython().run_line_magic('time', 'factory.fit(train_data, train_labels, features=train_variables)')
pass
# In[10]:
factory.predict_proba(train_data)
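# `predict_proba` returns an OrderedDict mapping each classifier's name to its
# array of per-class probabilities, so a single model is easy to inspect
# (a sketch; column 1 is assumed to correspond to the signal class):
# probabilities = factory.predict_proba(train_data)
# probabilities['xgb'][:, 1]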
# In[11]:
get_ipython().run_line_magic('time', 'factory_copy.fit(train_data, train_labels)')
pass
# # Everybody loves plots!
# ## Visualizing training results with the factory
# The `ClassificationReport` class provides a classification description that makes it possible to compare different models.
# Below you can find the available functions, which can help you analyze results on an arbitrary dataset.
#
# There are different plotting backends supported:
# * __matplotlib__ (the default; the de-facto standard plotting library; mpld3 can turn its output into interactive plots),
# * __plotly__ (proprietary package with interactive plots, information is kept on the server),
# * __ROOT__ (the library used by CERN people),
# * __bokeh__ (open-source package with interactive plots)
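# Once a report is created (see below), any plot object it returns can be rendered
# with any of these backends (a sketch; method names follow REP's plotting interface):
# canvas = report.feature_importance()
# canvas.plot()                      # matplotlib
# canvas.plot_plotly('importances')  # plotly (the first argument names the plot)
# canvas.plot_bokeh()                # bokeh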
# ### Get ClassificationReport object
# The report object has many useful methods
# In[12]:
report = factory.test_on(test_data, test_labels)
# ## Plot importances of features
# Only the features used in training are compared
# In[13]:
features_importances = report.feature_importance()
features_importances.plot()
# Plots are available not only in matplotlib but in the other backends too; for instance, with plotly:
# features_importances.plot_plotly('importances', figsize=(15, 6))
# ## Plot learning curves to see possible overfitting of trained classifier
# Learning curves are a powerful and simple tool for analyzing the behaviour of your model.
# In[14]:
from rep.report.metrics import RocAuc
# In[15]:
learning_curve = report.learning_curve(RocAuc(), metric_label='ROC AUC', steps=1)
learning_curve.plot(new_plot=True, figsize=(8, 4))
# with plotly
# learning_curve.plot_plotly(plotly_filename='learning curves', figsize=(18, 8))
# ## Plot correlation between features
# In[16]:
correlation_pairs = []
correlation_pairs.append((plot_variables[0], plot_variables[1]))
correlation_pairs.append((plot_variables[0], plot_variables[2]))
report.scatter(correlation_pairs, alpha=0.01).plot()
# ## Plot data information: features correlation matrix
# In[17]:
# plot correlations between variables for signal-like and background-like events
report.features_correlation_matrix(features=plot_variables).plot(new_plot=True, show_legend=False, figsize=(7, 5))
# In[18]:
report.features_correlation_matrix_by_class(features=plot_variables).plot(new_plot=True,
show_legend=False, figsize=(15, 5))
# compute the correlation matrix only for zero-class (background) data
# report.features_correlation_matrix_by_class(features=plot_variables[:4], labels_dict={0: 'background'}, grid_columns=1).plot()
# ## Plot distribution for each feature
# In[19]:
# use only the features common to all classifiers
report.features_pdf().plot()
# ## Plot predictions distributions
# In[20]:
report.prediction_pdf().plot(new_plot=True, figsize=(9, 4))
# Plot the same for zero-class (background) data only
# report.prediction_pdf(labels_dict={0: 'background'}, size=5).plot()
# ## ROC curves (receiver operating characteristic)
# Plot the ROC curve on the test data (the same information as a Background Rejection vs Signal Efficiency plot)
# In[21]:
figure(figsize=(14, 5))
subplot(1, 2, 1)
report.roc().plot(xlim=(0.5, 1))
subplot(1, 2, 2)
report.roc(physics_notion=True).plot(ylim=(0.5, 1))
# ## Plot 'flatness' of classifier predictions
# (i.e., the dependence of the efficiency on the variables of the dataset)
# In[22]:
efficiencies = report.efficiencies(['feature_3'], ignored_sideband=0.01)
efficiencies_with_errors = report.efficiencies(['feature_3'], errors=True, bins=15, ignored_sideband=0.01)
# In[23]:
efficiencies.plot(figsize=(18, 15), fontsize=12, show_legend=False)
# plot efficiencies with error bars
# efficiencies_with_errors.plot(figsize=(18, 25), fontsize=12, show_legend=False)
# ## Quality on different metrics
# See how you can estimate the quality with your own binary metrics and search for the optimal threshold
# (here `significance` is s / sqrt(s + b) evaluated at each cut).
# In[24]:
from rep.report.metrics import significance
metrics = report.metrics_vs_cut(significance, metric_label='significance')
metrics.plot(new_plot=True, figsize=(10, 4))
# You can define your own metric and use it:
# def AMS(s, b):
#     b_reg = 0.01
#     radicand = 2 * ((s + b + b_reg) * numpy.log(1.0 + s / (b + b_reg)) - s)
#     return numpy.sqrt(radicand)
# metrics = report.metrics_vs_cut(AMS, metric_label='ams')
# # Exercises
# __Exercise 1__. Create weight column for test and train datasets. Use weights in `fit`ting and `test`ing of factory.
#
# __Exercise 2__. Train another classifiers, play with parameters and feature sets.
#
# __Exercise 3__. Set up you IPython cluster for distributed training of estimators.
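# A possible starting point for Exercise 1, as a sketch (uniform weights here just
# to show the mechanics; whether `test_on` also accepts `sample_weight` should be
# checked against your REP version):
# train_weights = numpy.ones(len(train_data))
# factory.fit(train_data, train_labels, sample_weight=train_weights, features=train_variables)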