Binary classification with PyMVA

In [1]:
import ROOT
Welcome to JupyROOT 6.09/01
In [2]:
# Configure Keras/Theano through environment variables; this is done before
# the keras import below.
from os import environ

environ.update({
    # Select Theano as backend for Keras
    'KERAS_BACKEND': 'theano',
    # Set architecture of system (AVX instruction set is not supported on SWAN)
    'THEANO_FLAGS': 'gcc.cxxflags=-march=corei7',
})

from keras.models import Sequential
from keras.layers.core import Dense, Dropout
from keras.optimizers import Adam
Using Theano backend.

Load data

In [3]:
# Open file (remote download): fail early with a readable message instead of
# passing a null/zombie file on to the following steps
inputUrl = 'https://raw.githubusercontent.com/iml-wg/tmvatutorials/master/inputdata.root'
data = ROOT.TFile.Open(inputUrl)
if not data or data.IsZombie():
    raise IOError('Could not open input file: {}'.format(inputUrl))

# Get signal and background trees from file; a missing key yields a null
# object, which is falsy in PyROOT
signal = data.Get('TreeS')
background = data.Get('TreeB')
if not signal:
    raise RuntimeError('Tree "TreeS" not found in {}'.format(inputUrl))
if not background:
    raise RuntimeError('Tree "TreeB" not found in {}'.format(inputUrl))

# Add variables to dataloader: every branch of the signal tree becomes an
# input variable; numVariables is reused later as the network input dimension
dataloader = ROOT.TMVA.DataLoader('dataset_pymva')
variableNames = [branch.GetName() for branch in signal.GetListOfBranches()]
numVariables = len(variableNames)
for variableName in variableNames:
    dataloader.AddVariable(variableName)

# Add trees to dataloader (second argument is the global tree weight)
dataloader.AddSignalTree(signal, 1.0)
dataloader.AddBackgroundTree(background, 1.0)

# Fraction of each class used for training; the remainder is used for testing
trainTestSplit = 0.8
splitOptions = ':'.join([
    'TrainTestSplit_Signal={}'.format(trainTestSplit),
    'TrainTestSplit_Background={}'.format(trainTestSplit),
    'SplitMode=Random',
])
dataloader.PrepareTrainingAndTestTree(ROOT.TCut(''), splitOptions)
DataSetInfo              : [dataset_pymva] : Added class "Signal"
                         : Add Tree TreeS of type Signal with 6000 events
DataSetInfo              : [dataset_pymva] : Added class "Background"
                         : Add Tree TreeB of type Background with 6000 events
                         : Dataset[dataset_pymva] : Class index : 0  name : Signal
                         : Dataset[dataset_pymva] : Class index : 1  name : Background

Set up TMVA

In [4]:
# Set up the TMVA tools instance and initialise the PyMVA method base
ROOT.TMVA.Tools.Instance()
ROOT.TMVA.PyMethodBase.PyInitialize()

# ROOT file handed to the factory as its output file (recreated on each run)
outputFile = ROOT.TFile.Open('TMVAOutputPyMVA.root', 'RECREATE')

# Factory options, joined into the colon-separated string TMVA expects
factoryOptions = ':'.join([
    '!V',
    '!Silent',
    'Color',
    'DrawProgressBar',
    'Transformations=I,G',
    'AnalysisType=Classification',
])
factory = ROOT.TMVA.Factory('TMVAClassification', outputFile, factoryOptions)

Define model for Keras

In [5]:
# Define model: two hidden ReLU layers with dropout, followed by a
# two-node softmax output layer
model = Sequential([
    Dense(32, init='glorot_normal', activation='relu',
          input_dim=numVariables),
    Dropout(0.5),
    Dense(32, init='glorot_normal', activation='relu'),
    Dropout(0.5),
    Dense(2, init='glorot_uniform', activation='softmax'),
])

# Set loss and optimizer
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(),
    metrics=['categorical_accuracy'],
)

# Store model to file; the PyKeras method booked below loads it via
# FilenameModel=model.h5
model.save('model.h5')

# Print summary of model
model.summary()
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
====================================================================================================
dense_1 (Dense)                  (None, 32)            160         dense_input_1[0][0]              
____________________________________________________________________________________________________
dropout_1 (Dropout)              (None, 32)            0           dense_1[0][0]                    
____________________________________________________________________________________________________
dense_2 (Dense)                  (None, 32)            1056        dropout_1[0][0]                  
____________________________________________________________________________________________________
dropout_2 (Dropout)              (None, 32)            0           dense_2[0][0]                    
____________________________________________________________________________________________________
dense_3 (Dense)                  (None, 2)             66          dropout_2[0][0]                  
====================================================================================================
Total params: 1282
____________________________________________________________________________________________________
WARNING (theano.gof.cmodule): WARNING: your Theano flags `gcc.cxxflags` specify an `-march=X` flags.
         It is better to let Theano/g++ find it automatically, but we don't do it now
WARNING:theano.gof.cmodule:WARNING: your Theano flags `gcc.cxxflags` specify an `-march=X` flags.
         It is better to let Theano/g++ find it automatically, but we don't do it now

Book methods

Just run the cells that contain the classifiers you want to try.

In [6]:
# Keras interface with previously defined model (loaded from model.h5)
kerasOptions = ':'.join([
    'H',
    '!V',
    'VarTransform=G',
    'FilenameModel=model.h5',
    'NumEpochs=10',
    'BatchSize=32',
    'TriesEarlyStopping=3',
])
factory.BookMethod(dataloader, ROOT.TMVA.Types.kPyKeras, 'PyKeras', kerasOptions)
Out[6]:
<ROOT.TMVA::MethodPyKeras object ("PyKeras") at 0x77e48b0>
Factory                  : Booking method: PyKeras
                         : 
PyKeras                  : [dataset_pymva] : Create Transformation "G" with events from all classes.
                         : 
                         : Transformation, Variable selection : 
                         : Input : variable 'var1' <---> Output : variable 'var1'
                         : Input : variable 'var2' <---> Output : variable 'var2'
                         : Input : variable 'var3' <---> Output : variable 'var3'
                         : Input : variable 'var4' <---> Output : variable 'var4'
                         : Load model from file: model.h5
In [7]:
# Gradient tree boosting from scikit-learn package
gtbOptions = ':'.join([
    'H',
    '!V',
    'VarTransform=None',
    'NEstimators=100',
    'LearningRate=0.1',
    'MaxDepth=3',
])
factory.BookMethod(dataloader, ROOT.TMVA.Types.kPyGTB, 'GTB', gtbOptions)
Out[7]:
<ROOT.TMVA::MethodPyGTB object ("GTB") at 0x77c0a30>
Factory                  : Booking method: GTB
                         : 
DataSetFactory           : [dataset_pymva] : Number of events in input trees
                         : 
                         : 
                         : Dataset[dataset_pymva] : Weight renormalisation mode: "EqualNumEvents": renormalises all event classes ...
                         : Dataset[dataset_pymva] :  such that the effective (weighted) number of events in each class is the same 
                         : Dataset[dataset_pymva] :  (and equals the number of events (entries) given for class=0 )
                         : Dataset[dataset_pymva] : ... i.e. such that Sum[i=1..N_j]{w_i} = N_classA, j=classA, classB, ...
                         : Dataset[dataset_pymva] : ... (note that N_j is the sum of TRAINING events
                         : Dataset[dataset_pymva] :  ..... Testing events are not renormalised nor included in the renormalisation factor!)
                         : Number of training and testing events
                         : ---------------------------------------------------------------------------
                         : Signal     -- training events            : 4800
                         : Signal     -- testing events             : 1200
                         : Signal     -- training and testing events: 6000
                         : Background -- training events            : 4800
                         : Background -- testing events             : 1200
                         : Background -- training and testing events: 6000
                         : 
DataSetInfo              : Correlation matrix (Signal):
                         : ----------------------------------------
                         :             var1    var2    var3    var4
                         :    var1:  +1.000  +0.379  +0.585  +0.813
                         :    var2:  +0.379  +1.000  +0.691  +0.727
                         :    var3:  +0.585  +0.691  +1.000  +0.848
                         :    var4:  +0.813  +0.727  +0.848  +1.000
                         : ----------------------------------------
DataSetInfo              : Correlation matrix (Background):
                         : ----------------------------------------
                         :             var1    var2    var3    var4
                         :    var1:  +1.000  +0.852  +0.914  +0.964
                         :    var2:  +0.852  +1.000  +0.925  +0.935
                         :    var3:  +0.914  +0.925  +1.000  +0.970
                         :    var4:  +0.964  +0.935  +0.970  +1.000
                         : ----------------------------------------
DataSetFactory           : [dataset_pymva] :  
                         : 
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:4: DeprecationWarning: PyArray_FromDims: use PyArray_SimpleNew.
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:4: DeprecationWarning: PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.

Run training, testing and evaluation

In [8]:
# Train every method that was booked on the factory in the cells above
factory.TrainAllMethods()
Factory                  : Train all methods
Factory                  : [dataset_pymva] : Create Transformation "I" with events from all classes.
                         : 
                         : Transformation, Variable selection : 
                         : Input : variable 'var1' <---> Output : variable 'var1'
                         : Input : variable 'var2' <---> Output : variable 'var2'
                         : Input : variable 'var3' <---> Output : variable 'var3'
                         : Input : variable 'var4' <---> Output : variable 'var4'
Factory                  : [dataset_pymva] : Create Transformation "G" with events from all classes.
                         : 
                         : Transformation, Variable selection : 
                         : Input : variable 'var1' <---> Output : variable 'var1'
                         : Input : variable 'var2' <---> Output : variable 'var2'
                         : Input : variable 'var3' <---> Output : variable 'var3'
                         : Input : variable 'var4' <---> Output : variable 'var4'
                         : Preparing the Gaussian transformation...
TFHandler_Factory        : Variable        Mean        RMS   [        Min        Max ]
                         : -----------------------------------------------------------
                         :     var1:  0.0065519    0.99843   [    -3.1728     5.7307 ]
                         :     var2:  0.0068699     1.0010   [    -3.1728     5.7307 ]
                         :     var3:  0.0067702     1.0001   [    -3.1728     5.7307 ]
                         :     var4:  0.0066114    0.99911   [    -3.1728     5.7307 ]
                         : -----------------------------------------------------------
                         : Ranking input variables (method unspecific)...
Id_GaussTransformation   : Ranking result (top variable is best ranked)
                         : -----------------------------
                         : Rank : Variable  : Separation
                         : -----------------------------
                         :    1 : var4      : 3.445e-01
                         :    2 : var3      : 2.750e-01
                         :    3 : var1      : 2.670e-01
                         :    4 : var2      : 2.116e-01
                         : -----------------------------
Factory                  : Train method: PyKeras for Classification
                         : 
                         : 
                         : ================================================================
                         : H e l p   f o r   M V A   m e t h o d   [ PyKeras ] :
                         : 
                         : Keras is a high-level API for the Theano and Tensorflow packages.
                         : This method wraps the training and predictions steps of the Keras
                         : Python package for TMVA, so that dataloading, preprocessing and
                         : evaluation can be done within the TMVA system. To use this Keras
                         : interface, you have to generate a model with Keras first. Then,
                         : this model can be loaded and trained in TMVA.
                         : 
                         : 
                         : <Suppress this message by specifying "!H" in the booking option>
                         : ================================================================
                         : 
                         : Preparing the Gaussian transformation...
TFHandler_PyKeras        : Variable        Mean        RMS   [        Min        Max ]
                         : -----------------------------------------------------------
                         :     var1:  0.0065519    0.99843   [    -3.1728     5.7307 ]
                         :     var2:  0.0068699     1.0010   [    -3.1728     5.7307 ]
                         :     var3:  0.0067702     1.0001   [    -3.1728     5.7307 ]
                         :     var4:  0.0066114    0.99911   [    -3.1728     5.7307 ]
                         : -----------------------------------------------------------
                         : Option SaveBestOnly: Only model weights with smallest validation loss will be stored
                         : Option TriesEarlyStopping: Training will stop after 3 number of epochs with no improvement of validation loss
Train on 9600 samples, validate on 2400 samples
Epoch 1/10
9376/9600 [============================>.] - ETA: 0s - loss: 0.6087 - categorical_accuracy: 0.6496Epoch 00000: val_loss improved from inf to 0.53422, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.6084 - categorical_accuracy: 0.6504 - val_loss: 0.5342 - val_categorical_accuracy: 0.7600
Epoch 2/10
9504/9600 [============================>.] - ETA: 0s - loss: 0.5125 - categorical_accuracy: 0.7488Epoch 00001: val_loss improved from 0.53422 to 0.43809, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.5119 - categorical_accuracy: 0.7492 - val_loss: 0.4381 - val_categorical_accuracy: 0.7975
Epoch 3/10
9440/9600 [============================>.] - ETA: 0s - loss: 0.4697 - categorical_accuracy: 0.7728Epoch 00002: val_loss improved from 0.43809 to 0.40123, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.4688 - categorical_accuracy: 0.7732 - val_loss: 0.4012 - val_categorical_accuracy: 0.8125
Epoch 4/10
9504/9600 [============================>.] - ETA: 0s - loss: 0.4404 - categorical_accuracy: 0.7942Epoch 00003: val_loss improved from 0.40123 to 0.37674, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.4401 - categorical_accuracy: 0.7946 - val_loss: 0.3767 - val_categorical_accuracy: 0.8421
Epoch 5/10
9248/9600 [===========================>..] - ETA: 0s - loss: 0.4240 - categorical_accuracy: 0.8005Epoch 00004: val_loss improved from 0.37674 to 0.37410, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.4237 - categorical_accuracy: 0.8004 - val_loss: 0.3741 - val_categorical_accuracy: 0.8433
Epoch 6/10
9280/9600 [============================>.] - ETA: 0s - loss: 0.4082 - categorical_accuracy: 0.8196Epoch 00005: val_loss improved from 0.37410 to 0.35731, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.4079 - categorical_accuracy: 0.8198 - val_loss: 0.3573 - val_categorical_accuracy: 0.8433
Epoch 7/10
9248/9600 [===========================>..] - ETA: 0s - loss: 0.3998 - categorical_accuracy: 0.8212Epoch 00006: val_loss improved from 0.35731 to 0.34590, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.4001 - categorical_accuracy: 0.8206 - val_loss: 0.3459 - val_categorical_accuracy: 0.8471
Epoch 8/10
9280/9600 [============================>.] - ETA: 0s - loss: 0.3886 - categorical_accuracy: 0.8292Epoch 00007: val_loss improved from 0.34590 to 0.34174, saving model to dataset_pymva/weights/TrainedModel_PyKeras.h5
9600/9600 [==============================] - 0s - loss: 0.3882 - categorical_accuracy: 0.8297 - val_loss: 0.3417 - val_categorical_accuracy: 0.8483
Epoch 9/10
9440/9600 [============================>.] - ETA: 0s - loss: 0.3905 - categorical_accuracy: 0.8263Epoch 00008: val_loss did not improve
9600/9600 [==============================] - 0s - loss: 0.3903 - categorical_accuracy: 0.8261 - val_loss: 0.3448 - val_categorical_accuracy: 0.8517
Epoch 10/10
9248/9600 [===========================>..] - ETA: 0s - loss: 0.3858 - categorical_accuracy: 0.8269Epoch 00009: val_loss did not improve
9600/9600 [==============================] - 0s - loss: 0.3846 - categorical_accuracy: 0.8276 - val_loss: 0.3449 - val_categorical_accuracy: 0.8462
                         : Elapsed time for training with 9600 events: 13.3 sec         
                         : Creating xml weight file: dataset_pymva/weights/TMVAClassification_PyKeras.weights.xml
                         : Creating standalone class: dataset_pymva/weights/TMVAClassification_PyKeras.class.C
Factory                  : Training finished
                         : 
Factory                  : Train method: GTB for Classification
                         : 
                         : 
                         : ================================================================
                         : H e l p   f o r   M V A   m e t h o d   [ GTB ] :
                         : 
                         : --- Short description:
                         : 
                         : Decision Trees and Rule-Based Models 
                         : 
                         : --- Performance optimisation:
                         : 
                         : 
                         : --- Performance tuning via configuration options:
                         : 
                         : <None>
                         : 
                         : <Suppress this message by specifying "!H" in the booking option>
                         : ================================================================
                         : 
('deviance', 0.1, 100, 1.0, 2, 1, 0.0, 3, None, None, None, 0, None, 0)
GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=0)
                         : 
                         : --- Saving State File In:dataset_pymva/weights/PyGTBModel.PyData
                         : 
                         : Elapsed time for training with 9600 events: 1.11 sec         
                         : Dataset[dataset_pymva] : Evaluation of GTB on training sample (9600 events)
                         : Dataset[dataset_pymva] : Elapsed time for evaluation of 9600 events: 0.0345 sec       
                         : Creating xml weight file: dataset_pymva/weights/TMVAClassification_GTB.weights.xml
                         : Creating standalone class: dataset_pymva/weights/TMVAClassification_GTB.class.C
Factory                  : Training finished
                         : 
                         : Ranking input variables (method specific)...
                         : No variable ranking supplied by classifier: PyKeras
                         : No variable ranking supplied by classifier: GTB
Factory                  : === Destroy and recreate all methods via weight files for testing ===
                         : 
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: PyArray_FromDims: use PyArray_SimpleNew.
  if __name__ == '__main__':
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.
  if __name__ == '__main__':
In [9]:
# Apply the trained methods to the test sample
factory.TestAllMethods()
Factory                  : Test all methods
Factory                  : Test method: PyKeras for Classification performance
                         : 
                         : Load model from file: dataset_pymva/weights/TrainedModel_PyKeras.h5
Factory                  : Test method: GTB for Classification performance
                         : 
                         : 
                         : --- Loading State File From:dataset_pymva/weights/PyGTBModel.PyData
                         : 
                         : Dataset[dataset_pymva] : Evaluation of GTB on testing sample (2400 events)
                         : Dataset[dataset_pymva] : Elapsed time for evaluation of 2400 events: 0.00952 sec       
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: PyArray_FromDims: use PyArray_SimpleNew.
  if __name__ == '__main__':
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.
  if __name__ == '__main__':
In [10]:
# Evaluate and compare the methods (ROC integral, overtraining check)
factory.EvaluateAllMethods()
Factory                  : Evaluate all methods
Factory                  : Evaluate classifier: PyKeras
                         : 
TFHandler_PyKeras        : Variable        Mean        RMS   [        Min        Max ]
                         : -----------------------------------------------------------
                         :     var1:  -0.019674     1.0126   [    -2.8208     5.7307 ]
                         :     var2:  -0.025370    0.99752   [    -3.1672     5.7307 ]
                         :     var3:  -0.025914     1.0079   [    -3.0141     5.7307 ]
                         :     var4:  -0.023154     1.0059   [    -2.9557     5.7307 ]
                         : -----------------------------------------------------------
PyKeras                  : [dataset_pymva] : Loop over test events and fill histograms with classifier response...
                         : 
TFHandler_PyKeras        : Variable        Mean        RMS   [        Min        Max ]
                         : -----------------------------------------------------------
                         :     var1:  -0.019674     1.0126   [    -2.8208     5.7307 ]
                         :     var2:  -0.025370    0.99752   [    -3.1672     5.7307 ]
                         :     var3:  -0.025914     1.0079   [    -3.0141     5.7307 ]
                         :     var4:  -0.023154     1.0059   [    -2.9557     5.7307 ]
                         : -----------------------------------------------------------
Factory                  : Evaluate classifier: GTB
                         : 
GTB                      : [dataset_pymva] : Loop over test events and fill histograms with classifier response...
                         : 
TFHandler_GTB            : Variable        Mean        RMS   [        Min        Max ]
                         : -----------------------------------------------------------
                         :     var1:  -0.019646     1.6797   [    -4.8163     4.5708 ]
                         :     var2:  -0.028834     1.5789   [    -5.2407     4.4671 ]
                         :     var3:  -0.036699     1.7446   [    -5.2331     4.6430 ]
                         :     var4:    0.11995     2.1669   [    -6.3160     4.8976 ]
                         : -----------------------------------------------------------
                         : 
                         : Evaluation results ranked by best signal efficiency and purity (area)
                         : -------------------------------------------------------------------------------------------------------------------
                         : DataSet       MVA                       
                         : Name:         Method:          ROC-integ
                         : dataset_pymva PyKeras        : 0.928
                         : dataset_pymva GTB            : 0.918
                         : -------------------------------------------------------------------------------------------------------------------
                         : 
                         : Testing efficiency compared to training efficiency (overtraining check)
                         : -------------------------------------------------------------------------------------------------------------------
                         : DataSet              MVA              Signal efficiency: from test sample (from training sample) 
                         : Name:                Method:          @B=0.01             @B=0.10            @B=0.30   
                         : -------------------------------------------------------------------------------------------------------------------
                         : dataset_pymva        PyKeras        : 0.357 (0.335)       0.737 (0.780)      0.963 (0.957)
                         : dataset_pymva        GTB            : 0.295 (0.395)       0.733 (0.788)      0.947 (0.948)
                         : -------------------------------------------------------------------------------------------------------------------
                         : 
Dataset:dataset_pymva    : Created tree 'TestTree' with 2400 events
                         : 
Dataset:dataset_pymva    : Created tree 'TrainTree' with 9600 events
                         : 
Factory                  : Thank you for using TMVA!
                         : For citation information, please visit: http://tmva.sf.net/citeTMVA.html
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: PyArray_FromDims: use PyArray_SimpleNew.
  if __name__ == '__main__':
/cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Sat/x86_64-slc6-gcc49-opt/lib/python2.7/site-packages/ipykernel/__main__.py:1: DeprecationWarning: PyArray_FromDimsAndDataAndDescr: use PyArray_NewFromDescr.
  if __name__ == '__main__':
In [11]:
# Enable Javascript for ROOT so that we can draw the canvas
%jsroot on

# Print ROC
canvas = factory.GetROCCurve(dataloader)
canvas.Draw()