%matplotlib inline from array import array import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.tree import DecisionTreeClassifier from sklearn.ensemble import AdaBoostClassifier from sklearn.metrics import classification_report, roc_auc_score # Generate data like in Hastie et al. 2009, Example 10.2. # where DecisionTrees, DecisionStumps and AdaBoost are # compared to each other X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1) # Train on the first 2000, test on the rest X_train, y_train = X[:2000], y[:2000] X_test, y_test = X[2000:], y[2000:] # some shortcuts to select "signal" or "background" # entries in the feautre vector signal = y_train > +0.1 background = y_train < -0.1 plt.scatter(X_train[signal,0], X_train[signal,1], c='red') plt.scatter(X_train[background,0], X_train[background,1], c='blue') plt.xlabel("Feature 0") plt.ylabel("Feature 1") plt.scatter(X_train[signal,4], X_train[signal,5], c='red') plt.scatter(X_train[background,4], X_train[background,5], c='blue') plt.xlabel("Feature 4") plt.ylabel("Feature 5") import ROOT as R from ROOT import TMVA # ROOT and TMVA require an open file to store things # as they go along outfile = R.TFile('/tmp/tmva_output.root', 'recreate') factory = TMVA.Factory("TMVAClassification", outfile, "AnalysisType=Classification") for n in range(10): factory.AddVariable("f%i"%n, "F") for y,row in zip(y_train, X_train): a = R.std.vector(R.Double)() for r in row: a.push_back(r) if y > 0: factory.AddSignalTrainingEvent(a) else: factory.AddBackgroundTrainingEvent(a) for y,row in zip(y_test, X_test): a = R.std.vector(R.Double)() # instantiate a std::vector for r in row: a.push_back(r) if y > 0: factory.AddSignalTestEvent(a) else: factory.AddBackgroundTestEvent(a) factory.PrepareTrainingAndTestTree(R.TCut("1"), "SplitMode=Random:NormMode=NumEvents") factory.BookMethod(TMVA.Types.kBDT, "BDT", #"NTrees=800:nEventsMin=100:MaxDepth=3:" #"BoostType=AdaBoost:AdaBoostBeta=0.5:SeparationType=GiniIndex:" "nCuts=-1" ) factory.TrainAllMethods() reader = TMVA.Reader() for n in range(10): a = array("f", [0.]) reader.AddVariable("f%i"%n, a) reader.BookMVA("BDT", "weights/TMVAClassification_BDT.weights.xml") y_predicted = [] decision_val = [] for row in X_test: a = R.std.vector(R.Double)() for r in row: a.push_back(r) value = reader.EvaluateMVA(a, "BDT") decision_val.append(value) if value > 0: y_predicted.append(+1) else: y_predicted.append(-1) print classification_report(y_test, y_predicted, target_names=["background", "signal"]) print "Area under ROC curve: %.4f"%(roc_auc_score(y_test, y_predicted)) D = np.array(decision_val) plt.hist(D[y_test>0.], color='r', alpha=0.5, range=(-0.4,0.4), bins=30) plt.hist(D[y_test<0.], color='b', alpha=0.5, range=(-0.4,0.4), bins=30) plt.xlabel("TMVA BDT output") dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=0.05*len(X_train)) bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=800, learning_rate=0.5) bdt.fit(X_train, y_train) sk_y_predicted = bdt.predict(X_test) print classification_report(y_test, sk_y_predicted, target_names=["background", "signal"]) print "Area under ROC curve: %.4f"%(roc_auc_score(y_test, y_predicted)) plt.hist(bdt.decision_function(X_test[y_test>0.]).ravel(), color='r', alpha=0.5, range=(-0.4,0.4), bins=30) plt.hist(bdt.decision_function(X_test[y_test<0.]).ravel(), color='b', alpha=0.5, range=(-0.4,0.4), bins=30) plt.xlabel("scikit-learn BDT output") plt.hist(bdt.decision_function(X_train[y_train>0.]).ravel(), color='r', alpha=0.5, range=(-0.4,0.4), bins=20) plt.hist(bdt.decision_function(X_train[y_train<0.]).ravel(), color='b', alpha=0.5, range=(-0.4,0.4), bins=20)