#!/usr/bin/env python
# coding: utf-8

# # About
# 
# This notebook demonstrates neural network (NN) classifiers provided by the __Reproducible experiment platform (REP)__ package.
# REP contains wrappers for the following NN libraries:
# * __theanets__
# * __neurolab__
# * __pybrain__
# 
# ### In this notebook we show:
# * training a classifier
# * getting predictions
# * measuring quality
# * pretraining and partial fitting
# * combining classifiers using meta-algorithms
# 
# Most of this is done in the same way as for other classifiers (see notebook [01-howto-Classifiers.ipynb](https://github.com/yandex/rep/blob/master/howto/01-howto-Classifiers.ipynb)).
# 
# The parameters used here are chosen to make training very fast and are far from optimal.

# # Loading data

# ### Download the particle identification data set from UCI

# In[1]:

get_ipython().system('cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt')


# In[2]:

import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

data = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=r'\s*', skiprows=[0], header=None, engine='python')
labels = pandas.read_csv('toy_datasets/MiniBooNE_PID.txt', sep=' ', nrows=1, header=None)
labels = [1] * labels[1].values[0] + [0] * labels[2].values[0]
data.columns = ['feature_{}'.format(key) for key in data.columns]


# In[3]:

len(data)


# ### First rows of data

# In[4]:

data[:5]


# ### Splitting into train and test

# In[5]:

# Get train and test data
train_data, test_data, train_labels, test_labels = train_test_split(data, labels, train_size=0.25)


# # Neural nets
# 
# All nets inherit from __sklearn.BaseEstimator__ and have the same interface as the other wrappers in REP (see details in **01-howto-Classifiers**).
# 
# The neural network libraries **support**:
# 
# * classification
# * multi-classification
# * regression
# * multi-target regression
# * additional fitting (using the `partial_fit` method)
# 
# and **don't support**:
# 
# * staged prediction methods
# * weights for data

# # Variables used in training

# In[6]:

variables = list(data.columns[:15])


# # Theanets

# In[7]:

from rep.estimators import TheanetsClassifier
print(TheanetsClassifier.__doc__)


# ### Simple training

# In[8]:

tn = TheanetsClassifier(features=variables, layers=[7],
                        trainers=[{'optimize': 'nag', 'learning_rate': 0.1, 'min_improvement': 0.1}])

tn.fit(train_data, train_labels)
pass


# ### Predicting probabilities, measuring the quality

# In[9]:

prob = tn.predict_proba(test_data)
print(prob)


# In[10]:

print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))


# ### Theanets multistage training
# 
# In some cases we need to continue training: e.g., new data have arrived, or the current trainer is no longer making progress.
# 
# For this purpose there is the `partial_fit` method, which lets you continue training with a different trainer or on different data.
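# For instance, if an additional labelled sample becomes available later (`new_train_data` and `new_train_labels` below are just placeholder names, not defined in this notebook), the same network can be trained further on it:
# ```
# tn.partial_fit(new_train_data, new_train_labels, **{'algo': 'adagrad', 'min_improvement': 0.1})
# ```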
# In[11]:

tn = TheanetsClassifier(features=variables, layers=[10, 10],
                        trainers=[{'algo': 'rprop', 'min_improvement': 0.1}])

tn.fit(train_data, train_labels)
print('training complete')


# #### Second stage of fitting

# In[12]:

tn.partial_fit(train_data, train_labels, **{'algo': 'adagrad', 'min_improvement': 0.1})
print('training complete')


# In[13]:

# predict probabilities for each class
prob = tn.predict_proba(test_data)
print(prob)


# In[14]:

print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))


# ### Predictions of classes

# In[15]:

tn.predict(test_data)


# ## Neurolab

# In[16]:

from rep.estimators import NeurolabClassifier
print(NeurolabClassifier.__doc__)


# ### Let's train a network using the Rprop algorithm

# In[17]:

import neurolab

nl = NeurolabClassifier(features=variables, layers=[10], epochs=5, trainf=neurolab.train.train_rprop)
nl.fit(train_data, train_labels)
print('training complete')


# ### After training the neural network you can still improve it by using partial fit on other data:
# ```
# nl.partial_fit(new_train_data, new_train_labels)
# ```

# ### Predict probabilities and estimate quality

# In[18]:

# predict probabilities for each class
prob = nl.predict_proba(test_data)
print(prob)


# In[19]:

print('ROC AUC', roc_auc_score(test_labels, prob[:, 1]))


# In[20]:

# predict labels
nl.predict(test_data)


# ## Pybrain

# In[21]:

from rep.estimators import PyBrainClassifier
print(PyBrainClassifier.__doc__)


# In[22]:

pb = PyBrainClassifier(features=variables, layers=[5], epochs=2, hiddenclass=['TanhLayer'])
pb.fit(train_data, train_labels)
print('training complete')


# ### Predict probabilities and estimate quality
# Again, we could continue training on a new dataset:
# ```
# pb.partial_fit(new_train_data, new_train_labels)
# ```

# In[23]:

prob = pb.predict_proba(test_data)
print('ROC AUC:', roc_auc_score(test_labels, prob[:, 1]))


# ### Predict labels

# In[24]:

pb.predict(test_data)


# ## Scaling of features
# Initial scaling of features is frequently crucial for obtaining good results with neural networks.
# 
# By default, all the networks use `StandardScaler` from `sklearn`, but you can use any other transformer, e.g. `MinMaxScaler` or a custom one, by passing the appropriate value as `scaler`. All the networks support the `scaler` parameter in the same way.

# In[25]:

from sklearn.preprocessing import MinMaxScaler

# will use StandardScaler
NeurolabClassifier(scaler='standard')
# will use MinMaxScaler
NeurolabClassifier(scaler=MinMaxScaler())
# will not use any pretransformation of features
NeurolabClassifier(scaler=False)


# # Advantages of common interface
# 
# Let's build an ensemble of neural networks using the bagging meta-algorithm.

# ## Bagging over Theanets classifier
# 
# It is a well-known fact that the classification quality of a single neural network can be significantly improved by ensembling.
# 
# In the simplest case, we average the predictions of several neural networks (see the short sketch below). Bagging trains several classifiers on random subsets of the training data, and thus achieves higher quality and more stable predictions.
# 
# You can try the same trick with any other network, not only Theanets.
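# As a minimal sketch of the "simplest case" above (assuming the `tn` and `nl` classifiers trained earlier in this notebook), averaging the predicted probabilities of two networks looks like this:
# ```
# averaged_prob = (tn.predict_proba(test_data) + nl.predict_proba(test_data)) / 2.
# print('ROC AUC', roc_auc_score(test_labels, averaged_prob[:, 1]))
# ```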
# In[26]:

# uncomment the code below to try, this may take much time
# from sklearn.ensemble import BaggingClassifier
# base_tn = TheanetsClassifier(layers=[10, 7], trainers=[{'algo': 'adadelta'}])
# bagging_tn = BaggingClassifier(base_estimator=base_tn, n_estimators=10)
# bagging_tn.fit(train_data[variables], train_labels)
# prob = bagging_tn.predict_proba(test_data[variables])
# print('AUC', roc_auc_score(test_labels, prob[:, 1]))


# # Other advantages of common interface
# There are many things you can do with neural networks now:
# * cloning
# * getting / setting parameters as dictionaries
# * using `grid_search` to tune the sizes of hidden layers and other parameters (a minimal sketch is given at the end of this notebook)
# * building pipelines (`sklearn.pipeline`)
# * using hierarchical training, training on subsets
# * passing over the internet / training classifiers on other machines / distributed learning of ensembles
# 
# 
# And you can replace classifiers at any moment.

# ## See also
# Sklearn-compatible libraries you can use within REP:
# 
# 1. Neural networks from [hep_ml.nnet](https://arogozhnikov.github.io/hep_ml/nnet.html) are sklearn-compatible.
# 2. [nolearn](https://github.com/dnouri/nolearn) wrappers are expected to be sklearn-compatible.
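# ### Example: grid search over the common interface
# Since the wrappers behave like sklearn estimators, standard tools such as grid search can be applied directly. Below is a minimal sketch (not executed here); it assumes the wrapper exposes its constructor arguments via `get_params`/`set_params`, and the `GridSearchCV` import location depends on your sklearn version (`sklearn.grid_search` in older releases, `sklearn.model_selection` in newer ones):
# ```
# from sklearn.grid_search import GridSearchCV
#
# base = TheanetsClassifier(features=variables, trainers=[{'algo': 'rprop', 'min_improvement': 0.1}])
# grid = GridSearchCV(base, param_grid={'layers': [[5], [10], [10, 10]]}, scoring='roc_auc', cv=3)
# grid.fit(train_data, train_labels)
# print('best parameters:', grid.best_params_)
# ```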