#!/usr/bin/env python
# coding: utf-8

# # Random Forest classifiers
#
# In this section of the tutorial, we will investigate the use of Random Forest classifiers in `sklearn`. As for all models in the `sklearn` framework, Random Forests mainly rely on the `fit(X, y)` and `predict(X)` methods. Once fitted, the relative importance of the features can be accessed _via_ the `feature_importances_` property.
#
# More information about the use of Random Forests for classification can be found in the `sklearn` documentation.
#
# To begin with, let us import the libraries we need and define a function that plots, in 2D, the decision regions of a fitted classifier (this function is not specific to Random Forests).

# In[1]:


get_ipython().run_line_magic('matplotlib', 'inline')

from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_circles
import matplotlib.pyplot as plt
import numpy as np


def plot_decision(clf, X, y):
    # Build a 2D grid and perform classification using clf on this grid
    xx, yy = np.meshgrid(np.arange(X[:, 0].min() - .5, X[:, 0].max() + .5, .01),
                         np.arange(X[:, 1].min() - .5, X[:, 1].max() + .5, .01))
    zz = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, zz, alpha=.2)
    # Plot data
    plt.scatter(X[:, 0], X[:, 1], c=y, s=40)
    # Set figure coordinate limits
    plt.xlim(X[:, 0].min() - .5, X[:, 0].max() + .5)
    plt.ylim(X[:, 1].min() - .5, X[:, 1].max() + .5)


# Then, we load some data and train a forest made of a single tree (`n_estimators=1`):

# In[2]:


X, y = make_circles(n_samples=100, random_state=0, noise=.1, factor=.6)

clf = RandomForestClassifier(n_estimators=1)
clf.fit(X, y)
plot_decision(clf, X, y)


# Now, if we vary the number of trees in the model, the decision boundary changes: averaging the votes of more trees smooths it out.

# In[3]:


plt.figure(figsize=(15, 5))
for i, n_trees in enumerate([1, 10, 100]):
    plt.subplot(1, 3, i + 1)
    clf = RandomForestClassifier(n_estimators=n_trees)
    clf.fit(X, y)
    plot_decision(clf, X, y)
    plt.title("%d tree(s)" % n_trees)


# Once a model is fitted, we can have a look at the relative importance of the different features:

# In[4]:


clf.feature_importances_


# To get an idea of how reliable these importances are, we can append purely random features to the data and see what happens:

# In[5]:


X, y = make_circles(n_samples=100, random_state=0, noise=.1, factor=.8)
X = np.hstack((X, np.random.randn(100, 10)))

clf = RandomForestClassifier(n_estimators=1)
clf.fit(X, y)
print(clf.feature_importances_)


# Surprisingly enough, the first 2 dimensions do not necessarily come out as the most informative ones. This is because we do not have a sufficient amount of data to reliably assess feature importance. If we add some:

# In[6]:


for n in [100, 1000, 10000]:
    X, y = make_circles(n_samples=n, random_state=0, noise=.1, factor=.8)
    X = np.hstack((X, np.random.randn(n, 10)))
    clf = RandomForestClassifier(n_estimators=1)
    clf.fit(X, y)
    print(n, clf.feature_importances_)
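

# A single tree gives a high-variance estimate of feature importance: with few samples, which feature gets selected at a given split can be almost arbitrary. Since `feature_importances_` for a forest is the average of the per-tree importances, growing more trees is another way to stabilize the estimate. As a quick extra experiment, the sketch below re-runs the small-sample setting with growing forests; the exact values depend on the random seeds, but the first two columns should stand out more clearly as the forest grows:

# In[7]:


rng = np.random.RandomState(0)
X, y = make_circles(n_samples=100, random_state=0, noise=.1, factor=.8)
X = np.hstack((X, rng.randn(100, 10)))

for n_trees in [1, 10, 100]:
    clf = RandomForestClassifier(n_estimators=n_trees, random_state=0)
    clf.fit(X, y)
    # Forest importances are averaged over the trees, so their variance
    # decreases as n_trees grows
    print(n_trees, clf.feature_importances_)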
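

# Keep in mind that these impurity-based importances are computed from the training data: they measure how much the trees actually used each feature, not how useful the feature would be on held-out data. When in doubt, increasing the number of samples or the number of trees, as above, makes the ranking more trustworthy.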