This notebook is an empirical evaluation of "Understanding variable importances in forests of randomized trees" by Gilles Louppe, Louis Wehenkel, Antonio Sutera and Pierre Geurts.
%matplotlib inline
from copy import copy
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_digits
from sklearn.ensemble import ExtraTreesClassifier
# Load the 8x8 digits dataset and fix a seed for reproducible shuffling/noise.
digits = load_digits()
rng = np.random.RandomState(42)
# NOTE(review): "continous" is a typo for "continuous"; kept as-is because the
# name is referenced by the binarization cell below.
X_continous, y = digits.data, digits.target
Binarize the features to make the problem closer to the setup from the paper, which only deals with discrete variables.
# Binarize each feature by thresholding at its column-wise median.
X_orig = (X_continous > np.median(X_continous, axis=0)).astype(np.float32)
X_orig
# Notebook cell output (repr of the binarized array):
array([[ 0., 0., 1., ..., 0., 0., 0.], [ 0., 0., 0., ..., 1., 0., 0.], [ 0., 0., 0., ..., 1., 1., 0.], ..., [ 0., 0., 0., ..., 0., 0., 0.], [ 0., 0., 0., ..., 1., 0., 0.], [ 0., 0., 1., ..., 1., 1., 0.]], dtype=float32)
n_samples, n_features = X_orig.shape
n_samples, n_features
# Notebook cell output:
(1797, 64)
# Append 10 (resp. 100) pure Gaussian-noise features to the binarized data.
n_features_noise_1 = 10
n_features_noise_2 = 100
X_noise_1 = np.hstack([X_orig, rng.normal(size=(n_samples, n_features_noise_1))])
X_noise_2 = np.hstack([X_orig, rng.normal(size=(n_samples, n_features_noise_2))])
# Two 100-tree forests: default Extra-Trees vs fully randomized trees
# (max_features=1: each split considers a single randomly chosen feature).
n_estimators = 100
extra_trees = ExtraTreesClassifier(n_estimators, n_jobs=2, random_state=0)
random_trees = ExtraTreesClassifier(n_estimators, max_features=1, n_jobs=2, random_state=0)
def compute_importances(ensemble, normalize=False):
    """Return the per-feature importances of a fitted ensemble.

    The raw (optionally normalized) importances of each underlying tree
    are averaged, which matches sklearn's own aggregation but exposes the
    ``normalize`` switch of the low-level tree API.
    """
    per_tree = []
    for tree in ensemble.estimators_:
        per_tree.append(tree.tree_.compute_feature_importances(normalize=normalize))
    return sum(per_tree) / len(per_tree)
def plot_feature_importances(importances, normalize=False, color=None, alpha=0.5, label=None, chunk=None):
    """Bar-plot a vector of feature importances on the current figure.

    ``importances`` may be either an importance vector or a fitted ensemble
    (anything exposing ``estimators_``), in which case the vector is derived
    via ``compute_importances`` first. ``chunk`` optionally restricts the
    plot to a slice of the vector.
    """
    values = importances
    if hasattr(values, 'estimators_'):
        # A fitted ensemble was passed in; reduce it to a vector first.
        values = compute_importances(values, normalize=normalize)
    if chunk is not None:
        values = values[chunk]
    plt.bar(range(len(values)), values, color=color, alpha=alpha, label=label)
# Importances on the original 64 binary features, both forest variants.
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_orig, y), label='extra')
plot_feature_importances(random_trees.fit(X_orig, y), color='g', label='random')
_ = plt.legend()
# Same comparison with 10 Gaussian noise features appended.
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_1, y), label='extra')
plot_feature_importances(random_trees.fit(X_noise_1, y), color='g', label='random')
_ = plt.legend()
# Same data, this time with per-tree normalized importances.
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_1, y), label='extra', normalize=True)
plot_feature_importances(random_trees.fit(X_noise_1, y), color='g', label='random', normalize=True)
_ = plt.legend()
Normalization does not seem to be the issue: the extra noisy variables badly impact the random trees.
# Repeat with 100 Gaussian noise features: raw, then normalized importances.
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_2, y), label='extra')
plot_feature_importances(random_trees.fit(X_noise_2, y), color='g', label='random')
_ = plt.legend()
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_2, y), label='extra', normalize=True)
plot_feature_importances(random_trees.fit(X_noise_2, y), color='g', label='random', normalize=True)
_ = plt.legend()
def shuffle_columns(X, copy=True, random_state=None):
    """Shuffle each column of ``X`` independently, in place on the columns.

    This destroys any feature/target relationship while preserving each
    feature's marginal distribution.

    Parameters
    ----------
    X : 2D array.
    copy : if True (default), work on a copy and leave ``X`` untouched;
        if False, shuffle ``X`` itself and return it.
    random_state : optional ``np.random.RandomState``. Defaults to the
        module-level ``rng`` for backward compatibility; passing one makes
        the shuffle reproducible independently of global state.

    Returns
    -------
    The shuffled array (the copy, or ``X`` itself when ``copy=False``).
    """
    rs = rng if random_state is None else random_state
    if copy:
        X = X.copy()
    for i in range(X.shape[1]):
        # Column slices are views, so shuffling them mutates X in place.
        rs.shuffle(X[:, i])
    return X
# Noise features built from column-shuffled real features: same marginals,
# but no relation to the target anymore.
X_shuffled = shuffle_columns(X_orig)
X_shuffled.shape
# Notebook cell output:
(1797, 64)
X_noise_3 = np.hstack([X_orig, X_shuffled[:, :10]])
X_noise_4 = np.hstack([X_orig, X_shuffled])
# Importances with 10, then 64, shuffled-feature noise columns appended.
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_3, y), label='extra')
plot_feature_importances(random_trees.fit(X_noise_3, y), color='g', label='random')
_ = plt.legend()
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_4, y), label='extra')
plot_feature_importances(random_trees.fit(X_noise_4, y), color='g', label='random')
_ = plt.legend()
Naive way to measure the chance level of variable importances:
def compute_permutation_importances(ensemble, X, y, n_permutations=1):
    """Estimate chance-level importances by fitting on column-shuffled data.

    The ensemble is refitted ``n_permutations`` times on independently
    shuffled copies of ``X`` and the resulting importances are averaged.
    """
    def one_run():
        ensemble.fit(shuffle_columns(X), y)
        return compute_importances(ensemble)

    runs = [one_run() for _ in range(n_permutations)]
    return sum(runs) / len(runs)
def compute_adjusted_importances(ensemble, X, y, n_permutations=3):
    """Importances on (X, y) minus the permutation-estimated chance level."""
    raw = compute_importances(ensemble.fit(X, y))
    chance = compute_permutation_importances(
        ensemble, X, y, n_permutations=n_permutations)
    return raw - chance
# Fresh forests (same settings as above) for the permutation experiments.
n_estimators = 100
extra_trees = ExtraTreesClassifier(n_estimators, n_jobs=2, random_state=0)
random_trees = ExtraTreesClassifier(n_estimators, max_features=1, n_jobs=2, random_state=0)
# Raw importances vs the naive permutation chance level, on random trees.
plt.figure(figsize=(16, 3))
plot_feature_importances(random_trees.fit(X_orig, y), color='b', label='importance')
plot_feature_importances(compute_permutation_importances(random_trees, X_orig, y), color='g', label='permutation')
_ = plt.legend()
# Adjusted (raw minus chance) importances for both forest variants.
plt.figure(figsize=(16, 3))
adjusted_importances = compute_adjusted_importances(extra_trees, X_orig, y)
plot_feature_importances(adjusted_importances, label='extra', color='b')
adjusted_importances = compute_adjusted_importances(random_trees, X_orig, y)
plot_feature_importances(adjusted_importances, label='random', color='g')
_ = plt.legend()
This does not work, and that is somewhat to be expected, especially for very randomized trees. Using a larger number of trees does not seem to help in any significant way.
def compute_permutation_importances2(ensemble, X, y, n_permutations=1):
    """Fit on [X | shuffled X] and split importances into (real, chance).

    By fitting a single forest on the real features concatenated with their
    column-shuffled copies, the second half of the importance vector serves
    as a per-feature chance level estimated under the same conditions.

    Returns a ``(real_importances, chance_importances)`` pair, each of
    length ``X.shape[1]``, averaged over ``n_permutations`` fits.
    """
    d = X.shape[1]
    runs = []
    for _ in range(n_permutations):
        augmented = np.hstack([X, shuffle_columns(X)])
        runs.append(compute_importances(ensemble.fit(augmented, y)))
    mean_importances = sum(runs) / len(runs)
    return mean_importances[:d], mean_importances[d:]
def compute_adjusted_importances2(ensemble, X, y, n_permutations=1):
    """Adjusted importances: real minus chance, estimated jointly.

    BUG FIX: the original called ``compute_permutation_importances2()``
    with no arguments (a guaranteed TypeError) and returned nothing; it now
    forwards its own parameters and returns the adjusted vector.
    """
    vi, pi = compute_permutation_importances2(
        ensemble, X, y, n_permutations=n_permutations)
    return vi - pi
# Joint estimate on random trees: vi and pi come from the same fit on
# [X | shuffled X], so they are directly comparable.
vi_random, pi_random = compute_permutation_importances2(random_trees, X_orig, y)
adjusted_random = vi_random - pi_random
plt.figure(figsize=(16, 3))
plot_feature_importances(vi_random, color='b', label='importance')
plot_feature_importances(pi_random, color='g', label='permutation')
plt.title('Raw variable importances vs permutation importances on random trees')
_ = plt.legend()
That looks like a better way to find the chance level. Let's plot the differences: we now plot the chance-adjusted variable importances, which take into account the finiteness of our sample.
# Same joint estimate for extra trees; compare adjusted importances.
vi_extra, pi_extra = compute_permutation_importances2(extra_trees, X_orig, y)
adjusted_extra = vi_extra - pi_extra
plt.figure(figsize=(16, 3))
plot_feature_importances(adjusted_extra, color='b', label='extra')
plot_feature_importances(adjusted_random, color='g', label='random')
plt.legend()
_ = plt.title('Variable importances adjusted against feature permutations')
Let's evaluate this scheme on a dataset with normally distributed noisy variables:
# Adjusted importances on the data with 100 Gaussian noise features appended.
plt.figure(figsize=(16, 3))
vi_extra_noise2, pi_extra_noise2 = compute_permutation_importances2(extra_trees, X_noise_2, y)
adjusted_extra_noise2 = vi_extra_noise2 - pi_extra_noise2
vi_random_noise2, pi_random_noise2 = compute_permutation_importances2(random_trees, X_noise_2, y)
adjusted_random_noise2 = vi_random_noise2 - pi_random_noise2
plot_feature_importances(adjusted_extra_noise2, color='b', label='extra')
plot_feature_importances(adjusted_random_noise2, color='g', label='random')
_ = plt.legend()
We can see that this scheme of adjusting the variable importance is actually working as expected. Let's now zoom in on the remaining variable importances to check which of the extra or random trees perform best at assigning zero values to noisy variables:
# Zoom on the adjusted importances of the appended noise features only.
n_features = X_orig.shape[1]
adjusted_extra_noisy_part = adjusted_extra_noise2[n_features:]
adjusted_random_noisy_part = adjusted_random_noise2[n_features:]
plt.figure(figsize=(16, 3))
plot_feature_importances(adjusted_extra_noisy_part, color='b', label='extra')
plot_feature_importances(adjusted_random_noisy_part, color='g', label='random')
plt.title('Zoom on the AVI of noisy features')
_ = plt.legend()
Qualitatively there does not seem to be a clear winner. Let's quantify:
np.mean(adjusted_extra_noisy_part), np.std(adjusted_extra_noisy_part)
# Notebook cell output:
(-4.4719599020202392e-06, 0.00014378502634102241)
np.mean(adjusted_random_noisy_part), np.std(adjusted_random_noisy_part)
# Notebook cell output:
(2.0340691632104051e-05, 0.00018497496806577288)
The noise level seems to be the same and approximately centered on zero, as expected. Let's evaluate how much the addition of extra noisy features impacts the values of the adjusted variable importances for the original features.
# Compare the adjusted importances of the original features with vs without
# the 100 extra noisy features, per forest variant: raw, then sum-normalized.
adjusted_extra_orig_part = adjusted_extra_noise2[:n_features]
adjusted_random_orig_part = adjusted_random_noise2[:n_features]
plt.figure(figsize=(16, 3))
plot_feature_importances(adjusted_extra, color='b', label='AVI on original data')
plot_feature_importances(adjusted_extra_orig_part, color='g', label='with extra noisy features')
plt.title('Extra Trees')
_ = plt.legend()
plt.figure(figsize=(16, 3))
plot_feature_importances(adjusted_random, color='b', label='AVI on original data')
plot_feature_importances(adjusted_random_orig_part, color='g', label='with extra noisy features')
plt.title('Random Trees')
_ = plt.legend()
# Same comparison after normalizing each vector to sum to one.
plt.figure(figsize=(16, 3))
plot_feature_importances(adjusted_extra / adjusted_extra.sum(),
color='b', label='AVI on original data')
plot_feature_importances(adjusted_extra_orig_part / adjusted_extra_orig_part.sum(),
color='g', label='with extra noisy features')
plt.title('Extra Trees with normalization')
_ = plt.legend()
plt.figure(figsize=(16, 3))
plot_feature_importances(adjusted_random / adjusted_random.sum(),
color='b', label='AVI on original data')
plot_feature_importances(adjusted_random_orig_part / adjusted_random_orig_part.sum(),
color='g', label='with extra noisy features')
plt.title('Random Trees with normalization')
_ = plt.legend()
Random trees seem to be more impacted by the addition of noisy variables, even when adjusting for permutations to take into account the finiteness of the dataset, and even after normalization of the VIs.
# Impact of the 64 shuffled-noise columns (X_noise_4) on Extra-Trees VI,
# restricted to the original-feature part of the importance vector.
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_orig, y), color='b', label='original data')
plot_feature_importances(extra_trees.fit(X_noise_4, y), chunk=slice(0, n_features),
                         color='g', label='original data + noisy variables')
plt.title('Impact of noisy features on VI from Extra Trees')  # typo fixed: "Impoct" -> "Impact"
_ = plt.legend()
# Same comparison for the fully randomized trees.
plt.figure(figsize=(16, 3))
plot_feature_importances(random_trees.fit(X_orig, y), color='b', label='original data')
plot_feature_importances(random_trees.fit(X_noise_4, y), chunk=slice(0, n_features),
                         color='g', label='original data + noisy variables')
plt.title('Impact of noisy features on VI from Random Trees')  # typo fixed: "Impoct" -> "Impact"
_ = plt.legend()
def bernoulli_replicate(X, y, n_replications=3, noise_level=0.1, random_state=None):
    """Augment (X, y) with ``n_replications`` bit-flip-corrupted copies of X.

    Each replica flips each (assumed binary) entry of ``X`` independently
    with probability ``noise_level``; the targets are repeated unchanged.

    BUG FIX: the original had the ``np.where`` branches swapped, so entries
    were flipped with probability ``1 - noise_level`` (i.e. 90% corruption
    at the default 0.1) instead of ``noise_level``.

    Parameters
    ----------
    X : 2D array of 0/1 values.
    y : 1D target array.
    n_replications : number of corrupted copies to append.
    noise_level : per-entry flip probability.
    random_state : optional ``np.random.RandomState``; defaults to the
        module-level ``rng`` for backward compatibility.

    Returns
    -------
    (X_augmented, y_augmented) with ``(n_replications + 1) * len(X)`` rows.
    """
    rs = rng if random_state is None else random_state
    all_data = [X]
    all_target = [y] * (n_replications + 1)
    for _ in range(n_replications):
        # True with probability noise_level, independently per entry.
        random_mask = rs.rand(X.size).reshape(X.shape) > (1 - noise_level)
        # Flip entries where the mask fires; keep the rest unchanged.
        X_corrupted = np.where(random_mask, np.logical_not(X), X)
        all_data.append(X_corrupted)
    return np.vstack(all_data), np.concatenate(all_target)
# Re-fit random trees on Bernoulli-replicated (noise-augmented) data and
# compare the original-feature importances with and without noisy columns.
plt.figure(figsize=(16, 3))
plot_feature_importances(random_trees.fit(*bernoulli_replicate(X_orig, y)), color='b', label='original data')
plot_feature_importances(random_trees.fit(*bernoulli_replicate(X_noise_4, y)), chunk=slice(0, n_features),
color='g', label='original data + noisy variables')
plt.title('Impact of noisy features on VI from Random Trees')
_ = plt.legend()
This does not seem to help with fixing the impact of noisy variables on the feature importance values.