%matplotlib inline
from copy import copy
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats.mstats import mquantiles
from sklearn.datasets import load_digits
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import StratifiedShuffleSplit
digits = load_digits()
X, y = digits.data, digits.target
Make the problem harder by reducing the number of samples:
cv = StratifiedShuffleSplit(y, train_size=50, test_size=None)
train, test = next(iter(cv))
X_orig, X_test = X[train], X[test]
y_orig, y_test = y[train], y[test]
n_samples, n_features_orig = X_orig.shape
print(n_samples)
50
print(n_features_orig)
64
Note: some features are de facto non-informative as they have zero variance (pixels near the corners):
np.sum(np.var(X_orig, axis=0) == 0)
12
Add noisy variables to make it harder to identify the relevant variables:
n_features_noise_1 = 50
n_features_noise_2 = 1000
rng = np.random.RandomState(42)
X_noise_1 = np.hstack([X_orig, rng.normal(size=(n_samples, n_features_noise_1))])
X_noise_2 = np.hstack([X_orig, rng.normal(size=(n_samples, n_features_noise_2))])
n_trees = 100
extra_trees = ExtraTreesClassifier(n_estimators=n_trees, n_jobs=-1, random_state=0)
random_forest = RandomForestClassifier(n_estimators=n_trees, n_jobs=-1, random_state=0)
Even with so few samples, the tree ensemble models are able to predict reasonably well on the noise-free data:
extra_trees.fit(X_orig, y_orig).score(X_test, y_test)
0.85174585002862047
random_forest.fit(X_orig, y_orig).score(X_test, y_test)
0.84487693188322843
Let's have a look at the feature importances of such tree models:
def compute_importances(ensemble, normalize=False):
    trees_importances = [base_model.tree_.compute_feature_importances(normalize=normalize)
                         for base_model in ensemble.estimators_]
    return sum(trees_importances) / len(trees_importances)
def plot_feature_importances(importances, normalize=False, color=None, alpha=0.5, label=None, chunk=None):
    if hasattr(importances, 'estimators_'):
        importances = compute_importances(importances, normalize=normalize)
    if chunk is not None:
        importances = importances[chunk]
    plt.bar(range(len(importances)), importances, color=color, alpha=alpha, label=label)
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_orig, y_orig), label='extra')
_ = plt.legend()
The pixels at the center of the image are more informative for predicting the label (digit) of the image. Pixels in the corners have either zero variance, and hence exactly zero variable importance (VI), or a VI close to zero.
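Since the 64 original features are the pixels of an 8x8 image, we can sanity-check this spatial pattern by reshaping the importances into the image grid (a minimal sketch reusing the extra_trees model fit just above and the compute_importances helper defined earlier):
# Reshape the 64 per-pixel importances into the 8x8 digit grid to visualize
# which regions of the image drive the splits (extra_trees was fit on X_orig above).
pixel_importances = compute_importances(extra_trees)
plt.matshow(pixel_importances.reshape(8, 8), cmap=plt.cm.hot)
plt.colorbar()
_ = plt.title("Per-pixel variable importances on the 8x8 grid")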
Let's compute the trees variable importances for the same dataset with additional noisy variables:
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_1, y_orig), label='extra')
_ = plt.legend()
The noisy variables have non-zero variable importances: random splits on a finite training sample still yield some impurity decrease, even for uninformative features.
The absolute importances of the relevant variables are also impacted (decreased) by this addition. The impact is more pronounced when the number of noisy variables is significantly larger than the number of relevant variables.
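To illustrate the first effect in isolation, here is a quick sketch (not part of the original analysis) that fits the same kind of model on purely random features: even though none of them carries any signal, the impurity-based importances are non-zero on average.
# Fit on purely random features: random splits still decrease impurity,
# so the importances are positive on average even though no feature is informative.
X_pure_noise = rng.normal(size=(n_samples, 100))
pure_noise_importances = compute_importances(extra_trees.fit(X_pure_noise, y_orig))
print(pure_noise_importances.mean())
print(np.mean(pure_noise_importances > 0))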
plt.figure(figsize=(16, 3))
plot_feature_importances(extra_trees.fit(X_noise_2, y_orig))
_ = plt.title("Variable importance on a data set with a majority of noisy variables")
The fact that noisy variables have a non-zero variable importance on average makes it difficult to use those scores directly for feature selection.
Let's compute new variable importance scores that are adjusted so that noisy variables have zero score on average.
To do so, we expand the dataset by horizontally stacking a column-wise shuffled copy of the original feature values, fit an ensemble of randomized trees on the expanded data, and compute, for each feature, the difference between its VI and the VI of its shuffled copy. This operation is repeated with different random shufflings of the data.
The adjusted variable importance (AVI) of a noisy variable should therefore be zero on average (over many iterations of this shuffling process).
def shuffle_columns(X, copy=True, seed=0):
    rng = np.random.RandomState(seed)
    if copy:
        X = X.copy()
    for i in range(X.shape[1]):
        rng.shuffle(X[:, i])
    return X
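As a quick sanity check of this shuffled-copy construction (not in the original analysis): shuffling each column independently preserves its marginal distribution, only the association between feature values and target labels is destroyed.
# The shuffled copy has exactly the same values per column, just in a different row order.
X_shadow = shuffle_columns(X_orig)
print(np.allclose(np.sort(X_shadow, axis=0), np.sort(X_orig, axis=0)))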
def model_importances(ensemble, normalize=False):
    return np.array([
        base_model.tree_.compute_feature_importances(normalize=normalize)
        for base_model in ensemble.estimators_])
def compute_adjusted_importances(ensemble, X, y, n_permutations=5, return_diff=True,
                                 aggregate_method='average', normalize=False):
    all_importances = []
    n_features = X.shape[1]
    for i in range(n_permutations):
        ensemble.fit(np.hstack([X, shuffle_columns(X, seed=i)]), y)
        all_importances.append(model_importances(ensemble, normalize=normalize))
    if aggregate_method == 'average':
        # Average importances of trees across permutations:
        # the shape is (n_trees, 2 * n_features)
        all_importances = np.mean(all_importances, axis=0)
    elif aggregate_method == 'stack':
        # Stack importances of trees for the various permutations:
        # the shape is (n_permutations * n_trees, 2 * n_features)
        all_importances = np.vstack(all_importances)
    if return_diff:
        return all_importances[:, :n_features] - all_importances[:, n_features:]
    else:
        return all_importances[:, :n_features], all_importances[:, n_features:]
def bootstrap_adjusted_importances(ensemble, X, y, normalize=False,
                                   n_permutations=10, n_bootstraps=1000,
                                   aggregate_method='average', seed=0):
    rng = np.random.RandomState(seed)
    avi_trees = compute_adjusted_importances(
        ensemble, X, y, n_permutations=n_permutations, normalize=normalize,
        aggregate_method=aggregate_method)
    n_trees = len(ensemble.estimators_)
    bootstraped_means = []
    for i in range(n_bootstraps):
        idx = rng.random_integers(low=0, high=n_trees - 1, size=n_trees)
        bootstraped_means.append(avi_trees[idx].mean(axis=0))
    bootstraped_means = np.array(bootstraped_means)
    bootstraped_means.sort(axis=0)
    return np.mean(bootstraped_means, axis=0), np.std(bootstraped_means, axis=0), bootstraped_means
avi_mean, avi_std, bootstraped_avi = bootstrap_adjusted_importances(
extra_trees, X_noise_2, y_orig, n_permutations=10, aggregate_method='average', seed=0)
plt.figure(figsize=(16, 3))
plt.bar(range(len(avi_mean)), avi_mean, color=None, alpha=0.5)
_ = plt.title('Mean Adjusted Variable Importances for all features')
avi_mean[:n_features_orig].mean(), avi_mean[:n_features_orig].std()
(0.0014289204654493045, 0.0016611488031069005)
avi_mean[n_features_orig:].mean(), avi_mean[n_features_orig:].std()
(-5.2594874852622571e-06, 0.00028116471646904255)
Let's zoom in on the AVI values of the noisy variables:
plt.figure(figsize=(16, 3))
plt.bar(range(len(avi_mean) - 64), avi_mean[64:], color=None, alpha=0.5)
_ = plt.title('Mean Adjusted Variable Importances for noisy features')
Let's compute, for each feature, the proportion of the bootstrapped AVI means that are negative or zero:
neg_rates = np.mean(bootstraped_avi <= 0, axis=0)
plt.figure(figsize=(16, 3))
plt.bar(range(len(neg_rates)), neg_rates, color=None, alpha=0.5)
_ = plt.title('Negative AVI rate')
Many of the original features with non-zero variance have a strictly positive distribution of the bootstrapped mean AVI (a negative rate of 0). We can therefore be confident that those variables are relevant to the prediction task at hand:
np.sum(neg_rates[:n_features_orig][np.var(X_orig, axis=0) != 0] <= 0)
27
Most of the noisy variables have a bootstrapped distribution of the mean AVI that includes zero or negative values:
np.mean(neg_rates[n_features_orig:] != 0)
0.98899999999999999
Flagging every feature with a non-zero negative-AVI rate as noisy would already provide a high-recall noisy-feature detector. However, such a broad threshold might also flag relevant variables as noisy.
Trimming the features with neg_rate > 0.5 and iterating might give a high-recall, high-precision feature selector.
np.mean(neg_rates > 0.5)
0.56484962406015038
def select_important(ensemble, X, y, threshold=0.5, n_iter=None, n_permutations=10, n_bootstraps=1000, seed=0):
    selected_features = np.arange(X.shape[1])
    X_selected = X.copy()
    while True:
        _, _, bootstraped_avi = bootstrap_adjusted_importances(
            ensemble, X_selected, y,
            n_permutations=n_permutations,
            n_bootstraps=n_bootstraps,
            seed=seed)
        neg_rates = np.mean(bootstraped_avi <= 0, axis=0)
        selection_mask = neg_rates <= threshold
        print("Keeping %d features out of %d." % (np.sum(selection_mask), X_selected.shape[1]))
        if np.all(selection_mask):
            # All features are selected
            break
        elif not np.any(selection_mask):
            # All features were detected as noisy
            return selected_features[selection_mask]
        # Else: apply the selection mask and iterate until reaching a fixed point
        X_selected = X_selected[:, selection_mask]
        selected_features = selected_features[selection_mask]
    return selected_features
selected = select_important(extra_trees, X_noise_2, y_orig, n_permutations=20, threshold=0.0)
print("Number of selected features: %d" % len(selected))
print("Number of selected features among original features: %d" % np.sum(selected < n_features_orig))
print("Number of selected noisy features: %d" % np.sum(selected >= n_features_orig))
Keeping 45 features out of 1064.
Keeping 37 features out of 45.
Keeping 37 features out of 37.
Number of selected features: 37
Number of selected features among original features: 32
Number of selected noisy features: 5
Univariate feature selection has a hyperparameter alpha that controls the selectivity of the selector:
from sklearn.feature_selection import SelectFdr, f_classif
alphas = np.logspace(-8, -2, 100)
total_selected = []
wrongly_selected = []
for alpha in alphas:
    univariate_selector = SelectFdr(f_classif, alpha=alpha).fit(X_noise_2, y_orig)
    selected = univariate_selector.get_support()
    total_selected.append(np.sum(selected))
    wrongly_selected.append(np.sum(selected[n_features_orig:]))
total_selected = np.array(total_selected)
wrongly_selected = np.array(wrongly_selected)
plt.semilogx(alphas, total_selected, label="# selected features")
plt.semilogx(alphas, wrongly_selected, label="# selected noisy features")
_ = plt.legend()
small_alphas = alphas < 1e-4
plt.semilogx(alphas[small_alphas], total_selected[small_alphas], label="# selected features")
plt.semilogx(alphas[small_alphas], wrongly_selected[small_alphas], label="# selected noisy features")
_ = plt.legend()
alphas[wrongly_selected == 0][-1]
9.9999999999999995e-08
total_selected[wrongly_selected == 0][-1]
24
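For comparison with the AVI-based selector: at the largest alpha that avoids selecting any noisy feature, the univariate selector keeps only 24 of the original features, whereas the iterative AVI selector above recovered 32 original features at the cost of 5 noisy ones.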
plt.figure(figsize=(16, 3))
plt.hist(bootstraped_avi[:, 10], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a relevant variable")
_ = plt.xlim(bootstraped_avi.min(), bootstraped_avi.max())
plt.figure(figsize=(16, 3))
plt.hist(bootstraped_avi[:, 28], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a relevant variable")
_ = plt.xlim(bootstraped_avi.min(), bootstraped_avi.max())
One can expect the mean AVI to be strictly positive for a relevant variable.
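One way to turn this observation into a selection rule would be to keep only the features whose lower bootstrap quantile is strictly positive. Here is a minimal sketch using the mquantiles function imported at the top (the 5% level is an arbitrary illustration choice, not part of the original analysis):
# Lower 5% quantile of the bootstrapped mean AVI, for each feature.
lower_q = mquantiles(bootstraped_avi, prob=[0.05], axis=0)[0]
confident = np.where(lower_q > 0)[0]
print("Features with a strictly positive 5%% quantile: %d" % len(confident))
print("... among the original features: %d" % np.sum(confident < n_features_orig))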
plt.figure(figsize=(16, 3))
plt.hist(bootstraped_avi[:, 145], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a noisy variable")
_ = plt.xlim(bootstraped_avi.min(), bootstraped_avi.max())
We can observe that it's not possible to reject the hypothesis that the mean AVI of this noisy variable is zero or less.
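This can be read directly from the bootstrap distribution: the fraction of bootstrapped means at or below zero acts as an empirical one-sided p-value. For instance, comparing this noisy variable with the relevant variable from the first histogram (features 145 and 10 above):
# Empirical one-sided "p-values": fraction of bootstrapped mean AVIs <= 0.
print("noisy variable 145:   %0.3f" % np.mean(bootstraped_avi[:, 145] <= 0))
print("relevant variable 10: %0.3f" % np.mean(bootstraped_avi[:, 10] <= 0))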
plt.figure(figsize=(16, 3))
plt.hist(bootstraped_avi[:, 99], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a noisy variable")
_ = plt.xlim(bootstraped_avi.min(), bootstraped_avi.max())
plt.figure(figsize=(16, 3))
plt.hist(bootstraped_avi[:, 400], bins=30)
plt.title("Distribution of the bootstrapped mean AVI for a noisy variable")
_ = plt.xlim(bootstraped_avi.min(), bootstraped_avi.max())