Notebook

In [1]:

## Why do Feature Selection ??

# Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
# Improves Accuracy: Less misleading data means modeling accuracy improves.
# Reduces Training Time: Less data means that algorithms train faster.

In [2]:

# Imports
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
np.random.seed(10)
import sklearn
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

print sklearn.__version__
print pd.__version__
print np.__version__

0.19.1
0.20.3
1.14.2

In [3]:

# this class takes care for scaling the features to the scale of 0-1
# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which 
# also has the range from 0-1
class Normalizer:

    def __init__(self):
        self.sc = MinMaxScaler()
    
    def scale(self, X, dtype):
        if dtype=='train':
            XX = self.sc.fit_transform(X)
        elif dtype=='test':
            XX = self.sc.transform(X)
        else:
            return None
        return XX

In [4]:

#### METHOD-1
# Univariate Feature Selection
###

# Select those features that have strong relationship with the output variable
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print df.head(2)
print '*'*20
norm = Normalizer()
X = norm.scale(df.iloc[:,:-1].values, 'train')
Y = df.iloc[:,-1].values
print X[:2], Y[:2]
print '*'*20
# feature extraction
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(X, Y)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
print '*'*20
features = fit.transform(X)
# summarize selected features
print(features[0:2,:])

# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique

     feat1  feat2  feat3  feat4            class
87     6.3    2.3    4.4    1.3  Iris-versicolor
111    6.4    2.7    5.3    1.9   Iris-virginica
********************
[[0.55555556 0.125      0.57627119 0.5       ]
 [0.58333333 0.29166667 0.72881356 0.75      ]] ['Iris-versicolor' 'Iris-virginica']
********************
[11.377  4.34  26.827 30.569]
********************
[[0.576 0.5  ]
 [0.729 0.75 ]]

Suggestions before you see the next cell¶

Read about chi-square test and get the intution.

In [5]:

#### METHOD-2
# Recursive Feature Elimination (Backwards)
###
# The Recursive Feature Elimination (or RFE) works by recursively removing attributes and
# building a model on those attributes that remain. It uses the model accuracy to identify which attributes 
# (and combination of attributes) contribute the most to predicting the target attribute

df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print df.head(2)
print '*'*20
norm = Normalizer()
X = norm.scale(df.iloc[:,:-1].values, 'train')
Y = df.iloc[:,-1].values
print X[:2], Y[:2]
print '*'*20
model = LogisticRegression()
rfe = RFE(model, 2)
fit = rfe.fit(X, Y)
print("Selected Features: %s") % fit.support_

# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique

     feat1  feat2  feat3  feat4            class
87     6.3    2.3    4.4    1.3  Iris-versicolor
100    6.3    3.3    6.0    2.5   Iris-virginica
********************
[[0.556 0.125 0.576 0.5  ]
 [0.556 0.542 0.847 1.   ]] ['Iris-versicolor' 'Iris-virginica']
********************
Selected Features: [False False  True  True]

Suggestions before you see the next cell¶

http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

In [6]:

#### METHOD-3
# Using Feature importance from any bagging algorithm
###
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print df.head(2)
print '*'*20
norm = Normalizer()
X = norm.scale(df.iloc[:,:-1].values, 'train')
Y = df.iloc[:,-1].values
print X[:2], Y[:2]
print '*'*20
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print model.feature_importances_

# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique

    feat1  feat2  feat3  feat4        class
36    5.5    3.5    1.3    0.2  Iris-setosa
43    5.0    3.5    1.6    0.6  Iris-setosa
********************
[[0.333 0.625 0.051 0.042]
 [0.194 0.625 0.102 0.208]] ['Iris-setosa' 'Iris-setosa']
********************
[0.083 0.086 0.447 0.384]

In [7]:

# Feature Selection and Elimination are mutually exclusive techniques for getting correct set of features for
# your ML model

# Few techniques for elimination are:
## Remove Zero/Less Variance columns
## Remove Columns with many missing values
## Remove Highly +ve/-ve co-related features

In [8]:

# Other techniques for feature selection include: 
## PCA Decomposition
## Auto Encoders

### we will see them in few upcoming days!!