## Why do Feature Selection ??
# Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
# Improves Accuracy: Less misleading data means modeling accuracy improves.
# Reduces Training Time: Less data means that algorithms train faster.
# Imports
from sklearn.utils import shuffle
import pandas as pd
import numpy as np
np.random.seed(10)
import sklearn
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
print sklearn.__version__
print pd.__version__
print np.__version__
0.19.1 0.20.3 1.14.2
# this class takes care for scaling the features to the scale of 0-1
# we are doing the scaling with this cap because we use sigmoid activation fxn in logistic which
# also has the range from 0-1
class Normalizer:
def __init__(self):
self.sc = MinMaxScaler()
def scale(self, X, dtype):
if dtype=='train':
XX = self.sc.fit_transform(X)
elif dtype=='test':
XX = self.sc.transform(X)
else:
return None
return XX
#### METHOD-1
# Univariate Feature Selection
###
# Select those features that have strong relationship with the output variable
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print df.head(2)
print '*'*20
norm = Normalizer()
X = norm.scale(df.iloc[:,:-1].values, 'train')
Y = df.iloc[:,-1].values
print X[:2], Y[:2]
print '*'*20
# feature extraction
test = SelectKBest(score_func=chi2, k=2)
fit = test.fit(X, Y)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)
print '*'*20
features = fit.transform(X)
# summarize selected features
print(features[0:2,:])
# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique
feat1 feat2 feat3 feat4 class 87 6.3 2.3 4.4 1.3 Iris-versicolor 111 6.4 2.7 5.3 1.9 Iris-virginica ******************** [[0.55555556 0.125 0.57627119 0.5 ] [0.58333333 0.29166667 0.72881356 0.75 ]] ['Iris-versicolor' 'Iris-virginica'] ******************** [11.377 4.34 26.827 30.569] ******************** [[0.576 0.5 ] [0.729 0.75 ]]
#### METHOD-2
# Recursive Feature Elimination (Backwards)
###
# The Recursive Feature Elimination (or RFE) works by recursively removing attributes and
# building a model on those attributes that remain. It uses the model accuracy to identify which attributes
# (and combination of attributes) contribute the most to predicting the target attribute
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print df.head(2)
print '*'*20
norm = Normalizer()
X = norm.scale(df.iloc[:,:-1].values, 'train')
Y = df.iloc[:,-1].values
print X[:2], Y[:2]
print '*'*20
model = LogisticRegression()
rfe = RFE(model, 2)
fit = rfe.fit(X, Y)
print("Selected Features: %s") % fit.support_
# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique
feat1 feat2 feat3 feat4 class 87 6.3 2.3 4.4 1.3 Iris-versicolor 100 6.3 3.3 6.0 2.5 Iris-virginica ******************** [[0.556 0.125 0.576 0.5 ] [0.556 0.542 0.847 1. ]] ['Iris-versicolor' 'Iris-virginica'] ******************** Selected Features: [False False True True]
#### METHOD-3
# Using Feature importance from any bagging algorithm
###
df = pd.read_csv('../data/day1/iris.csv')
df = shuffle(df)
print df.head(2)
print '*'*20
norm = Normalizer()
X = norm.scale(df.iloc[:,:-1].values, 'train')
Y = df.iloc[:,-1].values
print X[:2], Y[:2]
print '*'*20
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print model.feature_importances_
# we see that feat3 and feat4 are good enough for this problem as per this feature selection technique
feat1 feat2 feat3 feat4 class 36 5.5 3.5 1.3 0.2 Iris-setosa 43 5.0 3.5 1.6 0.6 Iris-setosa ******************** [[0.333 0.625 0.051 0.042] [0.194 0.625 0.102 0.208]] ['Iris-setosa' 'Iris-setosa'] ******************** [0.083 0.086 0.447 0.384]
# Feature Selection and Elimination are mutually exclusive techniques for getting correct set of features for
# your ML model
# Few techniques for elimination are:
## Remove Zero/Less Variance columns
## Remove Columns with many missing values
## Remove Highly +ve/-ve co-related features
# Other techniques for feature selection include:
## PCA Decomposition
## Auto Encoders
### we will see them in few upcoming days!!