#!/usr/bin/env python
# coding: utf-8

# ## Techniques for Feature Selection and Parameter Optimization

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

# In[2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# #### Import the Titanic data using pandas
# A modified version of the "Titanic" data can be found at: http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv.

# In[3]:

url = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv"
titanic = pd.read_csv(url)
titanic.head(10)

# In[4]:

titanic.describe(include="all")

# #### Handling missing values

# In[5]:

titanic[titanic.age.isnull()].shape

# In[6]:

# Impute missing ages with the mean age, then drop any remaining rows with missing values
age_mean = titanic.age.mean()
titanic['age'] = titanic['age'].fillna(age_mean)
titanic.dropna(axis=0, inplace=True)

# In[7]:

titanic.shape

# In[8]:

titanic.set_index('pid', drop=True, inplace=True)
titanic.head()

# #### Creating dummy variables for categorical features

# In[9]:

titanic_ssf = pd.get_dummies(titanic)
titanic_ssf.head(10)

# In[10]:

titanic_names = titanic_ssf.columns.values
print(titanic_names)

# In[11]:

y = titanic_ssf['survived']
X = titanic_ssf[titanic_names[1:]]
X.head()

# In[12]:

titanic_ssf.describe().T

# #### Build the training and test sets

# In[13]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

# In[14]:

# Now let's train the decision tree on the training data
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)

# #### A versatile function to measure the performance of a classification model

# In[15]:

from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True,
                        show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")

# In[16]:

measure_performance(X_test, y_test, dt, show_confusion_matrix=False)

# ### Feature Selection

# #### Select the top 30% most important features, using the chi2 test

# In[17]:

from sklearn import feature_selection

# In[18]:

fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=30)
X_train_fs = fs.fit_transform(X_train, y_train)

# In[19]:

np.set_printoptions(suppress=True, precision=2, linewidth=120)
print(list(X.columns))
print(fs.get_support())
print(fs.scores_)

# In[20]:

print(X.columns[fs.get_support()].values)

# In[21]:

for i in range(len(X.columns.values)):
    if fs.get_support()[i]:
        print("%10s %3.2f" % (X.columns.values[i], fs.scores_[i]))

# In[22]:

print(X_train_fs)

# #### Evaluate performance with the new feature set on the test data

# In[23]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confusion_matrix=False)

# #### To do feature selection more systematically, we need to find the best percentile using cross-validation

# In[24]:

from sklearn.model_selection import cross_val_score

dt = tree.DecisionTreeClassifier(criterion='entropy')
percentiles = range(1, 100, 5)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    print("%2d %0.4f" % (i, scores.mean()))
    results = np.append(results, scores.mean())
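# #### Aside: the same percentile search expressed as a Pipeline
# The manual loop above refits the selector and scores the tree for each percentile. A minimal
# sketch of an equivalent formulation using scikit-learn's Pipeline with GridSearchCV follows
# (the step names 'fs'/'dt' and the variable names are illustrative); scores may differ slightly
# because the selector is re-fit inside each cross-validation fold.

# In[ ]:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Chain chi2 percentile selection with the decision tree so feature selection
# is re-fit on the training portion of every cross-validation fold
pipe = Pipeline([
    ('fs', feature_selection.SelectPercentile(feature_selection.chi2)),
    ('dt', tree.DecisionTreeClassifier(criterion='entropy'))
])

# Search over the same percentile grid used in the manual loop above
pipe_gs = GridSearchCV(pipe, {'fs__percentile': list(range(1, 100, 5))}, cv=5)
pipe_gs.fit(X_train, y_train)
print(pipe_gs.best_params_, "{0:.4f}".format(pipe_gs.best_score_))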
# In[25]:

optimal_percentile_ind = np.where(results == results.max())[0][0]
print(optimal_percentile_ind)

# In[26]:

print("Optimal percentile of features:{0}".format(percentiles[optimal_percentile_ind]), "\n")

optimal_num_features = int(percentiles[optimal_percentile_ind] * len(X.columns) / 100)
print("Optimal number of features:{0}".format(optimal_num_features), "\n")

# Plot percentile of features vs. cross-validation accuracy
import pylab as pl
pl.figure()
pl.xlabel("Percentage of features selected")
pl.ylabel("Cross validation accuracy")
pl.plot(percentiles, results)

# ### Evaluate our best number of features on the test set

# In[27]:

fs = feature_selection.SelectKBest(feature_selection.chi2, k=optimal_num_features)
X_train_fs = fs.fit_transform(X_train, y_train)
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confusion_matrix=False)

# ### Model selection

# #### Exploring and comparing model parameters

# In[28]:

print(dt.get_params())

# #### Let's first focus on the 'criterion' parameter and find the best value

# In[29]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
scores = cross_val_score(dt, X_train, y_train, cv=5)
print("Entropy criterion accuracy on cv: {0:.3f}".format(scores.mean()))

dt = tree.DecisionTreeClassifier(criterion='gini')
scores = cross_val_score(dt, X_train, y_train, cv=5)
print("Gini criterion accuracy on cv: {0:.3f}".format(scores.mean()))

# In[30]:

# Now we can fit the model to the full training data using the desired parameters
# and apply the model to the set-aside test data
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=True)

# #### Another decision tree parameter that can have an impact on accuracy is 'max_depth'

# In[31]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(max_depth=5)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=False)
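# #### Inspecting the depth-limited tree
# As a quick sanity check, we can print the fitted tree's decision rules to see how the
# max_depth setting constrains its structure. A minimal sketch, assuming a scikit-learn
# version that provides tree.export_text (0.21 or later).

# In[ ]:

from sklearn.tree import export_text

# Print the rules of the tree fitted above (max_depth=5), labeled with the column names of X
print(export_text(dt, feature_names=list(X.columns)))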
# #### But, again, we need a more systematic way to explore the space of values for each parameter.
# The following is a general function that performs cross-validation over a range of values for a specified parameter of a model.

# In[32]:

from sklearn.model_selection import KFold

def calc_params(X, y, clf, param_values, param_name, K):
    # Convert input to NumPy arrays
    X = np.array(X)
    y = np.array(y)

    # initialize training and testing score arrays with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))

    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print(param_name, ' = ', param_value)

        # set classifier parameters
        clf.set_params(**{param_name: param_value})

        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)

        # create KFold cross validation
        cv = KFold(n_splits=K, shuffle=True, random_state=0)

        # iterate over the K folds
        j = 0
        for train, test in cv.split(X):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit(X[train], y[train])
            k_train_scores[j] = clf.score(X[train], y[train])
            k_test_scores[j] = clf.score(X[test], y[test])
            j += 1

        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)

    # plot the mean training and cross-validation scores for each parameter value
    plt.plot(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.plot(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    # return the training and testing scores for each parameter value
    return train_scores, test_scores

# #### Now we can explore the impact of max_depth more systematically

# In[33]:

# Let's create an evenly spaced range of numbers in a specified interval
md = np.linspace(1, 40, 20)
md = np.array([int(e) for e in md])
print(md)

# In[34]:

train_scores, test_scores = calc_params(X_train, y_train, dt, md, 'max_depth', 5)

# #### max_depth = 3 seems to work best; larger values seem to lead to over-fitting.

# In[35]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(max_depth=3)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=False)

# #### Another important decision tree parameter is the minimum number of samples allowed at a leaf node ('min_samples_leaf')

# In[36]:

msl = np.linspace(1, 30, 15)
msl = np.array([int(e) for e in msl])
dt = tree.DecisionTreeClassifier(criterion='entropy')
train_scores, test_scores = calc_params(X_train, y_train, dt, msl, 'min_samples_leaf', 5)
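# #### Note: scikit-learn's validation_curve does essentially what calc_params does
# A minimal sketch (variable names are illustrative): validation_curve returns per-fold train
# and cross-validation scores for each candidate value of a single parameter, which we can
# average and plot much like calc_params.

# In[ ]:

from sklearn.model_selection import validation_curve

# Per-fold scores for each candidate min_samples_leaf value; shape is (len(msl), n_folds)
vc_train, vc_test = validation_curve(tree.DecisionTreeClassifier(criterion='entropy'),
                                     X_train, y_train,
                                     param_name='min_samples_leaf', param_range=msl, cv=5)

plt.figure()
plt.plot(msl, vc_train.mean(axis=1), label='Train')
plt.plot(msl, vc_test.mean(axis=1), label='X-Val')
plt.legend(loc=7)
plt.xlabel("min_samples_leaf values")
plt.ylabel("Mean cross validation accuracy")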
# #### A min_samples_leaf value around 21 seems like a good choice. Let's now combine these optimal parameter values in our final model and fit it to the full training data.

# In[37]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(min_samples_leaf=21, max_depth=3)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False)

# #### Grid search allows us to explore different combinations of multiple parameters more systematically

# In[38]:

from sklearn.model_selection import GridSearchCV

dt = tree.DecisionTreeClassifier()

parameters = {
    'criterion': ['entropy', 'gini'],
    'max_depth': np.linspace(1, 20, 10, dtype=int),
    'min_samples_leaf': np.linspace(1, 30, 15, dtype=int),
    'min_samples_split': np.linspace(2, 20, 10, dtype=int)
}

gs = GridSearchCV(dt, parameters, verbose=1, cv=5)

# In[39]:

get_ipython().run_line_magic('time', '_ = gs.fit(X_train, y_train)')
gs.best_params_, gs.best_score_

# In[40]:

# Fit a final model on the training data using the best parameter combination found by the grid search
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=3, min_samples_split=2)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=True)
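# #### A cheaper alternative: randomized parameter search
# When the grid gets large, RandomizedSearchCV samples a fixed number of parameter combinations
# instead of trying them all. A minimal sketch; the distributions and n_iter below are
# illustrative choices, not values taken from the analysis above.

# In[ ]:

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'criterion': ['entropy', 'gini'],
    'max_depth': randint(1, 21),         # uniform over 1..20
    'min_samples_leaf': randint(1, 31),  # uniform over 1..30
    'min_samples_split': randint(2, 21)  # uniform over 2..20
}

# Sample 50 parameter combinations instead of exhausting the full grid
rs = RandomizedSearchCV(tree.DecisionTreeClassifier(), param_dist, n_iter=50, cv=5, random_state=33)
rs.fit(X_train, y_train)
rs.best_params_, rs.best_score_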