#!/usr/bin/env python
# coding: utf-8

# ## Techniques for Feature Selection and Parameter Optimization

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

# In[2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# #### Import the Titanic data using pandas
# A modified version of the "Titanic" data can be found at: http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv.

# In[3]:

url = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv"
titanic = pd.read_csv(url)
titanic.head(10)

# In[4]:

titanic.describe(include="all")

# #### Handling missing values

# In[5]:

titanic[titanic.age.isnull()].shape

# In[6]:

# Impute missing ages with the mean age, then drop any remaining rows with missing values
age_mean = titanic.age.mean()
titanic['age'] = titanic['age'].fillna(age_mean)
titanic.dropna(axis=0, inplace=True)

# In[7]:

titanic.shape

# In[8]:

titanic.set_index('pid', drop=True, inplace=True)
titanic.head()

# #### Creating dummy variables for categorical features

# In[9]:

titanic_ssf = pd.get_dummies(titanic)
titanic_ssf.head(10)

# In[10]:

titanic_names = titanic_ssf.columns.values
print(titanic_names)

# In[11]:

y = titanic_ssf['survived']
X = titanic_ssf[titanic_names[1:]]
X.head()

# In[12]:

titanic_ssf.describe().T

# #### Build the training and test sets

# In[13]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=33)

# In[14]:

# Now let's train the decision tree on the training data
from sklearn import tree

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)

# #### A versatile function to measure the performance of a classification model

# In[15]:

from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True,
                        show_confusion_matrix=True):
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confusion_matrix:
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")

# In[16]:

measure_performance(X_test, y_test, dt, show_confusion_matrix=False)

# ### Feature Selection

# #### Select the top 30% most important features, using the chi2 test

# In[17]:

from sklearn import feature_selection

# In[18]:

fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=30)
X_train_fs = fs.fit_transform(X_train, y_train)

# In[19]:

np.set_printoptions(suppress=True, precision=2, linewidth=120)
print(list(X.columns))
print(fs.get_support())
print(fs.scores_)

# In[20]:

print(X.columns[fs.get_support()].values)

# In[21]:

for i in range(len(X.columns.values)):
    if fs.get_support()[i]:
        print("%10s %3.2f" % (X.columns.values[i], fs.scores_[i]))

# In[22]:

print(X_train_fs)

# #### Evaluate performance with the new feature set on the test data

# In[23]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confusion_matrix=False)

# #### To do feature selection more systematically, we need to find the best percentile using cross-validation

# In[24]:

from sklearn.model_selection import cross_val_score

dt = tree.DecisionTreeClassifier(criterion='entropy')
percentiles = range(1, 100, 5)
results = []
for i in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    print("%2d %0.4f" % (i, scores.mean()))
    results = np.append(results, scores.mean())
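# #### Aside: the same percentile search expressed as a Pipeline
# The manual loop above refits the selector and scores the tree for each percentile. A minimal
# sketch of an equivalent formulation using scikit-learn's Pipeline with GridSearchCV follows
# (the step names 'fs'/'dt' and the variable names are illustrative); scores may differ slightly
# because the selector is re-fit inside each cross-validation fold.

# In[ ]:

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Chain chi2 percentile selection with the decision tree so feature selection
# is re-fit on the training portion of every cross-validation fold
pipe = Pipeline([
    ('fs', feature_selection.SelectPercentile(feature_selection.chi2)),
    ('dt', tree.DecisionTreeClassifier(criterion='entropy'))
])

# Search over the same percentile grid used in the manual loop above
pipe_gs = GridSearchCV(pipe, {'fs__percentile': list(range(1, 100, 5))}, cv=5)
pipe_gs.fit(X_train, y_train)
print(pipe_gs.best_params_, "{0:.4f}".format(pipe_gs.best_score_))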
# In[25]:

optimal_percentile_ind = np.where(results == results.max())[0][0]
print(optimal_percentile_ind)

# In[26]:

print("Optimal percentile of features:{0}".format(percentiles[optimal_percentile_ind]), "\n")

optimal_num_features = int(percentiles[optimal_percentile_ind] * len(X.columns) / 100)
print("Optimal number of features:{0}".format(optimal_num_features), "\n")

# Plot percentile of features vs. cross-validation accuracy
import pylab as pl
pl.figure()
pl.xlabel("Percentage of features selected")
pl.ylabel("Cross validation accuracy")
pl.plot(percentiles, results)

# ### Evaluate our best number of features on the test set

# In[27]:

fs = feature_selection.SelectKBest(feature_selection.chi2, k=optimal_num_features)
X_train_fs = fs.fit_transform(X_train, y_train)
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confusion_matrix=False)

# ### Model selection

# #### Exploring and comparing model parameters

# In[28]:

print(dt.get_params())

# #### Let's first focus on the 'criterion' parameter and find the best value

# In[29]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
scores = cross_val_score(dt, X_train, y_train, cv=5)
print("Entropy criterion accuracy on cv: {0:.3f}".format(scores.mean()))

dt = tree.DecisionTreeClassifier(criterion='gini')
scores = cross_val_score(dt, X_train, y_train, cv=5)
print("Gini criterion accuracy on cv: {0:.3f}".format(scores.mean()))

# In[30]:

# Now we can fit the model to the full training data using the desired parameters
# and apply the model to the set-aside test data
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=True)

# #### Another decision tree parameter that can have an impact on accuracy is 'max_depth'

# In[31]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(max_depth=5)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=False)
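# #### Inspecting the depth-limited tree
# As a quick sanity check, we can print the fitted tree's decision rules to see how the
# max_depth setting constrains its structure. A minimal sketch, assuming a scikit-learn
# version that provides tree.export_text (0.21 or later).

# In[ ]:

from sklearn.tree import export_text

# Print the rules of the tree fitted above (max_depth=5), labeled with the column names of X
print(export_text(dt, feature_names=list(X.columns)))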
# #### But, again, we need a more systematic way to explore the space of values for each parameter.
# The following is a general function that performs cross-validation over a range of values for a specified parameter of a model.

# In[32]:

from sklearn.model_selection import KFold

def calc_params(X, y, clf, param_values, param_name, K):
    # Convert input to NumPy arrays
    X = np.array(X)
    y = np.array(y)

    # initialize training and testing score arrays with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))

    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print(param_name, ' = ', param_value)

        # set classifier parameters
        clf.set_params(**{param_name: param_value})

        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)

        # create KFold cross validation
        cv = KFold(n_splits=K, shuffle=True, random_state=0)

        # iterate over the K folds
        j = 0
        for train, test in cv.split(X):
            # fit the classifier in the corresponding fold
            # and obtain the corresponding accuracy scores on train and test sets
            clf.fit(X[train], y[train])
            k_train_scores[j] = clf.score(X[train], y[train])
            k_test_scores[j] = clf.score(X[test], y[test])
            j += 1

        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)

    # plot the mean training and cross-validation scores for each parameter value
    plt.plot(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.plot(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")

    # return the training and testing scores for each parameter value
    return train_scores, test_scores

# #### Now we can explore the impact of max_depth more systematically

# In[33]:

# Let's create an evenly spaced range of numbers in a specified interval
md = np.linspace(1, 40, 20)
md = np.array([int(e) for e in md])
print(md)

# In[34]:

train_scores, test_scores = calc_params(X_train, y_train, dt, md, 'max_depth', 5)

# #### max_depth = 3 seems to work best; larger values seem to lead to over-fitting.

# In[35]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(max_depth=3)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=False)

# #### Another important decision tree parameter is the minimum number of samples allowed at a leaf node ('min_samples_leaf')

# In[36]:

msl = np.linspace(1, 30, 15)
msl = np.array([int(e) for e in msl])
dt = tree.DecisionTreeClassifier(criterion='entropy')
train_scores, test_scores = calc_params(X_train, y_train, dt, msl, 'min_samples_leaf', 5)
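# #### Note: scikit-learn's validation_curve does essentially what calc_params does
# A minimal sketch (variable names are illustrative): validation_curve returns per-fold train
# and cross-validation scores for each candidate value of a single parameter, which we can
# average and plot much like calc_params.

# In[ ]:

from sklearn.model_selection import validation_curve

# Per-fold scores for each candidate min_samples_leaf value; shape is (len(msl), n_folds)
vc_train, vc_test = validation_curve(tree.DecisionTreeClassifier(criterion='entropy'),
                                     X_train, y_train,
                                     param_name='min_samples_leaf', param_range=msl, cv=5)

plt.figure()
plt.plot(msl, vc_train.mean(axis=1), label='Train')
plt.plot(msl, vc_test.mean(axis=1), label='X-Val')
plt.legend(loc=7)
plt.xlabel("min_samples_leaf values")
plt.ylabel("Mean cross validation accuracy")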
# #### A min_samples_leaf value around 21 seems like a good choice. Let's now combine these optimal parameter values in our final model and fit it to the full training data.

# In[37]:

dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(min_samples_leaf=21, max_depth=3)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False)

# #### Grid search allows us to explore different combinations of multiple parameters more systematically

# In[38]:

from sklearn.model_selection import GridSearchCV

dt = tree.DecisionTreeClassifier()

parameters = {
    'criterion': ['entropy', 'gini'],
    'max_depth': np.linspace(1, 20, 10, dtype=int),
    'min_samples_leaf': np.linspace(1, 30, 15, dtype=int),
    'min_samples_split': np.linspace(2, 20, 10, dtype=int)
}

gs = GridSearchCV(dt, parameters, verbose=1, cv=5)

# In[39]:

get_ipython().run_line_magic('time', '_ = gs.fit(X_train, y_train)')
gs.best_params_, gs.best_score_

# In[40]:

# Fit a final model on the training data using the best parameter combination found by the grid search
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=3, min_samples_split=2)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confusion_matrix=False, show_classification_report=True)
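# #### A cheaper alternative: randomized parameter search
# When the grid gets large, RandomizedSearchCV samples a fixed number of parameter combinations
# instead of trying them all. A minimal sketch; the distributions and n_iter below are
# illustrative choices, not values taken from the analysis above.

# In[ ]:

from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'criterion': ['entropy', 'gini'],
    'max_depth': randint(1, 21),         # uniform over 1..20
    'min_samples_leaf': randint(1, 31),  # uniform over 1..30
    'min_samples_split': randint(2, 21)  # uniform over 2..20
}

# Sample 50 parameter combinations instead of exhausting the full grid
rs = RandomizedSearchCV(tree.DecisionTreeClassifier(), param_dist, n_iter=50, cv=5, random_state=33)
rs.fit(X_train, y_train)
rs.best_params_, rs.best_score_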