%pylab inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
A modified version of the "Titanic" dataset can be found at: http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv. The original, unmodified Titanic data is available from CMU StatLib.
# Load the trimmed Titanic dataset and clean it up for modeling.
url = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/Data/titanic-trimmed.csv"
titanic = pd.read_csv(url)
titanic.head(10)
titanic.describe(include="all")

# How many rows are missing an age before imputation?
titanic[titanic.age.isnull()].shape

# Impute missing ages with the column mean. Assigning the result back
# avoids the inplace-fillna-on-a-column-selection pitfall (chained
# assignment), which is deprecated in modern pandas and may silently
# fail to modify the underlying frame.
age_mean = titanic.age.mean()
titanic['age'] = titanic['age'].fillna(age_mean)

# Drop any rows still containing missing values in other columns.
titanic.dropna(axis=0, inplace=True)
titanic.shape

# Use the passenger id as the index so it is not treated as a feature.
titanic.set_index('pid', drop=True, inplace=True)
titanic.head()
# One-hot encode the categorical columns; numeric columns pass through.
titanic_ssf = pd.get_dummies(titanic)
titanic_ssf.head(10)

# 'survived' is the first column; every column after it is a feature.
titanic_names = titanic_ssf.columns.values
print(titanic_names)

# Target vector and feature matrix.
y = titanic_ssf.survived
X = titanic_ssf.loc[:, titanic_names[1:]]
X.head()
titanic_ssf.describe().T
from sklearn.model_selection import train_test_split

# Hold out 20% of the data for final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=33)

# Now let's train the decision tree on the training data.
from sklearn import tree
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train, y_train)
from sklearn import metrics

def measure_performance(X, y, clf, show_accuracy=True, show_classification_report=True, show_confussion_matrix=True):
    """Print selected evaluation metrics for a fitted classifier.

    Parameters
    ----------
    X : array-like
        Feature matrix to predict on.
    y : array-like
        True labels corresponding to X.
    clf : fitted estimator
        Any object exposing a ``predict(X)`` method.
    show_accuracy : bool
        Print the overall accuracy score.
    show_classification_report : bool
        Print per-class precision/recall/F1.
    show_confussion_matrix : bool
        Print the confusion matrix. (Parameter name keeps the original
        misspelling for backward compatibility with existing callers.)
    """
    y_pred = clf.predict(X)
    if show_accuracy:
        print("Accuracy:{0:.3f}".format(metrics.accuracy_score(y, y_pred)), "\n")
    if show_classification_report:
        print("Classification report")
        print(metrics.classification_report(y, y_pred), "\n")
    if show_confussion_matrix:
        # Fixed typo in the printed heading ("Confussion" -> "Confusion").
        print("Confusion matrix")
        print(metrics.confusion_matrix(y, y_pred), "\n")

from sklearn import metrics
measure_performance(X_test, y_test, dt, show_confussion_matrix=False)
from sklearn import feature_selection

# Keep the top 30% of features ranked by chi-squared score.
fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=30)
X_train_fs = fs.fit_transform(X_train, y_train)

np.set_printoptions(suppress=True, precision=2, linewidth=120)
print(list(X.columns))
print(fs.get_support())
print(fs.scores_)
print(X.columns[fs.get_support()].values)

# Show the chi2 score next to each selected feature name.
selected_mask = fs.get_support()
for idx, col_name in enumerate(X.columns.values):
    if selected_mask[idx]:
        print("%10s %3.2f" % (col_name, fs.scores_[idx]))
print(X_train_fs)
# Retrain the tree on the reduced feature set and evaluate on the test set.
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False)
from sklearn.model_selection import cross_val_score

# Sweep feature-selection percentiles and cross-validate a fresh tree on
# each reduced training set to find the best percentile of features.
dt = tree.DecisionTreeClassifier(criterion='entropy')
percentiles = range(1, 100, 5)
results = []
# Iterate over `percentiles` directly instead of re-spelling the range.
for percentile in percentiles:
    fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=percentile)
    X_train_fs = fs.fit_transform(X_train, y_train)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    print("%2d %0.4f" % (percentile, scores.mean()))
    results.append(scores.mean())
results = np.array(results)

# np.argmax gives the first index of the maximum mean CV score
# (idiomatic replacement for np.where(results == results.max())[0][0],
# which the original computed twice).
optimal_percentile_ind = int(np.argmax(results))
print(optimal_percentile_ind)
print("Optimal percentile of features:{0}".format(percentiles[optimal_percentile_ind]), "\n")

# Convert the winning percentile into an absolute feature count.
optimal_num_features = int(percentiles[optimal_percentile_ind]*len(X.columns)/100)
print("Optimal number of features:{0}".format(optimal_num_features), "\n")
# Plot percentile of features vs. cross-validation scores.
import pylab as pl
pl.figure()
pl.plot(percentiles, results)
pl.xlabel("Percentage of features selected")
pl.ylabel("Cross validation accuracy")
# Refit using the best k features ranked by chi2. In modern scikit-learn
# every SelectKBest parameter after score_func is keyword-only, so `k`
# must be passed by name (the original positional call raises TypeError).
fs = feature_selection.SelectKBest(feature_selection.chi2, k=optimal_num_features)
X_train_fs = fs.fit_transform(X_train, y_train)
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt, show_confussion_matrix=False)
print(dt.get_params())

# Compare the two split criteria via 5-fold cross validation on the
# reduced training features; output is identical to the unrolled version.
for label, crit in (('Entropy', 'entropy'), ('Gini', 'gini')):
    dt = tree.DecisionTreeClassifier(criterion=crit)
    scores = cross_val_score(dt, X_train_fs, y_train, cv=5)
    print("{0} criterion accuracy on cv: {1:.3f}".format(label, scores.mean()))
# Now we can fit the model to the full training data using the optimal
# features and the desired parameters, then apply it to the set-aside
# test data.
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt = dt.fit(X_train_fs, y_train)
X_test_fs = fs.transform(X_test)
measure_performance(X_test_fs, y_test, dt,
                    show_confussion_matrix=False, show_classification_report=True)

# Try capping tree depth at 5 on the full feature set (passing the
# parameter through the constructor instead of set_params).
dt = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt,
                    show_confussion_matrix=False, show_classification_report=False)
from sklearn.model_selection import KFold

def calc_params(X, y, clf, param_values, param_name, K):
    """Sweep one hyperparameter and K-fold cross-validate each setting.

    For every value in ``param_values``, sets ``param_name`` on ``clf``,
    runs K-fold CV on (X, y), and records the mean train/test accuracy.
    Also plots both curves against the parameter values.

    Parameters
    ----------
    X, y : array-like
        Training features and labels (converted to numpy arrays).
    clf : estimator
        Classifier exposing set_params/fit/score.
    param_values : sequence
        Candidate values for the swept parameter.
    param_name : str
        Name of the estimator parameter to vary.
    K : int
        Number of cross-validation folds.

    Returns
    -------
    (train_scores, test_scores) : tuple of np.ndarray
        Mean CV accuracy per parameter value, on train and test folds.
    """
    # Convert input to Numpy arrays so fold index arrays work directly.
    X = np.array(X)
    y = np.array(y)
    # initialize training and testing score arrays with zeros
    train_scores = np.zeros(len(param_values))
    test_scores = np.zeros(len(param_values))
    # iterate over the different parameter values
    for i, param_value in enumerate(param_values):
        print(param_name, ' = ', param_value)
        # set classifier parameters
        clf.set_params(**{param_name: param_value})
        # initialize the K scores obtained for each fold
        k_train_scores = np.zeros(K)
        k_test_scores = np.zeros(K)
        # create KFold cross validation (fixed seed for reproducibility)
        cv = KFold(n_splits=K, shuffle=True, random_state=0)
        # iterate over the K folds; enumerate replaces the original
        # manual `j = 0 ... j += 1` counter
        for j, (train, test) in enumerate(cv.split(X)):
            # fit the classifier in the corresponding fold and obtain
            # the corresponding accuracy scores on train and test sets
            clf.fit(X[train], y[train])
            k_train_scores[j] = clf.score(X[train], y[train])
            k_test_scores[j] = clf.score(X[test], y[test])
        # store the mean of the K fold scores
        train_scores[i] = np.mean(k_train_scores)
        test_scores[i] = np.mean(k_test_scores)
    # plot the training and testing scores
    plt.plot(param_values, train_scores, label='Train', alpha=0.4, lw=2, c='b')
    plt.plot(param_values, test_scores, label='X-Val', alpha=0.4, lw=2, c='g')
    plt.legend(loc=7)
    plt.xlabel(param_name + " values")
    plt.ylabel("Mean cross validation accuracy")
    # return the training and testing scores on each parameter value
    return train_scores, test_scores
# Evenly spaced integer candidate depths across [1, 40].
md = np.linspace(1, 40, 20).astype(int)
print(md)
train_scores, test_scores = calc_params(X_train, y_train, dt, md, 'max_depth', 5)

# Depth 3 looked best in the sweep; evaluate it on the held-out test set.
dt = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt,
                    show_confussion_matrix=False, show_classification_report=False)
# Sweep min_samples_leaf over integer values in [1, 30].
msl = np.linspace(1, 30, 15).astype(int)
dt = tree.DecisionTreeClassifier(criterion='entropy')
train_scores, test_scores = calc_params(X_train, y_train, dt, msl, 'min_samples_leaf', 5)

# Combine the chosen leaf size with the chosen depth and evaluate.
dt = tree.DecisionTreeClassifier(criterion='entropy')
dt.set_params(min_samples_leaf=11, max_depth=3)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confussion_matrix=False)
from sklearn.model_selection import GridSearchCV
dt = tree.DecisionTreeClassifier()
parameters = {
'criterion': ['entropy','gini'],
'max_depth': np.linspace(1, 20, 10, dtype=int),
'min_samples_leaf': np.linspace(1, 30, 15, dtype=int),
'min_samples_split': np.linspace(2, 20, 10, dtype=int)
}
gs = GridSearchCV(dt, parameters, verbose=1, cv=5)
%time _ = gs.fit(X_train, y_train)
gs.best_params_, gs.best_score_
dt = tree.DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=3, min_samples_split=2)
dt.fit(X_train, y_train)
measure_performance(X_test, y_test, dt, show_confussion_matrix=False, show_classification_report=True)