# ### Overview # - [Streamlining workflows with pipelines](#Streamlining-workflows-with-pipelines) # - [Loading the Breast Cancer Wisconsin dataset](#Loading-the-Breast-Cancer-Wisconsin-dataset) # - [Combining transformers and estimators in a pipeline](#Combining-transformers-and-estimators-in-a-pipeline) # - [Using k-fold cross-validation to assess model performance](#Using-k-fold-cross-validation-to-assess-model-performance) # - [The holdout method](#The-holdout-method) # - [K-fold cross-validation](#K-fold-cross-validation) # - [Debugging algorithms with learning and validation curves](#Debugging-algorithms-with-learning-and-validation-curves) # - [Diagnosing bias and variance problems with learning curves](#Diagnosing-bias-and-variance-problems-with-learning-curves) # - [Addressing overfitting and underfitting with validation curves](#Addressing-overfitting-and-underfitting-with-validation-curves) # - [Fine-tuning machine learning models via grid search](#Fine-tuning-machine-learning-models-via-grid-search) # - [Tuning hyperparameters via grid search](#Tuning-hyperparameters-via-grid-search) # - [Algorithm selection with nested cross-validation](#Algorithm-selection-with-nested-cross-validation) # - [Looking at different performance evaluation metrics](#Looking-at-different-performance-evaluation-metrics) # - [Reading a confusion matrix](#Reading-a-confusion-matrix) # - [Optimizing the precision and recall of a classification model](#Optimizing-the-precision-and-recall-of-a-classification-model) # - [Plotting a receiver operating characteristic](#Plotting-a-receiver-operating-characteristic) # - [The scoring metrics for multiclass classification](#The-scoring-metrics-for-multiclass-classification) # - [Summary](#Summary) #
# In[2]:

from IPython.display import Image
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:

# Added version check for recent scikit-learn 0.18 checks
# NOTE(review): distutils was removed from the standard library in
# Python 3.12; this import needs a replacement before running there.
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version


# # Streamlining workflows with pipelines

# ...

# ## Loading the Breast Cancer Wisconsin dataset

# In[4]:

import pandas as pd
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.error` has been loaded, so `urllib.error.URLError` could raise
# AttributeError exactly when the fallback below is needed.
import urllib.error

try:
    # Primary source: the UCI machine learning repository.
    df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases'
                     '/breast-cancer-wisconsin/wdbc.data', header=None)
except urllib.error.URLError:
    # Fallback mirror, used when the primary URL cannot be fetched.
    df = pd.read_csv('https://raw.githubusercontent.com/rasbt/'
                     'python-machine-learning-book/master/code/'
                     'datasets/wdbc/wdbc.data', header=None)
print('rows, columns:', df.shape)
df.head()


# In[5]:

df.shape


# In[6]:

from sklearn.preprocessing import LabelEncoder

# Columns 2 onwards are used as features; column 1 holds the class labels.
X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
# Show the integer codes assigned to the two string labels.
le.transform(['M', 'B'])


# In[7]:

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18.
if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set.
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.20, random_state=1)
# ## Combining transformers and estimators in a pipeline

# In[8]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Chain scaling, a 2-component PCA and logistic regression so that they are
# fitted and applied together as a single estimator.
pipe_lr = Pipeline(steps=[
    ('scl', StandardScaler()),
    ('pca', PCA(n_components=2)),
    ('clf', LogisticRegression(random_state=1)),
])

pipe_lr.fit(X_train, y_train)
print('Test Accuracy: %.3f' % pipe_lr.score(X_test, y_test))
y_pred = pipe_lr.predict(X_test)


# In[9]:

Image(filename='./images/06_01.png', width=500)
# # Using k-fold cross validation to assess model performance

# ...

# ## The holdout method

# In[10]:

# Display the figure that accompanies this section.
Image(filename='./images/06_02.png',
      width=500)
# ## K-fold cross-validation

# In[11]:

Image(filename='./images/06_03.png', width=500)


# In[12]:

import numpy as np

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import StratifiedKFold
    # Legacy API: labels go to the constructor and the object itself is
    # iterated to obtain the (train, test) index pairs.
    kfold = StratifiedKFold(y=y_train,
                            n_folds=10,
                            random_state=1)
else:
    from sklearn.model_selection import StratifiedKFold
    # random_state is deliberately not passed: shuffle defaults to False, so
    # a seed has no effect on the folds, and scikit-learn >= 0.24 raises
    # ValueError when random_state is set without shuffle=True.
    kfold = StratifiedKFold(n_splits=10).split(X_train, y_train)

# Fit and score the pipeline manually on every fold.
scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %s, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


# In[13]:

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import cross_val_score
else:
    from sklearn.model_selection import cross_val_score

# Same evaluation, delegated to scikit-learn's helper.
scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
# ## Diagnosing bias and variance problems with learning curves

# In[14]:

Image(filename='./images/06_04.png', width=600)


# In[15]:

import matplotlib.pyplot as plt

if Version(sklearn_version) < '0.18':
    from sklearn.learning_curve import learning_curve
else:
    from sklearn.model_selection import learning_curve

# Scaling followed by L2-penalised logistic regression (no PCA step here).
pipe_lr = Pipeline([
    ('scl', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2', random_state=0)),
])

# Score the pipeline with 10-fold CV at ten training-set fractions
# from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_lr,
    X=X_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    n_jobs=1)

# Aggregate over axis 1 to get one mean/std per training-set size.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Training accuracy with a +/- one standard deviation band.
plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')
plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Validation accuracy with a +/- one standard deviation band.
plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('./figures/learning_curve.png', dpi=300)
plt.show()
# ## Addressing over- and underfitting with validation curves

# In[16]:

if Version(sklearn_version) < '0.18':
    from sklearn.learning_curve import validation_curve
else:
    from sklearn.model_selection import validation_curve

# Candidate values for the `C` parameter of the pipeline's 'clf' step.
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]

# 10-fold CV training/validation scores for every candidate value.
train_scores, test_scores = validation_curve(estimator=pipe_lr,
                                             X=X_train,
                                             y=y_train,
                                             param_name='clf__C',
                                             param_range=param_range,
                                             cv=10)

# Aggregate over axis 1 to get one mean/std per parameter value.
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

# Training accuracy with a +/- one standard deviation band.
plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')
plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

# Validation accuracy with a +/- one standard deviation band.
plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')
plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
# The candidates are powers of ten, hence the logarithmic x-axis.
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.0])
plt.tight_layout()
# plt.savefig('./figures/validation_curve.png', dpi=300)
plt.show()
# ## Tuning hyperparameters via grid search

# In[17]:

from sklearn.svm import SVC

if Version(sklearn_version) < '0.18':
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import GridSearchCV

pipe_svc = Pipeline([
    ('scl', StandardScaler()),
    ('clf', SVC(random_state=1)),
])

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

# Two sub-grids: the linear kernel is tuned over C only, the RBF kernel
# over every (C, gamma) combination.
param_grid = [
    {'clf__C': param_range,
     'clf__kernel': ['linear']},
    {'clf__C': param_range,
     'clf__gamma': param_range,
     'clf__kernel': ['rbf']},
]

# Exhaustive search scored by 10-fold CV accuracy; n_jobs=-1 is passed to
# run the search in parallel.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)


# In[18]:

# Take the best pipeline found by the search, fit it on the training set
# and evaluate it on the held-out test set.
clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))
# ## Algorithm selection with nested cross-validation

# In[19]:

Image(filename='./images/06_07.png', width=500)


# In[20]:

# Inner loop: grid search over the SVC pipeline with cv=2.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

# Note: the inner cv=2 above combined with the outer cv=5 below
# produces the 5 x 2 nested CV that is shown in the figure.
scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


# In[21]:

from sklearn.tree import DecisionTreeClassifier

# Same nested scheme for a decision tree, tuning only max_depth.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
    scoring='accuracy',
    cv=2)
scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))
# # Looking at different performance evaluation metrics

# ...

# ## Reading a confusion matrix

# In[22]:

Image(filename='./images/06_08.png', width=300)


# In[23]:

from sklearn.metrics import confusion_matrix

# Fit the SVC pipeline on the training set and tabulate its test-set
# predictions against the true labels.
pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)


# In[24]:

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
# Write each count into its cell; rows are true labels (y), columns are
# predicted labels (x).
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('predicted label')
plt.ylabel('true label')

plt.tight_layout()
# plt.savefig('./figures/confusion_matrix.png', dpi=300)
plt.show()


# ### Additional Note

# Remember that we previously encoded the class labels so that *malignant*
# samples are the "positive" class (1), and *benign* samples are the
# "negative" class (0):

# In[25]:

le.transform(['M', 'B'])


# In[26]:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)


# Next, we printed the confusion matrix like so:

# In[27]:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)


# Note that the (true) class 0 samples that are correctly predicted as
# class 0 (true negatives) are now in the upper left corner of the matrix
# (index 0, 0). In order to change the ordering so that the true negatives
# are in the lower right corner (index 1,1) and the true positives are in the
# upper left, we can use the `labels` argument as shown below:

# In[28]:

confmat = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(confmat)


# We conclude:
#
# Assuming that class 1 (malignant) is the positive class in this example,
# our model correctly classified 71 of the samples that belong to class 0
# (true negatives) and 40 samples that belong to class 1 (true positives),
# respectively. However, our model also misclassified 1 sample from class 0
# as class 1 (false positive), and it predicted that 2 samples are benign
# although they are malignant tumors (false negatives).
# ## Optimizing the precision and recall of a classification model

# In[29]:

from sklearn.metrics import precision_score, recall_score, f1_score

# Score the predictions currently stored in y_pred against the test labels.
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))


# In[30]:

from sklearn.metrics import make_scorer

# F1 scorer that treats label 0 as the positive class.
scorer = make_scorer(f1_score, pos_label=0)

c_gamma_range = [0.01, 0.1, 1.0, 10.0]

# Linear kernel: tune C only. RBF kernel: tune C and gamma jointly.
param_grid = [
    {'clf__C': c_gamma_range,
     'clf__kernel': ['linear']},
    {'clf__C': c_gamma_range,
     'clf__gamma': c_gamma_range,
     'clf__kernel': ['rbf']},
]

# Same grid search as before, but ranked by the custom F1 scorer.
gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
# ## Plotting a receiver operating characteristic

# In[31]:

from sklearn.metrics import roc_curve, auc

# NOTE: `scipy.interp` was only a deprecated alias of `numpy.interp` and is
# no longer importable from recent SciPy releases, so `np.interp` is used
# directly below.

pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('pca', PCA(n_components=2)),
                    ('clf', LogisticRegression(penalty='l2',
                                               random_state=0,
                                               C=100.0))])

# Only feature columns 4 and 14 are used for this example.
X_train2 = X_train[:, [4, 14]]

if Version(sklearn_version) < '0.18':
    cv = StratifiedKFold(y_train,
                         n_folds=3,
                         random_state=1)
else:
    # random_state is deliberately not passed: shuffle defaults to False, so
    # a seed has no effect on the folds, and scikit-learn >= 0.24 raises
    # ValueError when random_state is set without shuffle=True.
    # The splits are materialised because len(cv) is needed further down.
    cv = list(StratifiedKFold(n_splits=3).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

# Accumulator for the fold-wise TPR curves, each interpolated onto a
# common grid of 100 FPR values so they can be averaged.
mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    # Column 1 of predict_proba holds the scores used for class 1.
    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    mean_tpr += np.interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0  # pin the averaged curve to start at (0, 0)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             lw=1,
             label='ROC fold %d (area = %0.2f)'
                   % (i+1, roc_auc))

# Diagonal reference line.
plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

# Turn the accumulated sum into a mean and pin the end point at (1, 1).
mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='mean ROC (area = %0.2f)' % mean_auc, lw=2)

# Reference curve passing through the top-left corner.
plt.plot([0, 0, 1],
         [0, 1, 1],
         lw=2,
         linestyle=':',
         color='black',
         label='perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.title('Receiver Operator Characteristic')
plt.legend(loc="lower right")

plt.tight_layout()
# plt.savefig('./figures/roc.png', dpi=300)
plt.show()


# In[32]:

# Refit on the full two-feature training set and predict on the test set.
pipe_lr = pipe_lr.fit(X_train2, y_train)
y_labels = pipe_lr.predict(X_test[:, [4, 14]])
y_probas = pipe_lr.predict_proba(X_test[:, [4, 14]])[:, 1]
# note that we use probabilities for roc_auc
# the `[:, 1]` selects the positive class label only


# In[33]:

from sklearn.metrics import roc_auc_score, accuracy_score

print('ROC AUC: %.3f' % roc_auc_score(y_true=y_test, y_score=y_probas))
print('Accuracy: %.3f' % accuracy_score(y_true=y_test, y_pred=y_labels))
# ## The scoring metrics for multiclass classification

# In[34]:

# Precision scorer configured with micro-averaging.
pre_scorer = make_scorer(
    score_func=precision_score,
    pos_label=1,
    greater_is_better=True,
    average='micro',
)
