#!/usr/bin/env python
# coding: utf-8

# # Chapter 6. Learning Best Practices for Model Evaluation and Hyperparameter Tuning

# **This notebook can be viewed in the Jupyter notebook viewer (nbviewer.jupyter.org) or run in Google Colab (colab.research.google.com).**
# `watermark` is a utility for printing the versions of the Python packages used in a Jupyter notebook. To install the `watermark` package, uncomment the cell below and run it.

# In[1]:

#!pip install watermark


# In[2]:

get_ipython().run_line_magic('load_ext', 'watermark')
get_ipython().run_line_magic('watermark', '-u -d -v -p numpy,pandas,matplotlib,sklearn')


# ## Loading the Breast Cancer Wisconsin dataset

# In[3]:

import pandas as pd

df = pd.read_csv('https://archive.ics.uci.edu/ml/'
                 'machine-learning-databases'
                 '/breast-cancer-wisconsin/wdbc.data', header=None)

# If the breast cancer dataset cannot be downloaded from the
# UCI Machine Learning Repository, uncomment the following line
# and load the dataset from a local path instead.

# df = pd.read_csv('wdbc.data', header=None)

df.head()


# In[4]:

df.shape


# In[5]:

from sklearn.preprocessing import LabelEncoder

X = df.loc[:, 2:].values
y = df.loc[:, 1].values
le = LabelEncoder()
y = le.fit_transform(y)
le.classes_


# In[6]:

le.transform(['M', 'B'])


# In[7]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
    train_test_split(X, y,
                     test_size=0.20,
                     stratify=y,
                     random_state=1)


# ## Combining transformers and estimators in a pipeline

# In[8]:

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(solver='liblinear', random_state=1))

pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
print('Test accuracy: %.3f' % pipe_lr.score(X_test, y_test))


# # Using k-fold cross-validation to assess model performance

# ## K-fold cross-validation

# In[9]:

import numpy as np
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits=10,
                        shuffle=True,
                        random_state=1).split(X_train, y_train)

scores = []
for k, (train, test) in enumerate(kfold):
    pipe_lr.fit(X_train[train], y_train[train])
    score = pipe_lr.score(X_train[test], y_train[test])
    scores.append(score)
    print('Fold: %2d, Class dist.: %s, Acc: %.3f' % (k+1,
          np.bincount(y_train[train]), score))

print('\nCV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


# In[10]:

from sklearn.model_selection import cross_val_score

scores = cross_val_score(estimator=pipe_lr,
                         X=X_train,
                         y=y_train,
                         cv=10,
                         n_jobs=1)
print('CV accuracy scores: %s' % scores)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


# Translator's note #####

# In[11]:

from sklearn.model_selection import cross_validate

scores = cross_validate(estimator=pipe_lr,
                        X=X_train,
                        y=y_train,
                        scoring=['accuracy'],
                        cv=10,
                        n_jobs=-1,
                        return_train_score=False)
print('CV accuracy scores: %s' % scores['test_accuracy'])
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores['test_accuracy']),
                                      np.std(scores['test_accuracy'])))


# #####

# # Debugging algorithms with learning and validation curves

# ## Diagnosing bias and variance problems with learning curves

# In[12]:

import matplotlib.pyplot as plt


# In[13]:

from sklearn.model_selection import learning_curve

pipe_lr = make_pipeline(StandardScaler(),
                        LogisticRegression(solver='liblinear',
                                           penalty='l2',
                                           random_state=1))

train_sizes, train_scores, test_scores = \
    learning_curve(estimator=pipe_lr,
                   X=X_train,
                   y=y_train,
                   train_sizes=np.linspace(0.1, 1.0, 10),
                   cv=10,
                   n_jobs=1)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(train_sizes, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')

plt.fill_between(train_sizes,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(train_sizes, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')

plt.fill_between(train_sizes,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xlabel('Number of training samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.8, 1.03])
plt.tight_layout()
plt.show()
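# A quick numeric companion to the learning curve above (editor's sketch, not part of the
# original notebook): the gap between training and validation accuracy at the largest
# training-set size is a rough indicator of variance (overfitting), while low accuracy on
# both curves would point to high bias (underfitting). This reuses the `train_mean` and
# `test_mean` arrays computed in the previous cell.

# In[ ]:

# Editor's sketch: summarize the right-most point of the learning curve.
final_gap = train_mean[-1] - test_mean[-1]
print('Final training accuracy:   %.3f' % train_mean[-1])
print('Final validation accuracy: %.3f' % test_mean[-1])
print('Train/validation gap:      %.3f' % final_gap)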
# ## Addressing over- and underfitting with validation curves

# In[14]:

from sklearn.model_selection import validation_curve

param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
train_scores, test_scores = validation_curve(
                estimator=pipe_lr,
                X=X_train,
                y=y_train,
                param_name='logisticregression__C',
                param_range=param_range,
                cv=10)

train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
test_mean = np.mean(test_scores, axis=1)
test_std = np.std(test_scores, axis=1)

plt.plot(param_range, train_mean,
         color='blue', marker='o',
         markersize=5, label='training accuracy')

plt.fill_between(param_range,
                 train_mean + train_std,
                 train_mean - train_std,
                 alpha=0.15, color='blue')

plt.plot(param_range, test_mean,
         color='green', linestyle='--',
         marker='s', markersize=5,
         label='validation accuracy')

plt.fill_between(param_range,
                 test_mean + test_std,
                 test_mean - test_std,
                 alpha=0.15, color='green')

plt.grid()
plt.xscale('log')
plt.legend(loc='lower right')
plt.xlabel('Parameter C')
plt.ylabel('Accuracy')
plt.ylim([0.8, 1.00])
plt.tight_layout()
plt.show()


# # Fine-tuning machine learning models via grid search

# ## Tuning hyperparameters via grid search

# In[15]:

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

pipe_svc = make_pipeline(StandardScaler(),
                         SVC(random_state=1))

param_range = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]

param_grid = [{'svc__C': param_range,
               'svc__kernel': ['linear']},
              {'svc__C': param_range,
               'svc__gamma': param_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)


# In[16]:

clf = gs.best_estimator_
clf.fit(X_train, y_train)
print('Test accuracy: %.3f' % clf.score(X_test, y_test))


# ## Algorithm selection with nested cross-validation

# In[17]:

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


# In[18]:

from sklearn.tree import DecisionTreeClassifier

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=[{'max_depth': [1, 2, 3, 4, 5, 6, 7, None]}],
                  scoring='accuracy',
                  cv=2)

scores = cross_val_score(gs, X_train, y_train,
                         scoring='accuracy', cv=5)
print('CV accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))


# # Looking at different performance evaluation metrics

# ## Reading a confusion matrix

# In[19]:

from sklearn.metrics import confusion_matrix

pipe_svc.fit(X_train, y_train)
y_pred = pipe_svc.predict(X_test)
confmat = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(confmat)


# In[20]:

fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for i in range(confmat.shape[0]):
    for j in range(confmat.shape[1]):
        ax.text(x=j, y=i, s=confmat[i, j], va='center', ha='center')

plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.tight_layout()
plt.show()


# ## Optimizing the precision and recall of a classification model

# In[21]:

from sklearn.metrics import precision_score, recall_score, f1_score

print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_pred))
print('F1: %.3f' % f1_score(y_true=y_test, y_pred=y_pred))


# In[22]:

from sklearn.metrics import make_scorer

scorer = make_scorer(f1_score, pos_label=0)

c_gamma_range = [0.01, 0.1, 1.0, 10.0]

param_grid = [{'svc__C': c_gamma_range,
               'svc__kernel': ['linear']},
              {'svc__C': c_gamma_range,
               'svc__gamma': c_gamma_range,
               'svc__kernel': ['rbf']}]

gs = GridSearchCV(estimator=pipe_svc,
                  param_grid=param_grid,
                  scoring=scorer,
                  cv=10,
                  n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)
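# In addition to the individual precision/recall/F1 calls above, scikit-learn's
# classification_report gives a per-class summary in a single step. This cell is an
# editor's sketch, not part of the original notebook; it assumes `y_pred` still holds
# the pipe_svc test-set predictions from the confusion-matrix cell, and that class 0
# is benign (B) and class 1 malignant (M), as set by the LabelEncoder earlier.

# In[ ]:

from sklearn.metrics import classification_report

# Editor's sketch: per-class precision, recall, and F1 for the SVC test predictions.
print(classification_report(y_true=y_test, y_pred=y_pred,
                            target_names=['benign (B)', 'malignant (M)']))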
# ## Plotting a receiver operating characteristic (ROC) curve

# In[23]:

from sklearn.metrics import roc_curve, auc
from numpy import interp

pipe_lr = make_pipeline(StandardScaler(),
                        PCA(n_components=2),
                        LogisticRegression(solver='liblinear',
                                           penalty='l2',
                                           random_state=1,
                                           C=100.0))

X_train2 = X_train[:, [4, 14]]

cv = list(StratifiedKFold(n_splits=3,
                          shuffle=True,
                          random_state=1).split(X_train, y_train))

fig = plt.figure(figsize=(7, 5))

mean_tpr = 0.0
mean_fpr = np.linspace(0, 1, 100)
all_tpr = []

for i, (train, test) in enumerate(cv):
    probas = pipe_lr.fit(X_train2[train],
                         y_train[train]).predict_proba(X_train2[test])

    fpr, tpr, thresholds = roc_curve(y_train[test],
                                     probas[:, 1],
                                     pos_label=1)
    mean_tpr += interp(mean_fpr, fpr, tpr)
    mean_tpr[0] = 0.0
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr,
             tpr,
             label='ROC fold %d (area = %0.2f)' % (i+1, roc_auc))

plt.plot([0, 1],
         [0, 1],
         linestyle='--',
         color=(0.6, 0.6, 0.6),
         label='random guessing')

mean_tpr /= len(cv)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, 'k--',
         label='mean ROC (area = %0.2f)' % mean_auc, lw=2)
plt.plot([0, 0, 1],
         [0, 1, 1],
         linestyle=':',
         color='black',
         label='perfect performance')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()


# ## Scoring metrics for multiclass classification

# In[24]:

pre_scorer = make_scorer(score_func=precision_score,
                         pos_label=1,
                         greater_is_better=True,
                         average='micro')


# ## Dealing with class imbalance

# In[25]:

X_imb = np.vstack((X[y == 0], X[y == 1][:40]))
y_imb = np.hstack((y[y == 0], y[y == 1][:40]))


# In[26]:

y_pred = np.zeros(y_imb.shape[0])
np.mean(y_pred == y_imb) * 100


# In[27]:

from sklearn.utils import resample

print('Number of class 1 examples before resampling:', X_imb[y_imb == 1].shape[0])

X_upsampled, y_upsampled = resample(X_imb[y_imb == 1],
                                    y_imb[y_imb == 1],
                                    replace=True,
                                    n_samples=X_imb[y_imb == 0].shape[0],
                                    random_state=123)

print('Number of class 1 examples after resampling:', X_upsampled.shape[0])


# In[28]:

X_bal = np.vstack((X[y == 0], X_upsampled))
y_bal = np.hstack((y[y == 0], y_upsampled))


# In[29]:

y_pred = np.zeros(y_bal.shape[0])
np.mean(y_pred == y_bal) * 100
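# Upsampling the minority class is only one option. As an editor's sketch (not part of the
# original notebook), the same resample() utility can also downsample the majority class to
# the size of the minority class, which likewise brings the all-zero baseline back to a
# 50% accuracy.

# In[ ]:

# Editor's sketch: downsample class 0 (majority) to the number of class 1 (minority) examples.
X_downsampled, y_downsampled = resample(X_imb[y_imb == 0],
                                        y_imb[y_imb == 0],
                                        replace=False,
                                        n_samples=X_imb[y_imb == 1].shape[0],
                                        random_state=123)

print('Number of class 0 examples after downsampling:', X_downsampled.shape[0])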