from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# create a synthetic dataset
X, y = make_blobs(random_state=0)
# split data and labels into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# instantiate a model and fit it to the training set
logreg = LogisticRegression().fit(X_train, y_train)
# evaluate the model on the test set
print("Test set score: {:.2f}".format(logreg.score(X_test, y_test)))
Test set score: 0.88
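On small datasets the class proportions can drift between the two halves of a split; as a minimal sketch (variable names here are illustrative, not from the original), train_test_split can stratify on the labels:
# keep the class proportions of y identical in both halves
X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)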
%matplotlib inline
import sys
sys.path.append('..')
from preamble import *
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
iris = load_iris()
logreg = LogisticRegression()
scores = cross_val_score(logreg, iris.data, iris.target)
print("교차 검증 점수: {}".format(scores))
교차 검증 점수: [0.961 0.922 0.958]
scores = cross_val_score(logreg, iris.data, iris.target, cv=5)
print("교차 검증 점수: {}".format(scores))
교차 검증 점수: [1. 0.967 0.933 0.9 1. ]
print("교차 검증 평균 점수: {:.2f}".format(scores.mean()))
교차 검증 평균 점수: 0.96
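If fit times or train scores are of interest as well, cross_validate (added in scikit-learn 0.19) returns a dict of arrays instead of a single score array; a minimal sketch:
from sklearn.model_selection import cross_validate
res = cross_validate(logreg, iris.data, iris.target, cv=5,
                     return_train_score=True)
# res holds one entry per metric/timing, each an array with one value per fold
print("Keys: {}".format(sorted(res.keys())))
print("Mean test score: {:.2f}".format(res['test_score'].mean()))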
from sklearn.datasets import load_iris
iris = load_iris()
print("Iris labels:\n{}".format(iris.target))
Iris labels:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
from sklearn.model_selection import KFold
kfold = KFold(n_splits=5)
print("Cross-validation scores:\n{}".format(
cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores: [1. 0.933 0.433 0.967 0.433]
kfold = KFold(n_splits=3)
print("Cross-validation scores:\n{}".format(
cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores: [0. 0. 0.]
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
print("Cross-validation scores:\n{}".format(
cross_val_score(logreg, iris.data, iris.target, cv=kfold)))
Cross-validation scores: [0.9 0.96 0.96]
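To inspect what a splitter actually does, its split method can be iterated directly; a minimal sketch that prints the fold sizes produced by the shuffled KFold above:
for train_index, test_index in kfold.split(iris.data, iris.target):
    # each iteration yields index arrays for one train/test split
    print("train size: {}  test size: {}".format(
          len(train_index), len(test_index)))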
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(logreg, iris.data, iris.target, cv=loo)
print("교차 검증 분할 횟수: ", len(scores))
print("평균 정확도: {:.2f}".format(scores.mean()))
교차 검증 분할 횟수: 150 평균 정확도: 0.95
from sklearn.model_selection import ShuffleSplit
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10)
scores = cross_val_score(logreg, iris.data, iris.target, cv=shuffle_split)
print("교차 검증 점수:\n{}".format(scores))
교차 검증 점수: [0.907 0.973 0.893 0.973 0.933 0.893 0.893 0.907 0.893 0.96 ]
from sklearn.model_selection import GroupKFold
X, y = make_blobs(n_samples=12, random_state=0)
groups = [0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, 3]
scores = cross_val_score(logreg, X, y, groups=groups, cv=GroupKFold(n_splits=3))
print("Cross-validation scores:\n{}".format(scores))
Cross-validation scores:
[0.75 0.8 0.667]
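A quick way to confirm that GroupKFold never puts the same group on both sides of a split is to print the groups in each fold; a minimal sketch:
import numpy as np
groups = np.array(groups)
for train_index, test_index in GroupKFold(n_splits=3).split(X, y, groups):
    # the two sets of groups should never overlap
    print("test groups: {}  train groups: {}".format(
          np.unique(groups[test_index]), np.unique(groups[train_index])))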
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
print("Size of training set: {} size of test set: {}".format(
      X_train.shape[0], X_test.shape[0]))

best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the test set
        score = svm.score(X_test, y_test)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))
Size of training set: 112 size of test set: 38
Best score: 0.97
Best parameters: {'C': 100, 'gamma': 0.001}
from sklearn.svm import SVC
# split data into train+validation set and test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    iris.data, iris.target, random_state=0)
# split train+validation set into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainval, y_trainval, random_state=1)
print("Size of training set: {}, size of validation set: {}, size of test set: {}\n".format(
      X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        # evaluate the SVC on the validation set
        score = svm.score(X_valid, y_valid)
        # if we got a better score, store the score and parameters
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}

# rebuild a model on the combined training and validation set,
# and evaluate it on the test set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: ", best_parameters)
print("Test set score with best parameters: {:.2f}".format(test_score))
Size of training set: 84, size of validation set: 28, size of test set: 38

Best score on validation set: 0.96
Best parameters:  {'C': 10, 'gamma': 0.001}
Test set score with best parameters: 0.92
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        # for each combination of parameters, train an SVC
        svm = SVC(gamma=gamma, C=C)
        # perform cross-validation instead of a single validation split
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)
        # compute mean cross-validation accuracy
        score = np.mean(scores)
        if score > best_score:
            best_score = score
            best_parameters = {'C': C, 'gamma': gamma}
# rebuild a model on the combined training and validation set
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
print("Parameter grid:\n{}".format(param_grid))
Parameter grid: {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
grid_search.fit(X_train, y_train)
GridSearchCV(cv=5, error_score='raise',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape=None, degree=3, gamma='auto',
                           kernel='rbf', max_iter=-1, probability=False,
                           random_state=None, shrinking=True, tol=0.001,
                           verbose=False),
             fit_params={}, iid=True, n_jobs=1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100],
                         'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring=None, verbose=0)
print("테스트 세트 점수: {:.2f}".format(grid_search.score(X_test, y_test)))
테스트 세트 점수: 0.97
print("최고 매개변수: {}".format(grid_search.best_params_))
print("최적 매개변수에서 테스트 세트 점수: {:.2f}".format(grid_search.best_score_))
최고 매개변수: {'C': 100, 'gamma': 0.01} 최적 매개변수에서 테스트 세트 점수: 0.97
print("최고 성능 모델:\n{}".format(grid_search.best_estimator_))
최고 성능 모델: SVC(C=100, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape=None, degree=3, gamma=0.01, kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False)
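Because refit=True by default, the fitted GridSearchCV object can itself be used for prediction, delegating to the best estimator retrained on the full training set; a minimal sketch:
# predictions come from best_estimator_ behind the scenes
pred = grid_search.predict(X_test)
print("First five predictions: {}".format(pred[:5]))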
import pandas as pd
results = pd.DataFrame(grid_search.cv_results_)
print(results.columns)
display(results.head())
Index(['mean_fit_time', 'mean_score_time', 'mean_test_score', 'mean_train_score', 'param_C', 'param_gamma', 'params', 'rank_test_score', 'split0_test_score', 'split0_train_score', 'split1_test_score', 'split1_train_score', 'split2_test_score', 'split2_train_score', 'split3_test_score', 'split3_train_score', 'split4_test_score', 'split4_train_score', 'std_fit_time', 'std_score_time', 'std_test_score', 'std_train_score'], dtype='object')
|   | mean_fit_time | mean_score_time | mean_test_score | mean_train_score | ... | std_fit_time | std_score_time | std_test_score | std_train_score |
|---|---------------|-----------------|-----------------|------------------|-----|--------------|----------------|----------------|-----------------|
| 0 | 6.86e-04 | 2.68e-04 | 0.37 | 0.37 | ... | 1.76e-04 | 4.39e-05 | 0.01 | 2.85e-03 |
| 1 | 6.11e-04 | 2.53e-04 | 0.37 | 0.37 | ... | 2.05e-05 | 6.22e-06 | 0.01 | 2.85e-03 |
| 2 | 6.77e-04 | 2.93e-04 | 0.37 | 0.37 | ... | 8.31e-05 | 5.52e-05 | 0.01 | 2.85e-03 |
| 3 | 5.89e-04 | 2.60e-04 | 0.37 | 0.37 | ... | 1.95e-05 | 3.59e-05 | 0.01 | 2.85e-03 |
| 4 | 6.35e-04 | 2.46e-04 | 0.37 | 0.37 | ... | 1.99e-05 | 6.66e-06 | 0.01 | 2.85e-03 |

5 rows × 22 columns
scores = np.array(results.mean_test_score).reshape(6, 6)
# plot the mean cross-validation scores as a heat map
mglearn.tools.heatmap(scores, xlabel='gamma', xticklabels=param_grid['gamma'],
                      ylabel='C', yticklabels=param_grid['C'], cmap="viridis")
<matplotlib.collections.PolyCollection at 0x116d45fd0>
fig, axes = plt.subplots(1, 3, figsize=(20, 5))
param_grid_linear = {'C': np.linspace(1, 2, 6), 'gamma': np.linspace(1, 2, 6)}
param_grid_one_log = {'C': np.linspace(1, 2, 6), 'gamma': np.logspace(-3, 2, 6)}
param_grid_range = {'C': np.logspace(-3, 2, 6), 'gamma': np.logspace(-7, -2, 6)}
for param_grid, ax in zip([param_grid_linear, param_grid_one_log,
                           param_grid_range], axes):
    grid_search = GridSearchCV(SVC(), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    scores = grid_search.cv_results_['mean_test_score'].reshape(6, 6)
    # plot the mean cross-validation scores
    scores_image = mglearn.tools.heatmap(
        scores, xlabel='gamma', ylabel='C', xticklabels=param_grid['gamma'],
        yticklabels=param_grid['C'], cmap="viridis", ax=ax)
plt.colorbar(scores_image, ax=axes.tolist())
<matplotlib.colorbar.Colorbar at 0x117a10d30>
param_grid = [{'kernel': ['rbf'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100],
               'gamma': [0.001, 0.01, 0.1, 1, 10, 100]},
              {'kernel': ['linear'],
               'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
print("그리드 목록:\n{}".format(param_grid))
그리드 목록: [{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
grid_search = GridSearchCV(SVC(), param_grid, cv=5, return_train_score=True)
grid_search.fit(X_train, y_train)
print("최고 매개벼수: {}".format(grid_search.best_params_))
print("최고 교차 검증 점수: {:.2f}".format(grid_search.best_score_))
최고 매개벼수: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'} 최고 교차 검증 점수: 0.97
results = pd.DataFrame(grid_search.cv_results_)
display(results.T)
|                    | 0 | 1 | 2 | 3 | ... | 38 | 39 | 40 | 41 |
|--------------------|---|---|---|---|-----|----|----|----|----|
| mean_fit_time      | 0.00065 | 0.0006 | 0.00059 | 0.00062 | ... | 0.00035 | 0.00034 | 0.00034 | 0.00035 |
| mean_score_time    | 0.00029 | 0.00025 | 0.00025 | 0.00025 | ... | 0.00021 | 0.0002 | 0.0002 | 0.0002 |
| mean_test_score    | 0.37 | 0.37 | 0.37 | 0.37 | ... | 0.95 | 0.97 | 0.96 | 0.96 |
| mean_train_score   | 0.37 | 0.37 | 0.37 | 0.37 | ... | 0.97 | 0.98 | 0.99 | 0.99 |
| param_C            | 0.001 | 0.001 | 0.001 | 0.001 | ... | 0.1 | 1 | 10 | 100 |
| param_gamma        | 0.001 | 0.01 | 0.1 | 1 | ... | NaN | NaN | NaN | NaN |
| param_kernel       | rbf | rbf | rbf | rbf | ... | linear | linear | linear | linear |
| params             | {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'} | {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'} | {'C': 0.001, 'gamma': 0.1, 'kernel': 'rbf'} | {'C': 0.001, 'gamma': 1, 'kernel': 'rbf'} | ... | {'C': 0.1, 'kernel': 'linear'} | {'C': 1, 'kernel': 'linear'} | {'C': 10, 'kernel': 'linear'} | {'C': 100, 'kernel': 'linear'} |
| rank_test_score    | 27 | 27 | 27 | 27 | ... | 11 | 1 | 3 | 3 |
| split0_test_score  | 0.38 | 0.38 | 0.38 | 0.38 | ... | 0.96 | 1 | 0.96 | 0.96 |
| split0_train_score | 0.36 | 0.36 | 0.36 | 0.36 | ... | 0.97 | 0.99 | 0.99 | 0.99 |
| split1_test_score  | 0.35 | 0.35 | 0.35 | 0.35 | ... | 0.91 | 0.96 | 1 | 1 |
| split1_train_score | 0.37 | 0.37 | 0.37 | 0.37 | ... | 0.98 | 0.98 | 0.99 | 0.99 |
| split2_test_score  | 0.36 | 0.36 | 0.36 | 0.36 | ... | 1 | 1 | 1 | 1 |
| split2_train_score | 0.37 | 0.37 | 0.37 | 0.37 | ... | 0.94 | 0.98 | 0.98 | 0.99 |
| split3_test_score  | 0.36 | 0.36 | 0.36 | 0.36 | ... | 0.91 | 0.95 | 0.91 | 0.91 |
| split3_train_score | 0.37 | 0.37 | 0.37 | 0.37 | ... | 0.98 | 0.99 | 0.99 | 1 |
| split4_test_score  | 0.38 | 0.38 | 0.38 | 0.38 | ... | 0.95 | 0.95 | 0.95 | 0.95 |
| split4_train_score | 0.36 | 0.36 | 0.36 | 0.36 | ... | 0.97 | 0.99 | 1 | 1 |
| std_fit_time       | 8.8e-05 | 1.6e-05 | 1.8e-05 | 1.7e-05 | ... | 1.1e-05 | 1e-05 | 1.3e-05 | 2.9e-05 |
| std_score_time     | 6e-05 | 3.2e-06 | 1.5e-05 | 6.7e-06 | ... | 3.6e-06 | 1.1e-06 | 1.9e-06 | 1.8e-06 |
| std_test_score     | 0.011 | 0.011 | 0.011 | 0.011 | ... | 0.033 | 0.022 | 0.034 | 0.034 |
| std_train_score    | 0.0029 | 0.0029 | 0.0029 | 0.0029 | ... | 0.012 | 0.0055 | 0.007 | 0.0055 |

23 rows × 42 columns
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
scores = cross_val_score(GridSearchCV(SVC(), param_grid, cv=5),
                         iris.data, iris.target, cv=5)
print("Cross-validation scores: ", scores)
print("Mean cross-validation score: ", scores.mean())
Cross-validation scores:  [0.967 1. 0.967 0.967 1. ]
Mean cross-validation score:  0.9800000000000001
def nested_cv(X, y, inner_cv, outer_cv, Classifier, parameter_grid):
    outer_scores = []
    # for each split of the data in the outer cross-validation
    # (the split method returns index arrays)
    for training_samples, test_samples in outer_cv.split(X, y):
        # inner split indices are relative to the outer training subset
        X_outer_train, y_outer_train = X[training_samples], y[training_samples]
        # find the best parameters using inner cross-validation
        best_params = {}
        best_score = -np.inf
        # iterate over parameter settings
        for parameters in parameter_grid:
            # accumulate scores over inner splits
            cv_scores = []
            # iterate over inner cross-validation
            for inner_train, inner_test in inner_cv.split(X_outer_train,
                                                          y_outer_train):
                # build classifier given parameters and inner training data
                clf = Classifier(**parameters)
                clf.fit(X_outer_train[inner_train], y_outer_train[inner_train])
                # evaluate on the inner test set
                cv_scores.append(clf.score(X_outer_train[inner_test],
                                           y_outer_train[inner_test]))
            # compute mean score over inner folds
            mean_score = np.mean(cv_scores)
            if mean_score > best_score:
                # if better than so far, remember the parameters
                best_score = mean_score
                best_params = parameters
        # build classifier with the best parameters on the outer training set
        clf = Classifier(**best_params)
        clf.fit(X_outer_train, y_outer_train)
        # evaluate on the outer test set
        outer_scores.append(clf.score(X[test_samples], y[test_samples]))
    return np.array(outer_scores)
from sklearn.model_selection import ParameterGrid, StratifiedKFold
scores = nested_cv(iris.data, iris.target, StratifiedKFold(5),
                   StratifiedKFold(5), SVC, ParameterGrid(param_grid))
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [0.967 1. 0.967 0.967 1. ]
from sklearn.datasets import load_digits
digits = load_digits()
# create an imbalanced binary task: "nine" vs. "not nine"
y = digits.target == 9
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=0)
from sklearn.dummy import DummyClassifier
dummy_majority = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
pred_most_frequent = dummy_majority.predict(X_test)
print("예측된 유니크 레이블: {}".format(np.unique(pred_most_frequent)))
print("테스트 점수: {:.2f}".format(dummy_majority.score(X_test, y_test)))
예측된 유니크 레이블: [False] 테스트 점수: 0.90
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
pred_tree = tree.predict(X_test)
print("테스트 점수: {:.2f}".format(tree.score(X_test, y_test)))
테스트 점수: 0.92
from sklearn.linear_model import LogisticRegression
dummy = DummyClassifier().fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
print("dummy score: {:.2f}".format(dummy.score(X_test, y_test)))
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print("logreg 점수: {:.2f}".format(logreg.score(X_test, y_test)))
dummy score: 0.83 logreg 점수: 0.98
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, pred_logreg)
print("Confusion matrix:\n{}".format(confusion))
Confusion matrix:
[[401   2]
 [  8  39]]
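As a quick sanity check, the headline metrics can be read straight off this matrix; a minimal sketch for the binary case, using sklearn's tn/fp/fn/tp ravel ordering:
tn, fp, fn, tp = confusion.ravel()
precision = tp / (tp + fp)
recall = tp / (tp + fn)
# f1 is the harmonic mean of precision and recall
print("precision: {:.2f}  recall: {:.2f}  f1: {:.2f}".format(
      precision, recall, 2 * precision * recall / (precision + recall)))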
mglearn.plots.plot_confusion_matrix_illustration()
mglearn.plots.plot_binary_confusion_matrix()
print("빈도 기반 더미 모델:")
print(confusion_matrix(y_test, pred_most_frequent))
print("\n무작위 더미 모델:")
print(confusion_matrix(y_test, pred_dummy))
print("\n결정 트리:")
print(confusion_matrix(y_test, pred_tree))
print("\nLogistic Regression")
print(confusion_matrix(y_test, pred_logreg))
빈도 기반 더미 모델: [[403 0] [ 47 0]] 무작위 더미 모델: [[359 44] [ 45 2]] 결정 트리: [[390 13] [ 24 23]] Logistic Regression [[401 2] [ 8 39]]
from sklearn.metrics import f1_score
print("f1 score most frequent: {:.2f}".format(
f1_score(y_test, pred_most_frequent)))
print("f1 더미 점수: {:.2f}".format(f1_score(y_test, pred_dummy)))
print("f1 트리 점수: {:.2f}".format(f1_score(y_test, pred_tree)))
print("f1 회귀 점수: {:.2f}".format(
f1_score(y_test, pred_logreg)))
f1 score most frequent: 0.00 f1 더미 점수: 0.04 f1 트리 점수: 0.55 f1 회귀 점수: 0.89
/Users/Kyo/anaconda/lib/python3.6/site-packages/sklearn/metrics/classification.py:1113: UndefinedMetricWarning: F-score is ill-defined and being set to 0.0 due to no predicted samples. 'precision', 'predicted', average, warn_for)
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_most_frequent,
                            target_names=["not nine", "nine"]))
             precision    recall  f1-score   support

   not nine       0.90      1.00      0.94       403
       nine       0.00      0.00      0.00        47

avg / total       0.80      0.90      0.85       450

UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
from mglearn.datasets import make_blobs
X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2],
                  random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
mglearn.plots.plot_decision_threshold()
print(classification_report(y_test, svc.predict(X_test)))
             precision    recall  f1-score   support

          0       0.97      0.89      0.93       104
          1       0.35      0.67      0.46         9

avg / total       0.92      0.88      0.89       113
y_pred_lower_threshold = svc.decision_function(X_test) > -.8
print(classification_report(y_test, y_pred_lower_threshold))
             precision    recall  f1-score   support

          0       1.00      0.82      0.90       104
          1       0.32      1.00      0.49         9

avg / total       0.95      0.83      0.87       113
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(
    y_test, svc.decision_function(X_test))
# use a bigger dataset to get a smoother curve
X, y = make_blobs(n_samples=(4000, 500), centers=2, cluster_std=[7.0, 2],
                  random_state=22)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
svc = SVC(gamma=.05).fit(X_train, y_train)
precision, recall, thresholds = precision_recall_curve(
    y_test, svc.decision_function(X_test))
# find the threshold closest to zero
close_zero = np.argmin(np.abs(thresholds))
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
plt.plot(precision, recall, label="precision recall curve")
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x11b63c550>
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=0, max_features=2)
rf.fit(X_train, y_train)
precision_rf, recall_rf, thresholds_rf = precision_recall_curve(
    y_test, rf.predict_proba(X_test)[:, 1])
plt.plot(precision, recall, label="svc")
plt.plot(precision[close_zero], recall[close_zero], 'o', markersize=10,
         label="threshold zero svc", fillstyle="none", c='k', mew=2)
plt.plot(precision_rf, recall_rf, label="rf")
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(precision_rf[close_default_rf], recall_rf[close_default_rf], '^', c='k',
         markersize=10, label="threshold 0.5 rf", fillstyle="none", mew=2)
plt.xlabel("Precision")
plt.ylabel("Recall")
plt.legend(loc="best")
<matplotlib.legend.Legend at 0x115e62b00>
print("Random Forest f1_score: {:.3f}".format(
f1_score(y_test, rf.predict(X_test))))
print("SVC f1_score: {:.3f}".format(f1_score(y_test, svc.predict(X_test))))
Random Forest f1_score: 0.610 SVC f1_score: 0.656
from sklearn.metrics import average_precision_score
ap_rf = average_precision_score(y_test, rf.predict_proba(X_test)[:, 1])
ap_svc = average_precision_score(y_test, svc.decision_function(X_test))
print("Average precision of random forest: {:.3f}".format(ap_rf))
print("Average precision of svc: {:.3f}".format(ap_svc))
Average precision of random forest: 0.666
Average precision of svc: 0.663
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_test, svc.decision_function(X_test))
plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
close_zero = np.argmin(np.abs(thresholds))
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
<matplotlib.legend.Legend at 0x1177c9ef0>
fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label="ROC Curve SVC")
plt.plot(fpr_rf, tpr_rf, label="ROC Curve RF")
plt.xlabel("FPR")
plt.ylabel("TPR (recall)")
plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
         label="threshold zero SVC", fillstyle="none", c='k', mew=2)
close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^', markersize=10,
         label="threshold 0.5 RF", fillstyle="none", c='k', mew=2)
plt.legend(loc=4)
<matplotlib.legend.Legend at 0x117711cf8>
from sklearn.metrics import roc_auc_score
rf_auc = roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1])
svc_auc = roc_auc_score(y_test, svc.decision_function(X_test))
print("AUC for Random Forest: {:.3f}".format(rf_auc))
print("AUC for SVC: {:.3f}".format(svc_auc))
AUC for Random Forest: 0.937
AUC for SVC: 0.916
y = digits.target == 9
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, y, random_state=0)
plt.figure()
for gamma in [1, 0.1, 0.01]:
    svc = SVC(gamma=gamma).fit(X_train, y_train)
    accuracy = svc.score(X_test, y_test)
    auc = roc_auc_score(y_test, svc.decision_function(X_test))
    fpr, tpr, _ = roc_curve(y_test, svc.decision_function(X_test))
    print("gamma = {:.2f} accuracy = {:.2f} AUC = {:.2f}".format(
          gamma, accuracy, auc))
    plt.plot(fpr, tpr, label="gamma={:.3f}".format(gamma))
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.xlim(-0.01, 1)
plt.ylim(0, 1.02)
plt.legend(loc="best")
gamma = 1.00 accuracy = 0.90 AUC = 0.50
gamma = 0.10 accuracy = 0.90 AUC = 0.96
gamma = 0.01 accuracy = 0.90 AUC = 1.00
<matplotlib.legend.Legend at 0x1175783c8>
from sklearn.metrics import accuracy_score
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)
lr = LogisticRegression().fit(X_train, y_train)
pred = lr.predict(X_test)
print("Accuracy: {:.3f}".format(accuracy_score(y_test, pred)))
print("Confusion matrix:\n{}".format(confusion_matrix(y_test, pred)))
Accuracy: 0.953
Confusion matrix:
[[37  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  2  0  2  0]
 [ 0  0 41  3  0  0  0  0  0  0]
 [ 0  0  1 43  0  0  0  0  0  1]
 [ 0  0  0  0 38  0  0  0  0  0]
 [ 0  1  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  0 52  0  0  0]
 [ 0  1  0  1  1  0  0 45  0  0]
 [ 0  3  1  0  0  0  0  0 43  1]
 [ 0  0  0  1  0  1  0  0  1 44]]
scores_image = mglearn.tools.heatmap(
    confusion_matrix(y_test, pred), xlabel='Predicted label',
    ylabel='True label', xticklabels=digits.target_names,
    yticklabels=digits.target_names, cmap=plt.cm.gray_r, fmt="%d")
plt.title("Confusion matrix")
plt.gca().invert_yaxis()
print(classification_report(y_test, pred))
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        37
          1       0.89      0.91      0.90        43
          2       0.95      0.93      0.94        44
          3       0.90      0.96      0.92        45
          4       0.97      1.00      0.99        38
          5       0.98      0.98      0.98        48
          6       0.96      1.00      0.98        52
          7       1.00      0.94      0.97        48
          8       0.93      0.90      0.91        48
          9       0.96      0.94      0.95        47

avg / total       0.95      0.95      0.95       450
print("Micro average f1 score: {:.3f}".format(
f1_score(y_test, pred, average="micro")))
print("Macro average f1 score: {:.3f}".format(
f1_score(y_test, pred, average="macro")))
Micro average f1 score: 0.953 Macro average f1 score: 0.954
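The macro average is simply the unweighted mean of the per-class scores, which average=None exposes directly; a minimal sketch:
per_class_f1 = f1_score(y_test, pred, average=None)
# one f1 value per digit class; their plain mean is the macro average
print("Per-class f1 scores: {}".format(np.round(per_class_f1, 2)))
print("Their mean: {:.3f}".format(per_class_f1.mean()))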
print("Default scoring: {}".format(
cross_val_score(SVC(), digits.data, digits.target == 9)))
explicit_accuracy = cross_val_score(SVC(), digits.data, digits.target == 9,
scoring="accuracy")
print("Explicit accuracy scoring: {}".format(explicit_accuracy))
roc_auc = cross_val_score(SVC(), digits.data, digits.target == 9,
scoring="roc_auc")
print("AUC scoring: {}".format(roc_auc))
Default scoring: [0.9 0.9 0.9] Explicit accuracy scoring: [0.9 0.9 0.9] AUC scoring: [0.994 0.99 0.996]
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target == 9, random_state=0)
# we provide a somewhat bad grid to illustrate the point:
param_grid = {'gamma': [0.0001, 0.01, 0.1, 1, 10]}
# using the default scoring of accuracy:
grid = GridSearchCV(SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)
print("Grid-Search with accuracy")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (accuracy): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
      roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))

# using AUC scoring instead:
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring="roc_auc")
grid.fit(X_train, y_train)
print("\nGrid-Search with AUC")
print("Best parameters:", grid.best_params_)
print("Best cross-validation score (AUC): {:.3f}".format(grid.best_score_))
print("Test set AUC: {:.3f}".format(
      roc_auc_score(y_test, grid.decision_function(X_test))))
print("Test set accuracy: {:.3f}".format(grid.score(X_test, y_test)))
Grid-Search with accuracy
Best parameters: {'gamma': 0.0001}
Best cross-validation score (accuracy): 0.970
Test set AUC: 0.992
Test set accuracy: 0.973

Grid-Search with AUC
Best parameters: {'gamma': 0.01}
Best cross-validation score (AUC): 0.997
Test set AUC: 1.000
Test set accuracy: 1.000
from sklearn.metrics.scorer import SCORERS
print("가능한 평가 방식:\n{}".format(sorted(SCORERS.keys())))
가능한 평가 방식: ['accuracy', 'adjusted_rand_score', 'average_precision', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'log_loss', 'mean_absolute_error', 'mean_squared_error', 'median_absolute_error', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc']
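Beyond these named scorers, make_scorer wraps any metric function for use as a scoring argument; a minimal sketch with fbeta_score (beta=2 weights recall more heavily than precision):
from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)
# any estimator/grid from above can be scored with the custom metric
grid = GridSearchCV(SVC(), param_grid=param_grid, scoring=ftwo_scorer)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)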