In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

import os
import sys

# Location of the local kaon-learn checkout that provides the `kaonlearn`
# package imported below. Set the KAONLEARN_PATH environment variable to point
# at your own checkout; the default is the original author's path so existing
# setups keep working unchanged.
KAONLEARN_PATH = os.environ.get(
    'KAONLEARN_PATH',
    '/Users/kaonpark/workspace/github.com/likejazz/kaon-learn')
# Guarded so that re-running this cell does not keep appending duplicates.
if KAONLEARN_PATH not in sys.path:
    sys.path.append(KAONLEARN_PATH)
import kaonlearn
from kaonlearn.plots import plot_decision_regions
In [2]:
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Synthetic blob dataset, generated with a fixed seed.
X, y = make_blobs(random_state=0)

# Seeded split of features and labels into a training and a held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)

# Build the classifier first, then train it on the training part only.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Report the model's score on the held-out part.
print("Test set score: {:.2f}".format(logreg.score(X=X_test, y=y_test)))
Test set score: 0.88
In [3]:
# Visualise the fitted logreg against the full dataset (X, y as generated by
# make_blobs, i.e. training and test points together).
# NOTE(review): plot_decision_regions is a kaonlearn.plots helper; presumably
# it shades the predicted-class regions and overlays the points — confirm.
plot_decision_regions(X, y, logreg)
Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x110cd7c18>
In [4]:
# Same visualisation restricted to the training portion of the split — the
# points logreg was fitted on.
plot_decision_regions(X_train, y_train, logreg)
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x1110570b8>
In [5]:
# Same visualisation restricted to the held-out portion of the split — the
# points logreg did not see during fitting.
plot_decision_regions(X_test, y_test, logreg)
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x10bf0c7f0>
In [6]:
# Display the held-out feature matrix returned by train_test_split.
X_test
Out[6]:
array([[ 1.28535145,  1.43691285],
       [ 0.94808785,  4.7321192 ],
       [ 3.00251949,  0.74265357],
       [-0.6700734 ,  2.26685667],
       [ 2.11567076,  3.06896151],
       [-2.56114686,  3.59947678],
       [-1.62535654,  2.25440397],
       [ 3.97820955,  2.37817845],
       [-2.02493646,  4.84741432],
       [-0.33887422,  3.23482487],
       [ 2.2635425 ,  1.8743027 ],
       [-0.42724442,  3.57314599],
       [-1.88089792,  1.54293097],
       [ 2.50904929,  5.7731461 ],
       [-0.57748321,  3.0054335 ],
       [ 1.12031365,  5.75806083],
       [-0.88677249,  1.30092622],
       [ 0.9845149 ,  1.95211539],
       [ 2.47034915,  4.09862906],
       [ 2.72756228,  1.3051255 ],
       [-0.73000011,  6.25456272],
       [-2.33031368,  2.22833248],
       [-0.63762777,  4.09104705],
       [ 3.2460247 ,  2.84942165],
       [ 0.4666179 ,  3.86571303]])
In [7]:
# Display the held-out class labels that correspond to the rows of X_test.
y_test
Out[7]:
array([1, 0, 1, 2, 0, 2, 2, 1, 2, 2, 1, 2, 2, 0, 2, 0, 2, 1, 0, 1, 0, 2, 0,
       0, 0])
In [8]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Cross-validate logreg on the full blob dataset, leaving the number of folds
# at the library default.
# NOTE(review): load_iris is imported here but not used in any visible cell.
scores = cross_val_score(estimator=logreg, X=X, y=y)
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [ 0.88235294  0.84848485  0.90909091]
In [9]:
# Cross-validate logreg on the full blob dataset with the fold count set
# explicitly to five.
scores = cross_val_score(estimator=logreg, X=X, y=y, cv=5)
print("Cross-validation scores: {}".format(scores))
Cross-validation scores: [ 0.9047619   0.85714286  0.9047619   0.89473684  0.88888889]
In [10]:
from sklearn.model_selection import KFold
# Explicit five-fold splitter object; no shuffle or random_state is requested
# here, unlike the shuffled, seeded KFold built later in the notebook.
kfold = KFold(n_splits=5)
In [11]:
# Hand the KFold splitter object to cross_val_score via cv (rather than a
# plain integer) and print the per-fold scores.
print("Cross-validation scores:\n{}".format(
    cross_val_score(estimator=logreg, X=X, y=y, cv=kfold)))
Cross-validation scores:
[ 0.85  0.9   0.95  0.8   0.95]
In [12]:
# Rebuild the splitter with three folds (still without shuffling) and print
# the per-fold scores of logreg.
kfold = KFold(n_splits=3)
print("Cross-validation scores:\n{}".format(
    cross_val_score(estimator=logreg, X=X, y=y, cv=kfold)))
Cross-validation scores:
[ 0.88235294  0.93939394  0.84848485]
In [13]:
# Three folds again, this time shuffling the samples before splitting; the
# shuffle is seeded with random_state=0.
kfold = KFold(n_splits=3, shuffle=True, random_state=0)
print("Cross-validation scores:\n{}".format(
    cross_val_score(estimator=logreg, X=X, y=y, cv=kfold)))
Cross-validation scores:
[ 0.91176471  0.87878788  0.93939394]
In [14]:
from sklearn.model_selection import ShuffleSplit

# Ten independent random partitions, each using 50% of the samples for
# training and 50% for testing. random_state is fixed to 0, like every other
# randomised call in this notebook, so the printed scores are reproducible on
# a fresh run instead of changing with each execution.
shuffle_split = ShuffleSplit(test_size=.5, train_size=.5, n_splits=10,
                             random_state=0)
scores = cross_val_score(logreg, X, y, cv=shuffle_split)
print("Cross-validation scores:\n{}".format(scores))
Cross-validation scores:
[ 0.84  0.88  0.9   0.86  0.9   0.86  0.86  0.92  0.94  0.9 ]
In [15]:
# Search space used for the SVC grid search: six candidates, each a factor of
# ten apart, for both C and gamma.
param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    gamma=[0.001, 0.01, 0.1, 1, 10, 100],
)
print("Parameter grid:\n{}".format(param_grid))
Parameter grid:
{'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}
In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Seeded train/test split of the blob data; the search below is fitted on the
# training part only, so the test part stays untouched for the final score.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Try every combination in param_grid on an SVC, using 5-fold cross-validation.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Held-out score first, then the winning combination and its CV score.
print("Test set score: {:.2f}".format(
    grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(
    grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(
    grid_search.best_score_))
Test set score: 0.88
Best parameters: {'C': 1, 'gamma': 1}
Best cross-validation score: 0.95
In [17]:
# Print the repr of the estimator selected by the grid search, which lists all
# of its constructor parameters (not only the two that were searched).
print("Best estimator:\n{}".format(grid_search.best_estimator_))
Best estimator:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
In [18]:
# Wrap the per-candidate cross-validation results in a DataFrame
# (one row per parameter combination).
results = pd.DataFrame(data=grid_search.cv_results_)
# Preview only the leading five rows.
results.head(5)
Out[18]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_C param_gamma params rank_test_score split0_test_score split0_train_score ... split2_test_score split2_train_score split3_test_score split3_train_score split4_test_score split4_train_score std_fit_time std_score_time std_test_score std_train_score
0 0.000852 0.000425 0.36 0.359969 0.001 0.001 {'C': 0.001, 'gamma': 0.001} 20 0.375 0.355932 ... 0.333333 0.366667 0.357143 0.360656 0.357143 0.360656 0.000282 0.000171 0.015533 0.00396
1 0.000534 0.000262 0.36 0.359969 0.001 0.01 {'C': 0.001, 'gamma': 0.01} 20 0.375 0.355932 ... 0.333333 0.366667 0.357143 0.360656 0.357143 0.360656 0.000118 0.000056 0.015533 0.00396
2 0.000494 0.000230 0.36 0.359969 0.001 0.1 {'C': 0.001, 'gamma': 0.1} 20 0.375 0.355932 ... 0.333333 0.366667 0.357143 0.360656 0.357143 0.360656 0.000039 0.000014 0.015533 0.00396
3 0.000656 0.000321 0.36 0.359969 0.001 1 {'C': 0.001, 'gamma': 1} 20 0.375 0.355932 ... 0.333333 0.366667 0.357143 0.360656 0.357143 0.360656 0.000138 0.000082 0.015533 0.00396
4 0.000770 0.000402 0.36 0.359969 0.001 10 {'C': 0.001, 'gamma': 10} 20 0.375 0.355932 ... 0.333333 0.366667 0.357143 0.360656 0.357143 0.360656 0.000211 0.000159 0.015533 0.00396

5 rows × 22 columns

In [19]:
# Arrange the mean cross-validation score of every candidate into a
# (C x gamma) grid. As the results table shows, candidates are listed with C
# as the outer loop and gamma varying fastest, so a row-major reshape puts one
# C value per row and one gamma value per column. The shape is taken from
# param_grid, so editing the grid does not require touching this cell.
grid_shape = (len(param_grid['C']), len(param_grid['gamma']))
scores = np.array(results.mean_test_score).reshape(grid_shape)

ax = sns.heatmap(scores, xticklabels=param_grid['gamma'],
                 yticklabels=param_grid['C'], annot=True)
# Label the axes so the figure can be read without consulting the code.
ax.set(xlabel='gamma', ylabel='C', title='Mean cross-validation score')
ax
Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x1110ace48>
In [20]:
# Two sub-grids searched together: gamma is only listed for the 'rbf' kernel,
# while the 'linear' kernel is searched over C alone.
param_grid = [
    dict(kernel=['rbf'],
         C=[0.001, 0.01, 0.1, 1, 10, 100],
         gamma=[0.001, 0.01, 0.1, 1, 10, 100]),
    dict(kernel=['linear'],
         C=[0.001, 0.01, 0.1, 1, 10, 100]),
]
print("List of grids:\n{}".format(param_grid))
List of grids:
[{'kernel': ['rbf'], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1, 10, 100]}, {'kernel': ['linear'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}]
In [21]:
# Repeat the 5-fold grid search on the training data, this time over the list
# of sub-grids, so that both kernels are candidates.
grid_search = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters: {}".format(
    grid_search.best_params_))
print("Best cross-validation score: {:.2f}".format(
    grid_search.best_score_))
Best parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
Best cross-validation score: 0.95
In [22]:
# Collect the cross-validation results of the two-kernel search.
results = pd.DataFrame(data=grid_search.cv_results_)
# Show the table transposed (one column per candidate) so that it fits the
# page better.
results.transpose()
Out[22]:
0 1 2 3 4 5 6 7 8 9 ... 32 33 34 35 36 37 38 39 40 41
mean_fit_time 0.000600719 0.000883627 0.00111666 0.0011353 0.000996351 0.00113578 0.000873089 0.000591946 0.000497246 0.000456381 ... 0.000386429 0.000453997 0.000532818 0.000525141 0.000346756 0.000330734 0.000304556 0.000306177 0.000330257 0.000708103
mean_score_time 0.000257683 0.000851917 0.000538158 0.000598478 0.000419283 0.000628805 0.000488091 0.0002985 0.000229931 0.000209188 ... 0.000184202 0.000191832 0.000219202 0.000204468 0.000189781 0.000181198 0.00018549 0.000175571 0.000183487 0.000184298
mean_test_score 0.36 0.36 0.36 0.36 0.36 0.36 0.36 0.36 0.36 0.36 ... 0.92 0.92 0.746667 0.453333 0.36 0.906667 0.893333 0.893333 0.893333 0.893333
mean_train_score 0.359969 0.359969 0.359969 0.359969 0.359969 0.359969 0.359969 0.359969 0.359969 0.359969 ... 0.973382 0.976772 1 1 0.359969 0.913258 0.919981 0.936541 0.933319 0.946654
param_C 0.001 0.001 0.001 0.001 0.001 0.001 0.01 0.01 0.01 0.01 ... 100 100 100 100 0.001 0.01 0.1 1 10 100
param_gamma 0.001 0.01 0.1 1 10 100 0.001 0.01 0.1 1 ... 0.1 1 10 100 NaN NaN NaN NaN NaN NaN
param_kernel rbf rbf rbf rbf rbf rbf rbf rbf rbf rbf ... rbf rbf rbf rbf linear linear linear linear linear linear
params {'C': 0.001, 'gamma': 0.001, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 0.01, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 0.1, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 1, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 10, 'kernel': 'rbf'} {'C': 0.001, 'gamma': 100, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 0.01, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 0.1, 'kernel': 'rbf'} {'C': 0.01, 'gamma': 1, 'kernel': 'rbf'} ... {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'} {'C': 100, 'gamma': 1, 'kernel': 'rbf'} {'C': 100, 'gamma': 10, 'kernel': 'rbf'} {'C': 100, 'gamma': 100, 'kernel': 'rbf'} {'C': 0.001, 'kernel': 'linear'} {'C': 0.01, 'kernel': 'linear'} {'C': 0.1, 'kernel': 'linear'} {'C': 1, 'kernel': 'linear'} {'C': 10, 'kernel': 'linear'} {'C': 100, 'kernel': 'linear'}
rank_test_score 25 25 25 25 25 25 25 25 25 25 ... 3 3 19 22 25 7 10 10 10 10
split0_test_score 0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375 ... 1 0.9375 0.75 0.4375 0.375 0.9375 0.875 0.875 0.875 0.875
split0_train_score 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 ... 0.966102 0.966102 1 1 0.355932 0.915254 0.898305 0.898305 0.915254 0.915254
split1_test_score 0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375 0.375 ... 0.875 0.875 0.8125 0.4375 0.375 0.875 0.875 0.875 0.9375 0.9375
split1_train_score 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 0.355932 ... 0.983051 1 1 1 0.355932 0.898305 0.932203 0.949153 0.949153 0.966102
split2_test_score 0.333333 0.333333 0.333333 0.333333 0.333333 0.333333 0.333333 0.333333 0.333333 0.333333 ... 0.8 0.933333 0.666667 0.4 0.333333 0.866667 0.866667 0.866667 0.8 0.8
split2_train_score 0.366667 0.366667 0.366667 0.366667 0.366667 0.366667 0.366667 0.366667 0.366667 0.366667 ... 0.983333 0.983333 1 1 0.366667 0.916667 0.933333 0.95 0.933333 0.966667
split3_test_score 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 ... 0.928571 0.928571 0.642857 0.5 0.357143 0.857143 0.857143 0.857143 0.857143 0.857143
split3_train_score 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 ... 0.967213 0.967213 1 1 0.360656 0.934426 0.934426 0.95082 0.934426 0.95082
split4_test_score 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 0.357143 ... 1 0.928571 0.857143 0.5 0.357143 1 1 1 1 1
split4_train_score 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 0.360656 ... 0.967213 0.967213 1 1 0.360656 0.901639 0.901639 0.934426 0.934426 0.934426
std_fit_time 0.000149639 0.000419799 0.000435166 0.000215438 0.00038384 0.000299698 0.000253743 0.000120045 8.02341e-05 3.29161e-05 ... 3.51941e-05 3.22151e-05 8.05663e-05 2.90466e-05 1.13799e-05 6.73911e-06 6.78819e-06 8.73719e-06 2.1366e-05 0.00031625
std_score_time 2.08539e-05 0.00104085 0.000159495 0.000170187 0.000142739 0.000141239 0.000211396 5.26278e-05 1.89746e-05 1.90744e-05 ... 6.81327e-06 9.67004e-06 3.48269e-05 6.07401e-06 6.18971e-06 4.4604e-06 1.68924e-05 1.94629e-06 1.54289e-05 1.03067e-05
std_test_score 0.0155329 0.0155329 0.0155329 0.0155329 0.0155329 0.0155329 0.0155329 0.0155329 0.0155329 0.0155329 ... 0.0767184 0.023671 0.0806029 0.0385861 0.0155329 0.0531096 0.0515167 0.0515167 0.0678116 0.0678116
std_train_score 0.00395963 0.00395963 0.00395963 0.00395963 0.00395963 0.00395963 0.00395963 0.00395963 0.00395963 0.00395963 ... 0.00802033 0.0132604 0 0 0.00395963 0.012824 0.0163866 0.0200526 0.0107656 0.0196547

23 rows × 42 columns

In [23]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the grid-searched model on the held-out set
# (true labels first, predictions second).
confusion = confusion_matrix(y_true=y_test,
                             y_pred=grid_search.predict(X_test))
print("Confusion matrix:\n{}".format(confusion))
Confusion matrix:
[[ 6  2  1]
 [ 0  6  0]
 [ 0  0 10]]
In [24]:
# Number of held-out samples; the entries of the confusion matrix printed
# above add up to this total.
len(y_test)
Out[24]:
25
In [25]:
# Visualise the held-out points with the fitted GridSearchCV object passed
# directly as the classifier.
# NOTE(review): assumes plot_decision_regions only needs a fitted object with
# predict() — confirm against kaonlearn.plots.
plot_decision_regions(X_test, y_test, grid_search)
Out[25]:
<matplotlib.axes._subplots.AxesSubplot at 0x111588550>
In [26]:
# Same held-out points, now with the logistic-regression model, for a
# side-by-side comparison with the grid-searched SVC.
plot_decision_regions(X_test, y_test, logreg)
Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x111588400>
In [27]:
# NOTE(review): kaonlearn.plots helper; judging by its name it renders an
# explanatory confusion-matrix figure — no output is recorded here, confirm.
kaonlearn.plots.plot_confusion_matrix_illustration()
In [28]:
# NOTE(review): kaonlearn.plots helper; presumably draws the binary confusion
# matrix layout (TP / FP / FN / TN) discussed in the text below — confirm.
kaonlearn.plots.plot_binary_confusion_matrix()

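We call correctly classified samples belonging to the positive class *true positives* (TP) and correctly classified samples belonging to the negative class *true negatives* (TN). Conversely, negative samples that are incorrectly classified as positive are *false positives* (FP), and positive samples that are incorrectly classified as negative are *false negatives* (FN). These terms are usually abbreviated FP, FN, TP, and TN and lead to the following interpretation for the confusion matrix and the metrics derived from it: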

$ Accuracy = \frac{TP+TN}{TP+TN+FP+FN} $

$ Precision = \frac{TP}{TP+FP} $

$ Recall = \frac{TP}{TP+FN} $

$ F_1 = 2 \cdot \frac{Precision \cdot Recall}{Precision + Recall} $

In [29]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support of the grid-searched model on
# the held-out set.
print(classification_report(y_true=y_test,
                            y_pred=grid_search.predict(X_test)))
             precision    recall  f1-score   support

          0       1.00      0.67      0.80         9
          1       0.75      1.00      0.86         6
          2       0.91      1.00      0.95        10

avg / total       0.90      0.88      0.87        25

In [30]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support of the logistic-regression model
# on the same held-out set.
print(classification_report(y_true=y_test,
                            y_pred=logreg.predict(X_test)))
             precision    recall  f1-score   support

          0       0.88      0.78      0.82         9
          1       0.75      1.00      0.86         6
          2       1.00      0.90      0.95        10

avg / total       0.90      0.88      0.88        25

In [31]:
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits

# Switch to the digits dataset. Note that this rebinds X_train / X_test /
# y_train / y_test, replacing the blob split used by the cells above.
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

# Train a fresh logistic regression and predict the held-out samples.
lr = LogisticRegression()
lr.fit(X_train, y_train)
pred = lr.predict(X_test)

print("Accuracy: {:.3f}".format(
    accuracy_score(y_true=y_test, y_pred=pred)))
print("Confusion matrix:\n{}".format(
    confusion_matrix(y_true=y_test, y_pred=pred)))
Accuracy: 0.953
Confusion matrix:
[[37  0  0  0  0  0  0  0  0  0]
 [ 0 39  0  0  0  0  2  0  2  0]
 [ 0  0 41  3  0  0  0  0  0  0]
 [ 0  0  1 43  0  0  0  0  0  1]
 [ 0  0  0  0 38  0  0  0  0  0]
 [ 0  1  0  0  0 47  0  0  0  0]
 [ 0  0  0  0  0  0 52  0  0  0]
 [ 0  1  0  1  1  0  0 45  0  0]
 [ 0  3  1  0  0  0  0  0 43  1]
 [ 0  0  0  1  0  1  0  0  1 44]]
In [32]:
# Render the digits confusion matrix as an annotated heatmap, which is easier
# to scan than the printed array.
sns.heatmap(confusion_matrix(y_test, pred), annot=True)
Out[32]:
<matplotlib.axes._subplots.AxesSubplot at 0x11192c860>
In [33]:
# Per-digit precision, recall, F1 and support for the digits predictions.
print(classification_report(y_test, pred))
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        37
          1       0.89      0.91      0.90        43
          2       0.95      0.93      0.94        44
          3       0.90      0.96      0.92        45
          4       0.97      1.00      0.99        38
          5       0.98      0.98      0.98        48
          6       0.96      1.00      0.98        52
          7       1.00      0.94      0.97        48
          8       0.93      0.90      0.91        48
          9       0.96      0.94      0.95        47

avg / total       0.95      0.95      0.95       450