Notebook

In [17]:

%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [11, 7]

In [23]:

import seaborn as sns
import pandas as pd
import numpy as np 
from sklearn.datasets import load_iris

iris = load_iris()
df=pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                 columns= list(iris['feature_names']) + ['target'])
sns.pairplot(df,hue='target')

Out[23]:

<seaborn.axisgrid.PairGrid at 0x1a2e795fd0>

In [24]:

from sklearn import tree

clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(iris.data, iris.target)
tree.plot_tree(clf)  

Out[24]:

[Text(306.90000000000003, 348.81, 'X[3] <= 0.8\nentropy = 0.667\nsamples = 150\nvalue = [50, 50, 50]'),
 Text(259.6846153846154, 285.39, 'entropy = 0.0\nsamples = 50\nvalue = [50, 0, 0]'),
 Text(354.11538461538464, 285.39, 'X[3] <= 1.75\nentropy = 0.5\nsamples = 100\nvalue = [0, 50, 50]'),
 Text(188.8615384615385, 221.97, 'X[2] <= 4.95\nentropy = 0.168\nsamples = 54\nvalue = [0, 49, 5]'),
 Text(94.43076923076924, 158.55, 'X[3] <= 1.65\nentropy = 0.041\nsamples = 48\nvalue = [0, 47, 1]'),
 Text(47.21538461538462, 95.13, 'entropy = 0.0\nsamples = 47\nvalue = [0, 47, 0]'),
 Text(141.64615384615388, 95.13, 'entropy = 0.0\nsamples = 1\nvalue = [0, 0, 1]'),
 Text(283.29230769230776, 158.55, 'X[3] <= 1.55\nentropy = 0.444\nsamples = 6\nvalue = [0, 2, 4]'),
 Text(236.0769230769231, 95.13, 'entropy = 0.0\nsamples = 3\nvalue = [0, 0, 3]'),
 Text(330.50769230769237, 95.13, 'X[2] <= 5.45\nentropy = 0.444\nsamples = 3\nvalue = [0, 2, 1]'),
 Text(283.29230769230776, 31.710000000000036, 'entropy = 0.0\nsamples = 2\nvalue = [0, 2, 0]'),
 Text(377.723076923077, 31.710000000000036, 'entropy = 0.0\nsamples = 1\nvalue = [0, 0, 1]'),
 Text(519.3692307692309, 221.97, 'X[2] <= 4.85\nentropy = 0.043\nsamples = 46\nvalue = [0, 1, 45]'),
 Text(472.1538461538462, 158.55, 'X[1] <= 3.1\nentropy = 0.444\nsamples = 3\nvalue = [0, 1, 2]'),
 Text(424.9384615384616, 95.13, 'entropy = 0.0\nsamples = 2\nvalue = [0, 0, 2]'),
 Text(519.3692307692309, 95.13, 'entropy = 0.0\nsamples = 1\nvalue = [0, 1, 0]'),
 Text(566.5846153846155, 158.55, 'entropy = 0.0\nsamples = 43\nvalue = [0, 0, 43]')]

In [16]:

# Parameters
n_classes = 3
plot_colors = "ryb"
plot_step = 0.02

for pairidx, pair in enumerate([[0, 1], [0, 2], [0, 3],
                                [1, 2], [1, 3], [2, 3]]):
    # We only take the two corresponding features
    X = iris.data[:, pair]
    y = iris.target
    clf = tree.DecisionTreeClassifier().fit(X, y)

    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    plt.xlabel(iris.feature_names[pair[0]])
    plt.ylabel(iris.feature_names[pair[1]])

    # Plot the training points
    for i, color in zip(range(n_classes), plot_colors):
        idx = np.where(y == i)
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=iris.target_names[i],
                    cmap=plt.cm.RdYlBu, edgecolor='black', s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right', borderpad=0, handletextpad=0)
plt.axis("tight")

plt.figure()
clf = tree.DecisionTreeClassifier().fit(iris.data, iris.target)
tree.plot_tree(clf, filled=True)
plt.show()

In [28]:

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels


def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    # Only use the labels that appear in the data
    classes = classes[unique_labels(y_true, y_pred)]
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

X = iris.data
y = iris.target
class_names = iris.target_names

# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

classifier = svm.SVC(kernel='linear', C=0.01)
y_pred = classifier.fit(X_train, y_train).predict(X_test)

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()

Confusion matrix, without normalization
[[13  0  0]
 [ 0 10  6]
 [ 0  0  9]]
Normalized confusion matrix
[[1.   0.   0.  ]
 [0.   0.62 0.38]
 [0.   0.   1.  ]]

In [36]:

classifier = tree.DecisionTreeClassifier(max_depth=10)
y_pred = classifier.fit(X_train, y_train).predict(X_test)

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()
tree.plot_tree(classifier, filled=True)
plt.show()

Confusion matrix, without normalization
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]
Normalized confusion matrix
[[1.   0.   0.  ]
 [0.   0.94 0.06]
 [0.   0.   1.  ]]

In [38]:

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
y_pred = classifier.fit(X_train, y_train).predict(X_test)

np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names,
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plot_confusion_matrix(y_test, y_pred, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')

plt.show()
plt.show()

/Users/roman/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.
  "10 in version 0.20 to 100 in 0.22.", FutureWarning)

Confusion matrix, without normalization
[[13  0  0]
 [ 0 15  1]
 [ 0  0  9]]
Normalized confusion matrix
[[1.   0.   0.  ]
 [0.   0.94 0.06]
 [0.   0.   1.  ]]

In [48]:

from sklearn import datasets, svm, metrics

# The digits dataset
digits = datasets.load_digits()

# The data that we are interested in is made of 8x8 images of digits, let's
# have a look at the first 4 images, stored in the `images` attribute of the
# dataset.  If we were working from image files, we could load them using
# matplotlib.pyplot.imread.  Note that each image must have the same size. For these
# images, we know which digit they represent: it is given in the 'target' of
# the dataset.
images_and_labels = list(zip(digits.images, digits.target))
for index, (image, label) in enumerate(images_and_labels[:4]):
    plt.subplot(2, 4, index + 1)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
#classifier = svm.SVC(gamma=0.001)
#classifier = tree.DecisionTreeClassifier()
classifier = tree.DecisionTreeClassifier()

# We learn the digits on the first half of the digits
classifier.fit(data[:n_samples // 2], digits.target[:n_samples // 2])

# Now predict the value of the digit on the second half:
expected = digits.target[n_samples // 2:]
predicted = classifier.predict(data[n_samples // 2:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

images_and_predictions = list(zip(digits.images[n_samples // 2:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()

plot_confusion_matrix(expected, predicted, classes=digits.target, 
                      title='Confusion matrix, without normalization')

Classification report for classifier DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best'):
              precision    recall  f1-score   support

           0       0.91      0.90      0.90        88
           1       0.83      0.64      0.72        91
           2       0.91      0.74      0.82        86
           3       0.69      0.75      0.72        91
           4       0.86      0.78      0.82        92
           5       0.61      0.77      0.68        91
           6       0.89      0.92      0.91        91
           7       0.90      0.79      0.84        89
           8       0.59      0.64      0.61        88
           9       0.63      0.73      0.67        92

    accuracy                           0.77       899
   macro avg       0.78      0.77      0.77       899
weighted avg       0.78      0.77      0.77       899


Confusion matrix:
[[79  0  0  1  3  2  0  0  0  3]
 [ 0 58  3  8  2  1  0  2  6 11]
 [ 1  3 64  8  0  1  2  0  5  2]
 [ 0  3  1 68  0  2  0  4 10  3]
 [ 6  0  0  0 72  4  4  0  4  2]
 [ 0  1  0  0  1 70  0  0  4 15]
 [ 1  1  0  0  3  0 84  0  2  0]
 [ 0  1  0  6  1  8  0 70  2  1]
 [ 0  2  2  4  2 13  4  2 56  3]
 [ 0  1  0  4  0 14  0  0  6 67]]

Confusion matrix, without normalization
[[79  0  0  1  3  2  0  0  0  3]
 [ 0 58  3  8  2  1  0  2  6 11]
 [ 1  3 64  8  0  1  2  0  5  2]
 [ 0  3  1 68  0  2  0  4 10  3]
 [ 6  0  0  0 72  4  4  0  4  2]
 [ 0  1  0  0  1 70  0  0  4 15]
 [ 1  1  0  0  3  0 84  0  2  0]
 [ 0  1  0  6  1  8  0 70  2  1]
 [ 0  2  2  4  2 13  4  2 56  3]
 [ 0  1  0  4  0 14  0  0  6 67]]

Out[48]:

<matplotlib.axes._subplots.AxesSubplot at 0x1a2b969748>

In [ ]: