The load_digits() function within the sklearn.datasets library returns a copy of handwritten digits.
from sklearn.datasets import load_digits
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
digits_data = load_digits()
digits_data.keys()
dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])
labels = pd.Series(digits_data['target'])
data = pd.DataFrame(digits_data['data'])
data.head(1)
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.0 | 5.0 | 13.0 | 9.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 6.0 | 13.0 | 10.0 | 0.0 | 0.0 | 0.0 |
1 rows × 64 columns
first_image = data.iloc[0]
np_image = first_image.values
np_image = np_image.reshape(8,8)
plt.imshow(np_image, cmap='gray_r')
<matplotlib.image.AxesImage at 0x7f9f0bde2f10>
f, axarr = plt.subplots(2, 4)
# display the images corresponding to rows 0, 100, 200 and 300
axarr[0, 0].imshow(data.iloc[0].values.reshape(8,8), cmap='gray_r')
axarr[0, 1].imshow(data.iloc[99].values.reshape(8,8), cmap='gray_r')
axarr[0, 2].imshow(data.iloc[199].values.reshape(8,8), cmap='gray_r')
axarr[0, 3].imshow(data.iloc[299].values.reshape(8,8), cmap='gray_r')
# display the images corresponding to rows 1000, 1100, 1200 and 1300
axarr[1, 0].imshow(data.iloc[999].values.reshape(8,8), cmap='gray_r')
axarr[1, 1].imshow(data.iloc[1099].values.reshape(8,8), cmap='gray_r')
axarr[1, 2].imshow(data.iloc[1199].values.reshape(8,8), cmap='gray_r')
axarr[1, 3].imshow(data.iloc[1299].values.reshape(8,8), cmap='gray_r')
<matplotlib.image.AxesImage at 0x7f9f0bc3fe20>
The KNN model is a non-linear model which is more an algorithm than a model.
KNN compares observations in the test set to that of those in the training set.
The algorithm looks for similar observations in the training set with that of test set.
Linear and logistic regression models assume linearity between the features and the target.
The algorithm finds label with the most nearby observation and assigns that as prediction to the unseen observation
Before we apply the KNN model on the data set, we
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
def train_knn(nneighbors, train_features, train_labels):
knn = KNeighborsClassifier(n_neighbors = nneighbors)
knn.fit(train_features, train_labels)
return knn
def test(model, test_features, test_labels):
predictions = model.predict(test_features)
train_test_df = pd.DataFrame()
train_test_df['correct_label'] = test_labels
train_test_df['predicted_label'] = predictions
overall_accuracy = sum(train_test_df['predicted_label'] == train_test_df['correct_label'])/len(train_test_df)
return overall_accuracy
def cross_validate(k):
fold_accuracies = []
kf = KFold(n_splits = 4, random_state=2)
for train_index, test_index in kf.split(data):
train_features, test_features = data.loc[train_index], data.loc[test_index]
train_labels, test_labels = labels.loc[train_index], labels.loc[test_index]
model = train_knn(k, train_features, train_labels)
overall_accuracy = test(model, test_features, test_labels)
fold_accuracies.append(overall_accuracy)
return fold_accuracies
knn_one_accuracies = cross_validate(1)
np.mean(knn_one_accuracies)
/dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
0.9677233358079684
k_values = list(range(1, 10))
k_overall_accuracies = []
for k in k_values:
k_accuracies = cross_validate(k)
k_mean_accuracy = np.mean(k_accuracies)
k_overall_accuracies.append(k_mean_accuracy)
plt.figure(figsize=(8, 4))
plt.title("Mean Accuracy vs k")
plt.plot(k_values, k_overall_accuracies)
/dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
[<matplotlib.lines.Line2D at 0x7f9f0bbaa970>]
Train and test neural network with following single hidden layers using 4-fold cross validation
SHL of 8 neurons
SHL of 16 neurons
SHL of 32 neurons
SHL of 64 neurons
SHL of 128 neurons
SHL of 256 neurons
Model performances ranked on overall accuracies
Overfitting of models, ones peforming well on training sets but perform poorly on test sets
The performance and visualization of each model on the train and test sets.
Understanding the divergence of models as a result of the same.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
# 50% train/test validation
def train_nn(neuron_arch, train_features, train_labels):
mlp = MLPClassifier(hidden_layer_sizes=neuron_arch)
mlp.fit(train_features, train_labels)
return mlp
def test(model, test_features, test_labels):
predictions = model.predict(test_features)
train_test_df = pd.DataFrame()
train_test_df['correct_label'] = test_labels
train_test_df['predicted_label'] = predictions
overall_accuracy = sum(train_test_df['predicted_label'] == train_test_df['correct_label'])/len(train_test_df)
return overall_accuracy
def cross_validate(neuron_arch):
fold_accuracies = []
kf = KFold(n_splits = 4, random_state=2)
for train_index, test_index in kf.split(data):
train_features, test_features = data.loc[train_index], data.loc[test_index]
train_labels, test_labels = labels.loc[train_index], labels.loc[test_index]
model = train_nn(neuron_arch, train_features, train_labels)
overall_accuracy = test(model, test_features, test_labels)
fold_accuracies.append(overall_accuracy)
return fold_accuracies
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
nn_one_neurons = [
(8,),
(16,),
(32,),
(64,),
(128,),
(256,)
]
nn_one_accuracies = []
for n in nn_one_neurons:
nn_accuracies = cross_validate(n)
nn_mean_accuracy = np.mean(nn_accuracies)
nn_one_accuracies.append(nn_mean_accuracy)
plt.figure(figsize=(8,4))
plt.title("Mean Accuracy vs. Neurons in Single Hidden Layer")
x = [i[0] for i in nn_one_neurons]
plt.plot(x, nn_one_accuracies)
/dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
[<matplotlib.lines.Line2D at 0x7f9f0b9decd0>]
Two hidden layers are used and continue to increase the number of neurons with each layer.
Using 4-fold cross validation to train model using two hidden layers.
nn_two_neurons = [
(64, 64),
(128, 128),
(256, 256)
]
nn_two_accuracies = []
for n in nn_two_neurons:
nn_accuracies = cross_validate(n)
nn_mean_accuracy = np.mean(nn_accuracies)
nn_two_accuracies.append(nn_mean_accuracy)
plt.figure(figsize=(8,4))
plt.title("Mean Accuracy vs Neurons in two hidden layers")
x = [i[0] for i in nn_two_neurons]
plt.plot(x, nn_two_accuracies)
/dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
[<matplotlib.lines.Line2D at 0x7f9f0b8651c0>]
nn_two_accuracies
[0.9376614699331849, 0.9493541202672606, 0.9554763672358328]
More hidden layers increase the overfitting. Increasing the k-fold cross validation to 6 and using 3 hidden layers and testing the same.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
# 50% train/test validation
def train_nn(neuron_arch, train_features, train_labels):
mlp = MLPClassifier(hidden_layer_sizes=neuron_arch)
mlp.fit(train_features, train_labels)
return mlp
def test(model, test_features, test_labels):
predictions = model.predict(test_features)
train_test_df = pd.DataFrame()
train_test_df['correct_label'] = test_labels
train_test_df['predicted_label'] = predictions
overall_accuracy = sum(train_test_df['predicted_label'] == train_test_df['correct_label'])/len(train_test_df)
return overall_accuracy
def cross_validate_six(neuron_arch):
fold_accuracies = []
kf = KFold(n_splits=6, random_state=2)
for train_index, test_index in kf.split(data):
train_features, test_features = data.loc[train_index], data.loc[test_index]
train_labels, test_labels = labels.loc[train_index], labels.loc[test_index]
model = train_nn(neuron_arch, train_features, train_labels)
overall_accuracy = test(model, test_features, test_labels)
fold_accuracies.append(overall_accuracy)
return fold_accuracies
nn_three_neurons = [
(10, 10, 10),
(64, 64, 64),
(128, 128, 128)
]
nn_three_accuracies = []
for n in nn_three_neurons:
nn_accuracies = cross_validate_six(n)
nn_mean_accuracy = np.mean(nn_accuracies)
nn_three_accuracies.append(nn_mean_accuracy)
plt.figure(figsize=(8,4))
plt.title("Mean Accuracy vs Neurons in three hidden layers")
x = [i[0] for i in nn_three_neurons]
plt.plot(x, nn_three_accuracies)
/dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/neural_network/_multilayer_perceptron.py:582: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True. /dataquest/system/env/python3/lib/python3.8/site-packages/sklearn/model_selection/_split.py:293: FutureWarning: Setting a random_state has no effect since shuffle is False. This will raise an error in 0.24. You should leave random_state to its default (None), or set shuffle=True.
[<matplotlib.lines.Line2D at 0x7f9f0b923700>]
nn_three_accuracies
[0.8948383500557413, 0.9460274990709774, 0.9526978818283166]