# fake data
import pandas as pd
train = pd.DataFrame({'id':[0,1,2], 'length':[0.9,0.3,0.6], 'mass':[0.1,0.2,0.8], 'rings':[40,50,60]})
test = pd.DataFrame({'length':[0.59], 'mass':[0.79], 'rings':[54]})
# training data
train
| | id | length | mass | rings |
|---|---|---|---|---|
| 0 | 0 | 0.9 | 0.1 | 40 |
| 1 | 1 | 0.3 | 0.2 | 50 |
| 2 | 2 | 0.6 | 0.8 | 60 |
# testing data
test
| | length | mass | rings |
|---|---|---|---|
| 0 | 0.59 | 0.79 | 54 |
# define X and y
feature_cols = ['length', 'mass', 'rings']
X = train[feature_cols]
y = train.id
# KNN with K=1
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X, y)
KNeighborsClassifier(n_neighbors=1)
# what "should" it predict?
knn.predict(test)
array([1], dtype=int64)
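Intuitively the test point's length and mass are closest to row 2, yet KNN predicts 1. A minimal sketch computing the Euclidean distances by hand (KNN's default metric) shows why: rings is on a much larger scale than the other features, so it dominates the distance calculation.
# compute the Euclidean distance from the test point to each training row by hand
import numpy as np
dists = np.sqrt(((X.values - test.values) ** 2).sum(axis=1))
print(dists)                 # roughly [14.02, 4.05, 6.00]: driven almost entirely by rings
print(y[np.argmin(dists)])   # 1, matching knn.predict(test)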
# allow plots to appear in the notebook
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14
plt.rcParams['figure.figsize'] = (5, 5)
# create a "colors" array for plotting
import numpy as np
colors = np.array(['red', 'green', 'blue'])
# scatter plot of training data, colored by id (0=red, 1=green, 2=blue)
plt.scatter(train.mass, train.rings, c=colors[train.id], s=50)
# testing data
plt.scatter(test.mass, test.rings, c='white', s=50, edgecolors='black')
# add labels
plt.xlabel('mass')
plt.ylabel('rings')
plt.title('How we interpret the data')
# adjust the x-limits
plt.scatter(train.mass, train.rings, c=colors[train.id], s=50)
plt.scatter(test.mass, test.rings, c='white', s=50, edgecolors='black')
plt.xlabel('mass')
plt.ylabel('rings')
plt.title('How KNN interprets the data')
plt.xlim(0, 30)
StandardScaler performs "standardization" of features, also known as "center and scale" or "z-score normalization": each value is transformed to z = (x - mean) / standard deviation, using that feature's own mean and standard deviation.
# standardize the features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# original values
X.values
array([[  0.9,   0.1,  40. ],
       [  0.3,   0.2,  50. ],
       [  0.6,   0.8,  60. ]])
# standardized values
X_scaled
array([[ 1.22474487, -0.86266219, -1.22474487],
       [-1.22474487, -0.53916387,  0.        ],
       [ 0.        ,  1.40182605,  1.22474487]])
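As a quick sanity check, each standardized column should now have mean 0 and standard deviation 1 (StandardScaler divides by the population standard deviation, which matches NumPy's default):
# column means are ~0 (up to floating-point error) and column stds are exactly 1
print(X_scaled.mean(axis=0))
print(X_scaled.std(axis=0))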
# figure out how it standardized
print(scaler.mean_)
print(scaler.scale_)
[  0.6          0.36666667  50.        ]
[ 0.24494897   0.30912062   8.16496581]
# manually standardize
(X.values - scaler.mean_) / scaler.scale_
array([[ 1.22474487, -0.86266219, -1.22474487],
       [-1.22474487, -0.53916387,  0.        ],
       [ 0.        ,  1.40182605,  1.22474487]])
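Closing the loop on the fake data: refit KNN on the standardized features, transform the test point with the same scaler, and the prediction changes, because length and mass now carry as much weight as rings.
# refit KNN on the standardized features and scale the test point the same way
knn.fit(X_scaled, y)
knn.predict(scaler.transform(test))   # now predicts array([2]): the row nearest in length and mass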
# read three columns from the dataset into a DataFrame
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
col_names = ['label', 'color', 'proline']
wine = pd.read_csv(url, header=None, names=col_names, usecols=[0, 10, 13])
wine.head()
| | label | color | proline |
|---|---|---|---|
| 0 | 1 | 5.64 | 1065 |
| 1 | 1 | 4.38 | 1050 |
| 2 | 1 | 5.68 | 1185 |
| 3 | 1 | 7.80 | 1480 |
| 4 | 1 | 4.32 | 735 |
# summary statistics (note the scale difference: color spans about 1.28-13, proline about 278-1680)
wine.describe()
| | label | color | proline |
|---|---|---|---|
| count | 178.000000 | 178.000000 | 178.000000 |
| mean | 1.938202 | 5.058090 | 746.893258 |
| std | 0.775035 | 2.318286 | 314.907474 |
| min | 1.000000 | 1.280000 | 278.000000 |
| 25% | 1.000000 | 3.220000 | 500.500000 |
| 50% | 2.000000 | 4.690000 | 673.500000 |
| 75% | 3.000000 | 6.200000 | 985.000000 |
| max | 3.000000 | 13.000000 | 1680.000000 |
# define X and y
feature_cols = ['color', 'proline']
X = wine[feature_cols]
y = wine.label
# split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# standardize X_train
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
# check that it standardized properly
print(X_train_scaled[:, 0].mean())
print(X_train_scaled[:, 0].std())
print(X_train_scaled[:, 1].mean())
print(X_train_scaled[:, 1].std())
-3.90664944003e-16
1.0
1.6027279754e-16
1.0
# standardize X_test
X_test_scaled = scaler.transform(X_test)
# is this right? (yes: the test set must be scaled using the parameters learned from
# the training set, so its scaled mean and std will not be exactly 0 and 1)
print(X_test_scaled[:, 0].mean())
print(X_test_scaled[:, 0].std())
print(X_test_scaled[:, 1].mean())
print(X_test_scaled[:, 1].std())
0.0305898576303
0.866822198488
0.0546533341088
1.14955947533
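A one-line check confirms the transform used the training set's parameters rather than statistics computed from X_test (a sketch; np was imported earlier):
# X_test was scaled with the training set's mean_ and scale_, not its own
np.allclose(X_test_scaled, (X_test.values - scaler.mean_) / scaler.scale_)   # returns True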
# KNN accuracy on original data
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_class = knn.predict(X_test)
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred_class))
0.644444444444
# KNN accuracy on scaled data (distances are no longer dominated by proline)
knn.fit(X_train_scaled, y_train)
y_pred_class = knn.predict(X_test_scaled)
print(metrics.accuracy_score(y_test, y_pred_class))
0.866666666667
# define X and y
feature_cols = ['color', 'proline']
X = wine[feature_cols]
y = wine.label
# proper cross-validation on the original (unscaled) data
knn = KNeighborsClassifier(n_neighbors=3)
from sklearn.model_selection import cross_val_score
cross_val_score(knn, X, y, cv=5, scoring='accuracy').mean()
0.71983168041991563
# improper cross-validation on the scaled data: the scaler is fit on all of X,
# so information from each test fold leaks into the scaling of the training folds
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
cross_val_score(knn, X_scaled, y, cv=5, scoring='accuracy').mean()
0.90104247104247115
# fix the cross-validation process using Pipeline
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
0.89516011810129448
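The same pipeline can also be fit and used directly on the earlier train/test split: it learns the scaling from X_train during fit and applies it to X_test at prediction time, with no manual transform steps (a sketch reusing X_train/X_test from above):
# fit the pipeline on the training set and evaluate on the test set
pipe.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, pipe.predict(X_test)))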
Pipeline can also be used with GridSearchCV for parameter searching:
# search for an optimal n_neighbors value using GridSearchCV
neighbors_range = list(range(1, 21))
param_grid = dict(kneighborsclassifier__n_neighbors=neighbors_range)
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)
0.910112359551
{'kneighborsclassifier__n_neighbors': 1}
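GridSearchCV refits the best pipeline on all of X by default (refit=True), so the fitted grid can predict directly. The sample below is hypothetical, made up only to illustrate the call:
# predict the label of a hypothetical wine (color=5.0, proline=1000)
new_wine = pd.DataFrame({'color': [5.0], 'proline': [1000]}, columns=feature_cols)
grid.predict(new_wine)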