In [ ]:

## Try a bit of sklearn tutorial

In [1]:

from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

Load data and set up feature matrix X and the response vector y.

In [2]:

# Load the bundled iris dataset, then unpack the feature matrix (X) and the
# response vector (y) from it in one step.
iris = load_iris()
X, y = iris.data, iris.target

Instantiate a model.

In [3]:

# k-nearest-neighbours classifier configured to consult the 5 nearest training samples.
knn = KNeighborsClassifier(n_neighbors=5)

Train it.

In [4]:

# Fit on the complete dataset (all of X and y); no records are held out at this point.
knn.fit(X, y)

Out[4]:

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           n_neighbors=5, p=2, weights='uniform')

Make a prediction.

In [6]:

# Classify three hand-written samples; each row is one sample made of four
# feature values, in the same column order as X.
knn.predict([
    [3, 5, 4, 2],
    [1, 2, 2, 1],
    [3, 3, 3, 3],
])

Out[6]:

array([1, 0, 1])

Yay!

Let's do a simple form of cross-validation (a single hold-out split): randomly leave some records out of the training dataset, then predict the omitted records and see what accuracy we achieve.

In [7]:

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. Fall back to
# the old location so the notebook still runs on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split

In [8]:

# Hold out 10% of the records for testing. Fixing `random_state` makes the
# shuffle deterministic, so the split (and every score computed from it) is
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [9]:

# Re-fit the same classifier object, this time on the training portion only.
knn.fit(X_train, y_train)

Out[9]:

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           n_neighbors=5, p=2, weights='uniform')

In [12]:

# Predict labels for the held-out test records.
y_pred = knn.predict(X_test)

In [13]:

from sklearn import metrics

In [14]:

# Score the predictions against the true labels of the held-out records.
metrics.accuracy_score(y_test, y_pred)

Out[14]:

1.0

Nice!

In [23]:

# Sweep the neighbour count k over 1..25 and record the test-set accuracy for
# each setting. Note that `knn` and `y_pred` are rebound on every pass, so
# after the loop they refer to the k=25 model and its predictions.
k_range = range(1, 26)
results = []
for k in k_range:
    # fit() returns the estimator itself, so construction and training chain.
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    results += [metrics.accuracy_score(y_test, y_pred)]

In [24]:

import matplotlib.pyplot as plt
%matplotlib inline

plt.plot(results)

Out[24]:

[<matplotlib.lines.Line2D at 0x112db0c10>]

In [1]:

# List the contents of the `data` directory via IPython's `ls` shortcut.
# NOTE(review): the recorded output shows the directory did not exist when this
# cell was run - confirm the path or remove this leftover cell.
ls data

ls: cannot access data: No such file or directory

In [ ]: