## Try a bit of sklearn tutorial
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
Load the data and set up the feature matrix X and the response vector y.
# Load the iris dataset and unpack it into the feature matrix X and the
# response vector y.
iris = load_iris()
X, y = iris.data, iris.target
Instantiate a model.
# k-nearest-neighbours classifier configured to consult 5 neighbours.
knn = KNeighborsClassifier(n_neighbors=5)
Train it.
# Fit on the full dataset (no hold-out yet); the notebook echoes the
# estimator's repr because this call is the last expression of the cell.
knn.fit(X, y)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')
Make a prediction.
# Predict classes for three hand-written samples, each with four feature values.
knn.predict([[3,5,4,2], [1,2,2,1], [3,3,3,3]])
array([1, 0, 1])
Yay!
Let's do some validation: randomly hold out some records from the training dataset, then predict the held-out records and see what sort of reliability we achieve. (Strictly speaking, this is a single train/test split rather than full cross-validation.)
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Hold out 10% of the records for testing and fit on the remaining 90%.
# No random_state is set, so the split (and the score) can vary per run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
knn.fit(X_train, y_train)
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', n_neighbors=5, p=2, weights='uniform')
from sklearn import metrics

# Score the held-out records: compare the model's predictions with y_test.
y_pred = knn.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
1.0
Nice!
# Measure test-set accuracy for every neighbour count k from 1 to 25.
k_range = range(1, 26)
results = []
for k in k_range:
    # Build and fit a new classifier for each k, then score it on the
    # same held-out records so the results are comparable.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    results.append(metrics.accuracy_score(y_test, y_pred))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(results)
[<matplotlib.lines.Line2D at 0x112db0c10>]
ls data
ls: cannot access data: No such file or directory