import os

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
# NOTE: sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# model_selection provides the same train_test_split.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Record the library versions this session was run with (reproducibility aid).
print(pd.__version__)
print(np.__version__)
0.20.3 1.14.2
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# Data directory, relative to this notebook.
DATA_DIR = "../data"

# Load the glass identification dataset into a DataFrame.
glass_csv = os.path.abspath(os.path.join(DATA_DIR, "day6/glass.csv"))
df = pd.read_csv(glass_csv)

# Inspect the first five rows.
df.head(5)
id | ri | na | mg | al | si | k | ca | ba | fe | class | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1.52101 | 13.64 | 4.49 | 1.10 | 71.78 | 0.06 | 8.75 | 0.0 | 0.0 | 1 |
1 | 2 | 1.51761 | 13.89 | 3.60 | 1.36 | 72.73 | 0.48 | 7.83 | 0.0 | 0.0 | 1 |
2 | 3 | 1.51618 | 13.53 | 3.55 | 1.54 | 72.99 | 0.39 | 7.78 | 0.0 | 0.0 | 1 |
3 | 4 | 1.51766 | 13.21 | 3.69 | 1.29 | 72.61 | 0.57 | 8.22 | 0.0 | 0.0 | 1 |
4 | 5 | 1.51742 | 13.27 | 3.62 | 1.24 | 73.08 | 0.55 | 8.07 | 0.0 | 0.0 | 1 |
# The id column is only a row identifier with no predictive value, so drop it.
# axis=1 selects columns; the default axis=0 would select rows.
df.drop(['id'], axis=1, inplace=True)

# Verify the remaining columns.
df.columns
Index([u'ri', u'na', u'mg', u'al', u'si', u'k', u'ca', u'ba', u'fe', u'class'], dtype='object')
# Count missing values per column to confirm the dataset is complete.
df.isnull().sum(axis=0)
ri 0 na 0 mg 0 al 0 si 0 k 0 ca 0 ba 0 fe 0 class 0 dtype: int64
# List the distinct glass types the classifier must predict.
df["class"].unique()
array([1, 2, 3, 5, 6, 7])
# Features are every column except the last; the label is the final column.
X = df[df.columns[:-1]].values
Y = df[df.columns[-1]].values
def data_split(X, Y, test_size=0.2, random_state=10):
    """Split features and labels into train / validation / test sets.

    A ``test_size`` fraction is first held out as the test set, then the
    same fraction of the remaining training data is held out again for
    validation.  Common practice is a test set of 20%-30% of the data.
    A fixed ``random_state`` makes the shuffle reproducible and avoids
    sequential bias from the ordering of the rows in the source file.

    Parameters
    ----------
    X, Y : array-like
        Feature matrix and label vector, split in lockstep.
    test_size : float, optional
        Fraction held out at each split (default 0.2).
    random_state : int, optional
        Seed for the shuffling (default 10, preserving the original behavior).

    Returns
    -------
    tuple
        (X_train, X_validation, X_test, Y_train, Y_validation, Y_test)
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X_train, Y_train, test_size=test_size, random_state=random_state)
    return X_train, X_validation, X_test, Y_train, Y_validation, Y_test
# Produce the three subsets and sanity-check their sizes.
X_train, X_validation, X_test, Y_train, Y_validation, Y_test = data_split(X, Y)
# print() with a single argument behaves identically on Python 2 and 3.
print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)
(136, 9) (35, 9) (43, 9)
# Candidate neighbour counts to evaluate on the validation set.
n = [5, 6, 7, 8, 9, 10, 11, 12]

maximum = 0
# Initialize n_final up front: previously it was only bound inside the loop,
# so if no candidate ever scored above 0 the later model build would raise
# NameError.  Defaulting to the first candidate keeps it always defined.
n_final = n[0]
for neighbor in n:
    knn = KNeighborsClassifier(n_neighbors=neighbor)
    knn.fit(X_train, Y_train)
    score = knn.score(X_validation, Y_validation)
    # Keep the first k achieving the best validation accuracy so far.
    if score > maximum:
        maximum = score
        n_final = neighbor
        print('Neighbour: {} - Score: {}'.format(neighbor, maximum))
Neighbour: 5 - Score: 0.571428571429 Neighbour: 6 - Score: 0.6 Neighbour: 7 - Score: 0.628571428571
# Retrain with the best neighbour count found above and report
# accuracy on the held-out test set.
knn = KNeighborsClassifier(n_neighbors=n_final)
knn.fit(X_train, Y_train)
knn.score(X_test, Y_test)
0.4883720930232558