KNN: An Introduction¶

Import common packages¶

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Import & observe the raw dataset¶

dataset source: https://www.kaggle.com/rubenssjr/brasilian-houses-to-rent

In [2]:

df = pd.DataFrame(pd.read_csv('sample_data/houses_to_rent_v2.csv'))
display(df.head())
display(df.columns)
display(df.shape)

# Observe the proportion of each class in the dataset
display(100 * (df.groupby('city').count() / int(len(df.city))))

# Observe the count for each class in the dataset
display(df.groupby('city').count()) 

	city	area	rooms	bathroom	parking spaces	floor	animal	furniture	hoa (R$)	rent amount (R$)	property tax (R$)	fire insurance (R$)	total (R$)
0	São Paulo	70	2	1	1	7	acept	furnished	2065	3300	211	42	5618
1	São Paulo	320	4	4	0	20	acept	not furnished	1200	4960	1750	63	7973
2	Porto Alegre	80	1	1	1	6	acept	not furnished	1000	2800	0	41	3841
3	Porto Alegre	51	2	1	0	2	acept	not furnished	270	1112	22	17	1421
4	São Paulo	25	1	1	0	1	not acept	not furnished	0	800	25	11	836

Index(['city', 'area', 'rooms', 'bathroom', 'parking spaces', 'floor',
       'animal', 'furniture', 'hoa (R$)', 'rent amount (R$)',
       'property tax (R$)', 'fire insurance (R$)', 'total (R$)'],
      dtype='object')

(10692, 13)

	area	rooms	bathroom	parking spaces	floor	animal	furniture	hoa (R$)	rent amount (R$)	property tax (R$)	fire insurance (R$)	total (R$)
city
Belo Horizonte	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806	11.765806
Campinas	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927	7.977927
Porto Alegre	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875	11.157875
Rio de Janeiro	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533	14.038533
São Paulo	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858	55.059858

	area	rooms	bathroom	parking spaces	floor	animal	furniture	hoa (R$)	rent amount (R$)	property tax (R$)	fire insurance (R$)	total (R$)
city
Belo Horizonte	1258	1258	1258	1258	1258	1258	1258	1258	1258	1258	1258	1258
Campinas	853	853	853	853	853	853	853	853	853	853	853	853
Porto Alegre	1193	1193	1193	1193	1193	1193	1193	1193	1193	1193	1193	1193
Rio de Janeiro	1501	1501	1501	1501	1501	1501	1501	1501	1501	1501	1501	1501
São Paulo	5887	5887	5887	5887	5887	5887	5887	5887	5887	5887	5887	5887

Wrangle the dataset¶

In [3]:

# Assign a unique integer for each label (city)
df.city = pd.factorize(df['city'])[0] 

# Convert boolean strings to 1 and 0
df.animal = pd.Series(np.where(df.animal == 'acept', 1, 0))
df.furniture = pd.Series(np.where(df.furniture == 'furnished', 1, 0))

# Fix inconsistency in column and convert the field to a numeric type
df.floor = df[df.floor == '-'] = 0
df.floor = pd.to_numeric(df.floor)

/home/hp-nunes/anaconda3/envs/TEST/lib/python3.7/site-packages/pandas/core/ops/array_ops.py:253: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  res_values = method(rvalues)

In [4]:

# split our dataset between attributes and labels.

X = df.iloc[:, 1:-1].values # Excludes 'city' & 'total(R$)'
y = df.iloc[:, 0].values # Includes 'city' only

In [5]:

display(df.head())
df.dtypes

	city	area	rooms	bathroom	parking spaces	animal	furniture	hoa (R$)	rent amount (R$)	property tax (R$)	fire insurance (R$)	total (R$)
0	0	70	2	1	1	1	1	2065	3300	211	42	5618
1	0	320	4	4	0	1	0	1200	4960	1750	63	7973
2	1	80	1	1	1	1	0	1000	2800	0	41	3841
3	1	51	2	1	0	1	0	270	1112	22	17	1421
4	0	25	1	1	0	0	0	0	800	25	11	836

Out[5]:

city                   int64
area                   int64
rooms                  int64
bathroom               int64
parking spaces         int64
floor                  int64
animal                 int64
furniture              int64
hoa (R$)               int64
rent amount (R$)       int64
property tax (R$)      int64
fire insurance (R$)    int64
total (R$)             int64
dtype: object

Scaling the dataset¶

In [6]:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler() # Scale and transform before splitting into training/testing

X = scaler.fit_transform(X)

Split the dataset into training and testing sets¶

In [7]:

from sklearn.model_selection import train_test_split

# The test size is set to 20% of the whole dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,test_size=0.20)

print(X_train.shape, X_test.shape)

(8553, 11) (2139, 11)

Run the Model¶

In [8]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# Create an instance
knn = KNeighborsClassifier(n_neighbors=6)

# Train the algorithm
model=knn.fit(X_train, y_train)

# Predict the classes on the testing set
y_pred = model.predict(X_test)

# Get the accuracy score on the testing set
print(metrics.accuracy_score(y_test, y_pred))

0.6161757830762038

Look at the classification report¶

In [9]:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred)) # Precision should be large, but recall should be "smaller"

              precision    recall  f1-score   support

           0       0.68      0.87      0.76      1198
           1       0.55      0.41      0.47       238
           2       0.45      0.28      0.35       299
           3       0.38      0.25      0.30       159
           4       0.40      0.20      0.27       245

    accuracy                           0.62      2139
   macro avg       0.49      0.40      0.43      2139
weighted avg       0.58      0.62      0.58      2139

Run the model over a range of K¶

In [10]:

#Choose how many neighbors to test
k_range = range(1,300)

#Create a list to store scores
scores=[]
error = []

#Run the KNN
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred)) # Append the accuracy score
    error.append(np.mean(y_pred != y_test)) # Append the error rate

#Print the scores
print(scores)

[0.5885928003740065, 0.6194483403459561, 0.6068256194483403, 0.6115007012622721, 0.6091631603553063, 0.6161757830762038, 0.6119682094436653, 0.6231884057971014, 0.6147732585320244, 0.6152407667134174, 0.6161757830762038, 0.6157082748948106, 0.6105656848994857, 0.6110331930808789, 0.605890603085554, 0.6133707339878448, 0.6115007012622721, 0.6105656848994857, 0.6077606358111267, 0.6044880785413744, 0.5993454885460495, 0.6044880785413744, 0.603085553997195, 0.6040205703599812, 0.6044880785413744, 0.6068256194483403, 0.6016830294530154, 0.6012155212716223, 0.6012155212716223, 0.6021505376344086, 0.6054230949041608, 0.6026180458158018, 0.6012155212716223, 0.5993454885460495, 0.5965404394576905, 0.5960729312762973, 0.5974754558204769, 0.5970079476390837, 0.5970079476390837, 0.5984104721832632, 0.5988779803646563, 0.5979429640018701, 0.5965404394576905, 0.5988779803646563, 0.5998129967274427, 0.5998129967274427, 0.5984104721832632, 0.5965404394576905, 0.595137914913511, 0.5956054230949042, 0.5956054230949042, 0.5928003740065451, 0.5913978494623656, 0.5899953249181861, 0.5899953249181861, 0.5909303412809724, 0.5885928003740065, 0.5895278167367929, 0.5881252921926133, 0.5867227676484339, 0.5862552594670407, 0.5853202431042543, 0.587190275829827, 0.5857877512856475, 0.5867227676484339, 0.5853202431042543, 0.5881252921926133, 0.5867227676484339, 0.5890603085553997, 0.5895278167367929, 0.5881252921926133, 0.5876577840112202, 0.5881252921926133, 0.5899953249181861, 0.5876577840112202, 0.5867227676484339, 0.5853202431042543, 0.5853202431042543, 0.5862552594670407, 0.5857877512856475, 0.5867227676484339, 0.584385226741468, 0.5834502103786816, 0.5825151940158952, 0.5825151940158952, 0.5839177185600748, 0.5825151940158952, 0.5829827021972884, 0.5834502103786816, 0.5853202431042543, 0.5848527349228612, 0.5853202431042543, 0.5848527349228612, 0.584385226741468, 0.5839177185600748, 0.584385226741468, 0.5848527349228612, 0.5853202431042543, 0.5848527349228612, 0.5848527349228612, 0.584385226741468, 0.5848527349228612, 0.5885928003740065, 0.5890603085553997, 0.5876577840112202, 0.5890603085553997, 0.5853202431042543, 0.5857877512856475, 0.5853202431042543, 0.5834502103786816, 0.5815801776531089, 0.5825151940158952, 0.5815801776531089, 0.5829827021972884, 0.5806451612903226, 0.5801776531089294, 0.5820476858345021, 0.5801776531089294, 0.5797101449275363, 0.5792426367461431, 0.5787751285647499, 0.5783076203833567, 0.5759700794763908, 0.576437587657784, 0.5726975222066386, 0.5731650303880318, 0.5741000467508182, 0.5745675549322113, 0.5797101449275363, 0.5792426367461431, 0.5806451612903226, 0.5797101449275363, 0.5792426367461431, 0.5797101449275363, 0.5825151940158952, 0.5815801776531089, 0.5815801776531089, 0.5815801776531089, 0.5829827021972884, 0.5829827021972884, 0.5778401122019635, 0.5778401122019635, 0.5769050958391772, 0.5787751285647499, 0.5801776531089294, 0.5811126694717158, 0.5792426367461431, 0.5806451612903226, 0.5811126694717158, 0.5787751285647499, 0.5806451612903226, 0.5811126694717158, 0.5801776531089294, 0.5801776531089294, 0.5815801776531089, 0.5806451612903226, 0.5820476858345021, 0.5820476858345021, 0.5820476858345021, 0.5825151940158952, 0.5829827021972884, 0.5815801776531089, 0.5825151940158952, 0.5825151940158952, 0.5815801776531089, 0.5815801776531089, 0.5820476858345021, 0.5815801776531089, 0.5811126694717158, 0.5801776531089294, 0.5797101449275363, 0.5820476858345021, 0.5787751285647499, 0.5797101449275363, 0.5797101449275363, 0.5801776531089294, 0.5801776531089294, 0.5806451612903226, 0.5806451612903226, 0.5815801776531089, 0.5820476858345021, 0.5820476858345021, 0.5811126694717158, 0.5820476858345021, 0.5820476858345021, 0.5811126694717158, 0.5825151940158952, 0.5825151940158952, 0.5829827021972884, 0.5820476858345021, 0.5820476858345021, 0.5820476858345021, 0.5815801776531089, 0.5806451612903226, 0.5815801776531089, 0.5811126694717158, 0.5806451612903226, 0.5811126694717158, 0.5820476858345021, 0.5815801776531089, 0.5806451612903226, 0.5806451612903226, 0.5797101449275363, 0.5792426367461431, 0.5792426367461431, 0.5797101449275363, 0.5792426367461431, 0.5783076203833567, 0.5797101449275363, 0.5792426367461431, 0.5801776531089294, 0.5787751285647499, 0.5783076203833567, 0.5778401122019635, 0.5783076203833567, 0.5787751285647499, 0.5787751285647499, 0.5787751285647499, 0.5792426367461431, 0.5792426367461431, 0.5797101449275363, 0.5792426367461431, 0.5787751285647499, 0.5778401122019635, 0.5801776531089294, 0.5806451612903226, 0.5811126694717158, 0.5811126694717158, 0.5815801776531089, 0.5801776531089294, 0.5792426367461431, 0.5792426367461431, 0.5801776531089294, 0.5797101449275363, 0.5801776531089294, 0.5787751285647499, 0.5759700794763908, 0.5759700794763908, 0.573632538569425, 0.5773726040205703, 0.5741000467508182, 0.5778401122019635, 0.5787751285647499, 0.5778401122019635, 0.5750350631136045, 0.5750350631136045, 0.576437587657784, 0.5759700794763908, 0.5755025712949977, 0.5759700794763908, 0.5745675549322113, 0.5745675549322113, 0.5741000467508182, 0.5741000467508182, 0.5741000467508182, 0.5745675549322113, 0.5750350631136045, 0.5755025712949977, 0.5745675549322113, 0.5726975222066386, 0.5717625058438522, 0.5708274894810659, 0.5698924731182796, 0.5703599812996727, 0.5708274894810659, 0.5684899485741001, 0.5703599812996727, 0.5698924731182796, 0.5694249649368864, 0.5689574567554933, 0.5680224403927069, 0.5661524076671341, 0.5661524076671341, 0.5652173913043478, 0.565684899485741, 0.5670874240299205, 0.5666199158485273, 0.5661524076671341, 0.5675549322113137, 0.5670874240299205, 0.5670874240299205, 0.5689574567554933, 0.5675549322113137, 0.565684899485741, 0.5652173913043478, 0.5652173913043478, 0.5670874240299205, 0.5684899485741001, 0.5670874240299205, 0.5703599812996727, 0.5703599812996727, 0.5689574567554933, 0.5661524076671341, 0.562879850397382, 0.5642823749415615, 0.5624123422159888, 0.562879850397382, 0.5624123422159888, 0.5619448340345956]

Plot the Accuracy result over instances of K¶

In [15]:

plt.figure(figsize=(50,20))

plt.plot(k_range, scores)
plt.axis([1,25,min(scores),1])
plt.xticks(k_range)
plt.yticks(fontsize=30)
plt.xlabel('Value of K',fontsize=35)
plt.ylabel('Test accuracy',fontsize=35)
plt.xlim(0, 300, 10)
plt.ylim(0.56, .63)
plt.show()

Plot the error rate over instances of K¶

In [16]:

plt.figure(figsize=(12, 6))
plt.plot(range(1, 300), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

Out[16]:

Text(0, 0.5, 'Mean Error')

Suggestion: Test for Class Imbalance¶

Compare the model's performance between one single label for the predominating class (i.e. city = 'Sao Paulo') versus all other classes minus the majority class.

Boilerplate wrangling¶

In [17]:

df = pd.DataFrame(pd.read_csv('sample_data/houses_to_rent_v2.csv'))

df.animal = pd.Series(np.where(df.animal == 'acept', 1, 0))
df.furniture = pd.Series(np.where(df.furniture == 'furnished', 1, 0))
df.floor = df[df.floor == '-'] = 0
df.floor = pd.to_numeric(df.floor)

/home/hp-nunes/anaconda3/envs/TEST/lib/python3.7/site-packages/pandas/core/ops/array_ops.py:253: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  res_values = method(rvalues)

In [18]:

df.city = pd.Series(np.where(df.city == 'São Paulo',1,0))

# # split our dataset into its attributes and labels.
X = df.iloc[:, 1:-1].values # Excludes 'city'
y = df.iloc[:, 0].values # Includes 'city' only

In [19]:

from sklearn.preprocessing import StandardScaler # Scale and transform before splitting into training/testing

scaler = StandardScaler()
X = scaler.fit_transform(X)

Test on Majority class label¶

In [20]:

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0,test_size=0.20)
print(X_train.shape, X_test.shape)

(8553, 11) (2139, 11)

In [21]:

#Create an instance
knn = KNeighborsClassifier(n_neighbors=6)
#Train the algorithm
model=knn.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

0.68630201028518

In [22]:

#Choose how many neighbors to test
k_range = range(1,300)

#Create a list to store scores
scores=[]
error = []

#Run the KNN
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    scores.append(metrics.accuracy_score(y_test, y_pred)) # Append the accuracy score
    error.append(np.mean(y_pred != y_test)) # Append the error rate

#Print the scores
print(scores)

[0.689107059373539, 0.6418887330528285, 0.6834969611968209, 0.6708742402992053, 0.7017297802711547, 0.68630201028518, 0.7073398784478728, 0.6947171575502571, 0.7101449275362319, 0.7003272557269752, 0.7031323048153343, 0.6937821411874708, 0.6975222066386162, 0.697054698457223, 0.697054698457223, 0.6919121084618981, 0.6984572230014026, 0.6956521739130435, 0.6965871902758298, 0.6993922393641889, 0.6919121084618981, 0.6965871902758298, 0.6993922393641889, 0.6993922393641889, 0.6993922393641889, 0.6989247311827957, 0.6984572230014026, 0.7017297802711547, 0.7012622720897616, 0.6961196820944366, 0.6965871902758298, 0.6984572230014026, 0.6965871902758298, 0.7012622720897616, 0.6975222066386162, 0.6956521739130435, 0.6951846657316503, 0.6942496493688639, 0.6965871902758298, 0.697054698457223, 0.6993922393641889, 0.6984572230014026, 0.7007947639083684, 0.7007947639083684, 0.6989247311827957, 0.697054698457223, 0.6961196820944366, 0.6942496493688639, 0.6923796166432913, 0.6961196820944366, 0.6937821411874708, 0.6933146330060777, 0.6965871902758298, 0.6961196820944366, 0.6989247311827957, 0.6947171575502571, 0.6928471248246845, 0.6933146330060777, 0.6919121084618981, 0.6909770920991117, 0.6933146330060777, 0.689107059373539, 0.6872370266479664, 0.6858345021037868, 0.689107059373539, 0.6900420757363254, 0.6895745675549322, 0.6881720430107527, 0.68630201028518, 0.6905095839177186, 0.6867695184665732, 0.6886395511921458, 0.6872370266479664, 0.689107059373539, 0.6877045348293596, 0.6900420757363254, 0.68630201028518, 0.6872370266479664, 0.689107059373539, 0.689107059373539, 0.6867695184665732, 0.6858345021037868, 0.6839644693782141, 0.6867695184665732, 0.6867695184665732, 0.6858345021037868, 0.6848994857410005, 0.6848994857410005, 0.6858345021037868, 0.6858345021037868, 0.6816269284712483, 0.6806919121084619, 0.6853669939223936, 0.6830294530154277, 0.6853669939223936, 0.6834969611968209, 0.6820944366526415, 0.6830294530154277, 0.6825619448340347, 0.6825619448340347, 0.6834969611968209, 0.6806919121084619, 0.6806919121084619, 0.6788218793828892, 0.6792893875642824, 0.6816269284712483, 0.6825619448340347, 0.6825619448340347, 0.6816269284712483, 0.6806919121084619, 0.6806919121084619, 0.6825619448340347, 0.6825619448340347, 0.6858345021037868, 0.6839644693782141, 0.6886395511921458, 0.6858345021037868, 0.6872370266479664, 0.6853669939223936, 0.6877045348293596, 0.6844319775596073, 0.6830294530154277, 0.6811594202898551, 0.6825619448340347, 0.6802244039270687, 0.6811594202898551, 0.6806919121084619, 0.6816269284712483, 0.6820944366526415, 0.6764843384759234, 0.6778868630201028, 0.6746143057503506, 0.6797568957456755, 0.6778868630201028, 0.6778868630201028, 0.6774193548387096, 0.6792893875642824, 0.6778868630201028, 0.6792893875642824, 0.6802244039270687, 0.6820944366526415, 0.6806919121084619, 0.6816269284712483, 0.6806919121084619, 0.6830294530154277, 0.6839644693782141, 0.6811594202898551, 0.6825619448340347, 0.6834969611968209, 0.6830294530154277, 0.6806919121084619, 0.675549322113137, 0.6746143057503506, 0.6750818139317438, 0.675549322113137, 0.6736792893875643, 0.6727442730247779, 0.6760168302945302, 0.6750818139317438, 0.675549322113137, 0.6746143057503506, 0.6736792893875643, 0.6732117812061711, 0.6732117812061711, 0.6718092566619915, 0.6732117812061711, 0.6732117812061711, 0.6718092566619915, 0.6713417484805985, 0.6694717157550257, 0.6736792893875643, 0.6722767648433847, 0.6750818139317438, 0.6746143057503506, 0.6746143057503506, 0.6769518466573166, 0.6746143057503506, 0.6769518466573166, 0.6774193548387096, 0.6802244039270687, 0.6769518466573166, 0.6769518466573166, 0.6778868630201028, 0.6769518466573166, 0.6760168302945302, 0.6764843384759234, 0.6760168302945302, 0.6741467975689575, 0.6732117812061711, 0.6732117812061711, 0.6718092566619915, 0.6722767648433847, 0.6722767648433847, 0.6713417484805985, 0.6727442730247779, 0.6718092566619915, 0.6732117812061711, 0.6727442730247779, 0.6718092566619915, 0.6704067321178121, 0.6727442730247779, 0.6722767648433847, 0.6708742402992053, 0.6685366993922394, 0.667601683029453, 0.6694717157550257, 0.6732117812061711, 0.6704067321178121, 0.6736792893875643, 0.6732117812061711, 0.6746143057503506, 0.6736792893875643, 0.6736792893875643, 0.6741467975689575, 0.675549322113137, 0.6708742402992053, 0.6736792893875643, 0.6718092566619915, 0.6732117812061711, 0.6694717157550257, 0.6722767648433847, 0.6694717157550257, 0.6736792893875643, 0.6741467975689575, 0.6736792893875643, 0.6741467975689575, 0.6708742402992053, 0.6704067321178121, 0.6718092566619915, 0.6718092566619915, 0.6741467975689575, 0.6694717157550257, 0.6760168302945302, 0.6699392239364189, 0.6736792893875643, 0.6690042075736325, 0.6741467975689575, 0.6708742402992053, 0.6769518466573166, 0.6708742402992053, 0.6764843384759234, 0.6699392239364189, 0.6746143057503506, 0.6708742402992053, 0.6746143057503506, 0.6732117812061711, 0.6778868630201028, 0.6760168302945302, 0.675549322113137, 0.6746143057503506, 0.6699392239364189, 0.6690042075736325, 0.6680691912108462, 0.6680691912108462, 0.6661991584852734, 0.6680691912108462, 0.6699392239364189, 0.6708742402992053, 0.6699392239364189, 0.6727442730247779, 0.667601683029453, 0.6690042075736325, 0.6680691912108462, 0.6708742402992053, 0.6699392239364189, 0.6704067321178121, 0.6694717157550257, 0.6708742402992053, 0.6704067321178121, 0.6713417484805985, 0.6713417484805985, 0.6694717157550257, 0.6708742402992053, 0.6732117812061711, 0.6722767648433847, 0.6736792893875643, 0.6722767648433847, 0.6732117812061711, 0.6727442730247779, 0.6722767648433847, 0.6736792893875643, 0.6736792893875643, 0.6732117812061711, 0.6746143057503506, 0.6741467975689575, 0.675549322113137, 0.6774193548387096, 0.6764843384759234, 0.6764843384759234, 0.6764843384759234, 0.6774193548387096, 0.6764843384759234, 0.6774193548387096, 0.6760168302945302, 0.6736792893875643, 0.6746143057503506, 0.6746143057503506, 0.6741467975689575, 0.6750818139317438]

In [23]:

from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred)) 

[[479 462]
 [233 965]]
              precision    recall  f1-score   support

           0       0.67      0.51      0.58       941
           1       0.68      0.81      0.74      1198

    accuracy                           0.68      2139
   macro avg       0.67      0.66      0.66      2139
weighted avg       0.67      0.68      0.67      2139

In [24]:

plt.figure(figsize=(50,20))

plt.plot(k_range, scores)
plt.axis([1,25,min(scores),1])
plt.xticks(k_range)
plt.yticks(fontsize=30)
plt.xlabel('Value of K',fontsize=35)
plt.ylabel('Test accuracy',fontsize=35)
plt.xlim(0, 300, 10)
plt.ylim(0.64, .72)
plt.show()

In [25]:

plt.figure(figsize=(12, 6))
plt.plot(range(1, 300), error, color='red', linestyle='dashed', marker='o',
         markerfacecolor='blue', markersize=10)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

Out[25]:

Text(0, 0.5, 'Mean Error')