#!/usr/bin/env python
# coding: utf-8

# #### In this example we will look at how to use the K-Nearest-Neighbor algorithm for classification. We will use a modified version of the Video Store data set for this example. We will use the "Incidentals" attribute as the target attribute for classification (the class attribute). The goal is to be able to classify an unseen instance as "Yes" or "No" for "Incidentals", given the labeled training instances.

# In[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# In[2]:

vstable = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store_2.csv", index_col=0)
vstable.shape

# In[3]:

vstable.head()

# In[4]:

vs_names = vstable.columns.values
vs_names

# #### We will be splitting the data into training and test partitions, with the test partition used to evaluate the model's error rate and the training partition used to find the K nearest neighbors. Before splitting the data, we need to do a random reshuffling to make sure the instances are in random order.

# In[5]:

vs = vstable.reindex(np.random.permutation(vstable.index))
vs.head(10)

# In[6]:

len(vs)

# #### The target attribute for classification is "Incidentals". Its values are the class labels (in this case "yes" and "no") corresponding to the instances in the data.

# In[7]:

vs_target = vs.Incidentals

# #### Before we can compute distances, we need to convert the data (excluding the target attribute "Incidentals", which contains the class labels) into standard spreadsheet format, with binary dummy variables created for each categorical attribute.

# In[8]:

vs = pd.get_dummies(vs[['Gender','Income','Age','Rentals','Avg Per Visit','Genre']])
vs.head(10)

# #### To be able to evaluate the accuracy of our predictions, we will split the data into training and test sets. In this case, we will use 80% for training and the remaining 20% for testing. Note that we must also apply the same split to the target attribute.

# In[9]:

tpercent = 0.8
tsize = int(tpercent * len(vs))
vs_train = vs[:tsize]
vs_test = vs[tsize:]

# In[10]:

print(vs_train.shape)
print(vs_test.shape)

# In[11]:

vs_train.head(10)

# In[12]:

vs_test

# #### Splitting the target attribute ("Incidentals") accordingly:

# In[13]:

vs_target_train = vs_target[:tsize]
vs_target_test = vs_target[tsize:]

# In[14]:

vs_target_train.head()

# In[15]:

vs_target_test

# #### Next, we normalize the attributes so that everything is on a [0,1] scale. We could use the normalization functions we developed in earlier examples; in this case, however, we will use the more flexible and robust MinMaxScaler from the preprocessing module of the scikit-learn package. Important note: we fit the scaler on the training portion of the data only, and then use it to transform (normalize) both the training and the test partitions.

# In[16]:

from sklearn import preprocessing

# In[17]:

# Fit the scaler to the training data
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler.fit(vs_train)

# In[18]:

# Use the (already fitted) scaler to transform both the training and test data sets
vs_train_norm = min_max_scaler.transform(vs_train)
vs_test_norm = min_max_scaler.transform(vs_test)

# #### Note that MinMaxScaler returns a NumPy ndarray.

# In[19]:

np.set_printoptions(precision=2, linewidth=100)
print(vs_train_norm[:10])

# In[20]:

print(vs_test_norm[:10])
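# #### As a quick, illustrative sanity check (using only the arrays created above): since the scaler was fit on the training partition alone, the normalized training values fall exactly in the [0, 1] range, while normalized test values may occasionally fall slightly outside it.

# In[ ]:

print("Training range: ", vs_train_norm.min(), "to", vs_train_norm.max())
print("Test range:     ", vs_test_norm.min(), "to", vs_test_norm.max())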
# #### For consistency, we'll also convert the training and test target labels into NumPy arrays.

# In[21]:

vs_target_train = np.array(vs_target_train)
vs_target_test = np.array(vs_target_test)

# In[22]:

print(vs_target_train)
print("\n")
print(vs_target_test)

# #### The following function illustrates how we can perform a K-nearest-neighbor search. It takes an instance x (a row in the test data) to be classified and a data matrix D (assumed to be a 2d NumPy array) as inputs. It also takes K (the desired number of nearest neighbors to be identified) and "measure" as arguments. The "measure" argument allows us to use either Euclidean distance (measure=0) or Cosine distance, i.e. one minus Cosine similarity (measure=1), as the distance function:

# In[23]:

def knn_search(x, D, K, measure):
    """ find K nearest neighbors of an instance x among the instances in D """
    if measure == 0:
        # Euclidean distances from the other points
        dists = np.sqrt(((D - x)**2).sum(axis=1))
    elif measure == 1:
        # first find the vector norm for each instance in D as well as the norm of vector x
        D_norm = np.array([np.linalg.norm(D[i]) for i in range(len(D))])
        x_norm = np.linalg.norm(x)
        # compute Cosine similarity: divide the dot product of x and each instance in D by the product of the two norms
        sims = np.dot(D, x) / (D_norm * x_norm)
        # the distance measure will be one minus the Cosine similarity
        dists = 1 - sims
    idx = np.argsort(dists)  # sort by ascending distance
    # return the indexes of the K nearest neighbors (along with the full array of distances)
    return idx[:K], dists

# #### To test our function, we'll use the first instance in the test data as the input x to the knn_search function and find its K nearest neighbors in the training data.

# In[24]:

# We'll use K = 5 and Euclidean distance for this example
neigh_idx, distances = knn_search(vs_test_norm[0], vs_train_norm, 5, 0)

# In[25]:

# This was the (non-normalized version of the) test instance used
vs_test.head(1)

# In[26]:

# Let's show the indexes of the 5 nearest neighbors and the neighbors themselves in the original training data
print(neigh_idx)
print("\nNearest Neighbors:")
vs_train.iloc[neigh_idx]

# In[27]:

# And here are the distances of the above neighbors to the test instance
print(distances[neigh_idx])

# In[28]:

# Let's see the "Incidentals" labels of the nearest neighbors of the test instance
neigh_labels = vs_target_train[neigh_idx]
print(neigh_labels)

# #### Now that we know the nearest neighbors, we need to find the majority class label among them. The majority class will be the class assigned to the new instance x.

# In[29]:

from collections import Counter
print(Counter(neigh_labels))

# In[30]:

Counter(neigh_labels).most_common(1)

# #### Let's now put everything together into a function that calls our knn_search function and then returns the majority class among the K nearest neighbors of the instance to be classified. This is our "classifier function."

# In[31]:

def knn_classify(x, D, K, labels, measure):
    from collections import Counter
    neigh_idx, distances = knn_search(x, D, K, measure)
    neigh_labels = labels[neigh_idx]
    count = Counter(neigh_labels)
    print("Labels for top ", K, "neighbors: ", count.most_common())
    return count.most_common(1)[0][0]
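# #### For example, we can apply this classifier to the first test instance (the same instance we passed to knn_search above), with K = 5 and Euclidean distance. This is just an illustrative call; the printed labels and predicted class will vary with the random shuffle.

# In[ ]:

knn_classify(vs_test_norm[0], vs_train_norm, 5, vs_target_train, 0)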
# #### We can now use our KNN classifier to evaluate its classification accuracy. Here we will run the classifier on each instance in our test data and compare the predicted class to the actual class for each. We will keep a count of the disagreements, which allows us to compute the final error rate across all test instances.

# In[32]:

numTestVecs = len(vs_target_test)
print(numTestVecs)

# In[33]:

errorCount = 0.0
for i in range(numTestVecs):
    classifierResult = knn_classify(vs_test_norm[i,:], vs_train_norm, 5, vs_target_train, 0)
    print("Predicted Label: ", classifierResult, "==> Actual Label: ", vs_target_test[i])
    print()
    if (classifierResult != vs_target_test[i]):
        errorCount += 1.0

print("Classification Accuracy: ", 1 - (errorCount/float(numTestVecs)))

# #### Let's put this evaluation code into a function that we can easily reuse with different parameters of the KNN classifier. We'll also create a new version of the classifier, without the extraneous output, that returns the indexes of the top K neighbors along with the predicted label.

# In[34]:

def knn_classify(x, D, K, labels, measure):
    from collections import Counter
    neigh_idx, distances = knn_search(x, D, K, measure)
    neigh_labels = labels[neigh_idx]
    count = Counter(neigh_labels)
    # print("Labels for top", K, "neighbors: ", count)
    predicted_label = count.most_common(1)[0][0]
    return neigh_idx, predicted_label

# In[35]:

def knn_evaluate(test, test_labs, train, train_labs, K, measure):
    # Inputs:
    #   test: an array or list of test instances
    #   test_labs: an array or list of class labels for the corresponding test instances in test
    #   train: the training instances
    #   train_labs: class labels for the corresponding training instances in train
    #   K: number of neighbors
    #   measure: 0 = Euclidean distance; 1 = Cosine distance
    T = 0  # no. of correctly classified instances
    F = 0  # no. of incorrectly classified instances
    for i in range(len(test)):
        actual = test_labs[i]
        top_K_neighbors, predicted = knn_classify(test[i], train, K, train_labs, measure)
        if actual == predicted:
            T += 1
        else:
            F += 1
    accuracy = float(T) / float(T + F)
    return accuracy

# In[36]:

# Testing the evaluation function with K = 5 and Euclidean distance on the full test set
accuracy = knn_evaluate(vs_test_norm, vs_target_test, vs_train_norm, vs_target_train, 5, 0)
print("Classification Accuracy: ", accuracy)

# In[41]:

# Let's compare this to the accuracy on the training data itself
accuracy = knn_evaluate(vs_train_norm, vs_target_train, vs_train_norm, vs_target_train, 5, 0)
print("Classification Accuracy (Train): ", accuracy)

# #### Let's repeat this with the distance metric based on Cosine similarity (instead of Euclidean distance):

# In[42]:

# Testing the evaluation function with K = 5 and Cosine distance on the full test set
accuracy = knn_evaluate(vs_test_norm, vs_target_test, vs_train_norm, vs_target_train, 5, 1)
print("Classification Accuracy: ", accuracy)

# In[39]:

# Evaluating test-set accuracy with Euclidean distance across a range of K values
Euclid = []
for K in range(1, 40):
    Euclid.append(knn_evaluate(vs_test_norm, vs_target_test, vs_train_norm, vs_target_train, K, 0))
print(Euclid)

# In[40]:

Ks = list(range(1, 40))
plt.figure(figsize=(10,5))
plt.plot(Ks, Euclid, 'r^--', label='Euclidean Distance')
plt.xlabel('K')
plt.ylabel('Accuracy')
plt.title('KNN Classifier Accuracy')
plt.grid(linestyle='--')
plt.xticks(Ks)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()

# #### A better way to split the data into training and test sets

# In[48]:

from sklearn.model_selection import train_test_split

vs_train2, vs_test2, vs_target_train2, vs_target_test2 = train_test_split(vs, vs_target, test_size=0.2, random_state=44)
print(vs_test2.shape)
print(vs_train2.shape)

# In[49]:

vs_test2

# In[50]:

vs_target_test2

# In[51]:

vs_train2.head(10)

# #### After the split, we should again perform the normalization as before.

# In[ ]:
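# A minimal sketch of that normalization step, following the same pattern as above:
# fit a MinMaxScaler on the new training partition only, then transform both partitions.
# (The variable names used here, e.g. vs_train2_norm, are just illustrative.)
min_max_scaler2 = preprocessing.MinMaxScaler()
min_max_scaler2.fit(vs_train2)
vs_train2_norm = min_max_scaler2.transform(vs_train2)
vs_test2_norm = min_max_scaler2.transform(vs_test2)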