import numpy as np
import pandas as pd
# Load the Video_Store_2 CSV; its first column becomes the row index.
DATA_URL = "http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/Video_Store_2.csv"
vstable = pd.read_csv(DATA_URL, index_col=0)
vstable.shape  # bare expression: only displayed in an interactive session
vstable.head()

# Shuffle the rows by reindexing on a random permutation of the index labels.
shuffled_index = np.random.permutation(vstable.index)
vs = vstable.reindex(shuffled_index)
vs.head(10)
len(vs)
vs_names = vs.columns.values
vs_names

# Pull the target column out of the shuffled table *before* `vs` is replaced
# by the feature matrix, so targets and features share the same row order.
vs_target = vs["Incidentals"]
feature_columns = ['Gender', 'Income', 'Age', 'Rentals', 'Avg Per Visit', 'Genre']
# get_dummies one-hot encodes the non-numeric columns among the features.
vs = pd.get_dummies(vs[feature_columns])
vs.head(10)
# Positional train/test split: the first `tpercent` of the (already shuffled)
# rows are used for training, the remainder for testing.
tpercent = 0.8
tsize = int(tpercent * len(vs))

vs_train, vs_test = vs[:tsize], vs[tsize:]
print(vs_train.shape)
print(vs_test.shape)
vs_train.head(10)  # bare expression: only displayed in an interactive session
vs_test

# Split the target the same way so rows and labels stay aligned.
vs_target_train, vs_target_test = vs_target[:tsize], vs_target[tsize:]
vs_target_train.head()
vs_target_test
from sklearn import preprocessing

# Min-max normalization. The scaler is fitted on the TRAINING data only and
# the same fitted min/max are then applied to the test data. Re-fitting on
# the test set (as `fit_transform(vs_test)` did) would put train and test on
# different scales and leak test-set statistics into preprocessing.
min_max_scaler = preprocessing.MinMaxScaler()
vs_train_norm = min_max_scaler.fit_transform(vs_train)
# NOTE: test values outside the training range map outside [0, 1]; that is
# expected when the training-set scaling is reused.
vs_test_norm = min_max_scaler.transform(vs_test)

np.set_printoptions(precision=2, linewidth=100)
print(vs_train_norm[:10])
print(vs_test_norm[:10])
# Convert the target Series to plain NumPy arrays so they can be indexed
# positionally with arrays of neighbor indices.
vs_target_train, vs_target_test = np.array(vs_target_train), np.array(vs_target_test)

print(vs_target_train)
print("\n")
print(vs_target_test)
def knn_search(x, D, K, measure):
    """Find the K nearest neighbors of an instance x among the instances in D.

    Parameters
    ----------
    x : array-like, shape (n_features,)
        The query instance.
    D : array-like, shape (n_instances, n_features)
        The instances to search, one per row.
    K : int
        Number of neighbors to return.
    measure : int
        0 for Euclidean distance, 1 for cosine distance
        (1 - cosine similarity).

    Returns
    -------
    (idx, dists)
        idx   -- indices into D of the K nearest rows, nearest first
                 (ties are broken by lowest row index).
        dists -- distance from x to EVERY row of D (length n_instances),
                 so callers can look up ``dists[idx]``.

    Raises
    ------
    ValueError
        If ``measure`` is neither 0 nor 1.
    """
    D = np.asarray(D, dtype=float)
    x = np.asarray(x, dtype=float)

    if measure == 0:
        # Euclidean distance from x to each row of D.
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Cosine similarity: dot product of x with each row of D divided by
        # the product of the two vector norms.
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        dots = np.dot(D, x)
        denom = D_norm * x_norm
        # A zero-norm vector has no direction; treat its similarity as 0
        # instead of dividing by zero and propagating NaN.
        sims = np.divide(dots, denom, out=np.zeros_like(dots), where=denom != 0)
        # The distance is the complement of the cosine similarity.
        dists = 1 - sims
    else:
        raise ValueError(
            "measure must be 0 (Euclidean) or 1 (cosine), got %r" % (measure,)
        )

    # Stable sort keeps tie-breaking deterministic (lowest index first).
    idx = np.argsort(dists, kind="stable")
    # Return the indexes of the K nearest neighbors plus all distances.
    return idx[:K], dists
# Demo: take the first normalized test row as the query instance and find
# its 5 nearest training rows using Euclidean distance (measure 0).
from collections import Counter

query = vs_test_norm[0]
neigh_idx, distances = knn_search(query, vs_train_norm, 5, 0)
vs_test.head(1)  # bare expression: only displayed in an interactive session
print(neigh_idx)
print("\nNearest Neigbors:")
vs_train.iloc[neigh_idx]
print(distances[neigh_idx])

# Look at how those neighbors are labeled on the target attribute, and tally
# the labels to see which one is most common.
neigh_labels = vs_target_train[neigh_idx]
print(neigh_labels)
label_votes = Counter(neigh_labels)
print(label_votes)
label_votes.most_common(1)
def knn_classify(x, D, K, labels, measure):
    """Classify x by majority vote among its K nearest neighbors in D.

    x, D, K and measure are passed straight through to ``knn_search``.
    ``labels`` holds the class label of each row of D and is indexed with
    the array of neighbor indices that ``knn_search`` returns.

    Prints the label tally of the neighbors and returns the most common
    label.
    """
    from collections import Counter

    nearest, _ = knn_search(x, D, K, measure)
    votes = Counter(labels[nearest])
    print("Labels for top ", K, "neighbors: ", votes.most_common())
    top_label, _ = votes.most_common(1)[0]
    return top_label
numTestVecs = len(vs_target_test)
print(numTestVecs)

# Classify every test instance with K=5 and report the error rate, first
# with Euclidean distance (measure 0) and then with cosine distance
# (measure 1).
for measure in (0, 1):
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = knn_classify(
            vs_test_norm[i, :], vs_train_norm, 5, vs_target_train, measure
        )
        print("Predicted Label: ", classifierResult, "==> Actual Label: ", vs_target_test[i])
        print()
        mismatch = classifierResult != vs_target_test[i]
        if mismatch:
            errorCount += 1.0
    print("the total error rate is: ", errorCount/float(numTestVecs))