import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
DF = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/term-doc-mat.csv", header=None)
DF
# TD will be the term x document matrix (terms as rows, documents as columns)
TD = DF.iloc[:,1:]
TD
# Reindex the columns to start from 0
TD.columns = range(15)
TD
# The list of our index terms
terms = DF.iloc[:,0]
terms
# DT is the document x term matrix (documents as rows, terms as columns)
DT = TD.T
DT
DT.shape
numTerms = DT.shape[1]
NDocs = DT.shape[0]
print(numTerms)
print(NDocs)
# Total frequency of each term across all documents
termFreqs = TD.sum(axis=1)
print(termFreqs)
# Rank-frequency plot: terms sorted by decreasing total frequency
plt.plot(sorted(termFreqs, reverse=True))
plt.show()
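# Optional sketch (not in the original analysis): the same rank-frequency curve
# on log-log axes often makes the Zipf-like shape of term frequencies easier to see.
rankedFreqs = sorted(termFreqs, reverse=True)
plt.loglog(range(1, len(rankedFreqs) + 1), rankedFreqs)
plt.xlabel("Term rank")
plt.ylabel("Total term frequency")
plt.show()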
# Convert the document x term DataFrame into a NumPy array for use in the kNN function
DTM = np.array(DT)
DTM
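# Optional sanity check: the array should have one row per document and one
# column per index term, matching NDocs and numTerms computed above.
assert DTM.shape == (NDocs, numTerms)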
def knn_search(x, D, K, measure):
    """Find the K nearest neighbors of an instance x among the instances in D."""
    if measure == 0:
        # Euclidean distances from x to each instance in D
        dists = np.sqrt(((D - x)**2).sum(axis=1))
    elif measure == 1:
        # First find the vector norm for each instance in D as well as the norm of vector x
        D_norm = np.array([np.linalg.norm(D[i]) for i in range(len(D))])
        x_norm = np.linalg.norm(x)
        # Compute cosine similarity: divide the dot product of x and each instance in D
        # by the product of the two norms
        sims = np.dot(D, x) / (D_norm * x_norm)
        # The distance measure is 1 minus the cosine similarity (cosine distance)
        dists = 1 - sims
    idx = np.argsort(dists)  # sort indices by increasing distance
    # Return the indices of the K nearest neighbors along with distances to all instances
    return idx[:K], dists
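# Optional cross-check (assuming SciPy is available): the same neighbor search can
# be expressed with scipy.spatial.distance.cdist; knn_search_scipy below is an
# illustrative alternative, not part of the original notebook.
from scipy.spatial.distance import cdist

def knn_search_scipy(x, D, K, metric="cosine"):
    """Illustrative kNN search using scipy's cdist ('cosine' or 'euclidean')."""
    # cdist expects 2D inputs, so reshape the query vector into a 1-row matrix
    dists = cdist(D, x.reshape(1, -1), metric=metric).ravel()
    idx = np.argsort(dists)
    return idx[:K], dists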
# A query vector with one weight per index term (it must have numTerms entries)
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
x
# Finding the k=5 nearest neighbors using cosine distance (1 - cosine similarity)
neigh_idx, distances = knn_search(x, DTM, 5, 1)
neigh_idx
distances = pd.Series(distances, index=DT.index)
distances
print("Query:", x)
print("\nNeighbors:")
DT.iloc[neigh_idx]
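# A small follow-up: pull out just the distances of the K returned neighbors,
# so each document id lines up with its cosine distance.
distances.iloc[neigh_idx]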
# Finding the k=5 nearest neighbors using the Euclidean distance metric
neigh_idx, distances = knn_search(x, DTM, 5, 0)
print(neigh_idx)
distances = pd.Series(distances, index=DT.index)
distances
print("Query:", x)
print("\nNeighbors:")
DT.iloc[neigh_idx]
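# Optional side-by-side sketch (illustrative only): rerun both measures and
# compare which documents each one returns for the same query.
cos_idx, _ = knn_search(x, DTM, 5, 1)
euc_idx, _ = knn_search(x, DTM, 5, 0)
print("Cosine neighbors:   ", list(DT.index[cos_idx]))
print("Euclidean neighbors:", list(DT.index[euc_idx]))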