In [1]:

```
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
```

In [2]:

```
# Load the raw data straight from the course site (needs network access).
# header=None: the first row is read as data and the columns get integer labels.
DF = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/term-doc-mat.csv", header=None)
DF
```

Out[2]:

In [3]:

```
# TD will be the term x document matrix: every column of DF except the first
# (the first column is pulled out separately as the list of index terms).
TD = DF.iloc[:,1:]
TD
```

Out[3]:

In [4]:

```
# Relabel the document columns so they start from 0.
# The count comes from TD itself instead of a hard-coded 15, so this cell
# keeps working (rather than raising a length-mismatch error) if the input
# file has a different number of documents.
TD.columns = range(TD.shape[1])
TD
```

Out[4]:

In [5]:

```
# The list of our index terms: the first column of the raw frame,
# in the same row order as TD.
terms = DF.iloc[:,0]
terms
```

Out[5]:

In [6]:

```
# Document x term view: transpose TD so each row is a document.
DT = TD.transpose()
```

In [7]:

```
# Display the transposed (document x term) matrix.
DT
```

Out[7]:

In [8]:

```
# (rows, columns) of DT, i.e. (number of documents, number of terms).
DT.shape
```

Out[8]:

In [9]:

```
# DT is document x term, so its shape unpacks as (documents, terms).
NDocs, numTerms = DT.shape
```

In [10]:

```
# Number of terms, then number of documents, one per line.
print(numTerms, NDocs, sep="\n")
```

In [11]:

```
# Total frequency of each term: sum every row of TD across its document columns.
termFreqs = TD.sum(axis="columns")
print(termFreqs)
```

In [12]:

```
# Rank-frequency plot: total term frequencies sorted from largest to smallest.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(sorted(termFreqs, reverse=True))
ax.set_xlabel("Term rank (most frequent first)")
ax.set_ylabel("Total frequency across documents")
ax.set_title("Term frequencies, sorted")
plt.show()
```

In [13]:

```
# NumPy array version of the document x term matrix (one row per document);
# this is the candidate matrix passed to knn_search below.
DTM = np.array(DT)
DTM
```

Out[13]:

In [14]:

```
def knn_search(x, D, K, measure):
    """Find the K nearest neighbors of an instance x among the instances in D.

    Parameters
    ----------
    x : array-like, shape (n_features,)
        The query instance.
    D : array-like, shape (n_instances, n_features)
        The candidate instances, one per row.
    K : int
        Number of nearest neighbors to return.
    measure : int
        0 for Euclidean distance; 1 for cosine distance, defined here as
        ``1 - cosine_similarity``.

    Returns
    -------
    tuple
        ``(idx, dists)`` where ``idx`` holds the row indexes of the K nearest
        instances (nearest first) and ``dists`` holds the distance from x to
        every row of D, in row order (not sorted).

    Raises
    ------
    ValueError
        If ``measure`` is neither 0 nor 1.
    """
    D = np.asarray(D)
    x = np.asarray(x)

    if measure == 0:
        # Euclidean distance from x to every row of D.
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Norm of each row of D (vectorized) and the norm of x.
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        denom = D_norm * x_norm
        # Cosine similarity: dot product divided by the product of the norms.
        # Where a norm is zero the similarity is undefined; it is treated as 0
        # (distance 1) so no NaN values reach the sort below.
        sims = np.divide(
            np.dot(D, x),
            denom,
            out=np.zeros_like(denom, dtype=float),
            where=denom != 0,
        )
        # Distance is one minus the cosine similarity.
        dists = 1 - sims
    else:
        raise ValueError(
            "measure must be 0 (Euclidean) or 1 (cosine), got %r" % (measure,)
        )

    # Row indexes ordered from nearest to farthest.
    idx = np.argsort(dists)
    # Return the indexes of the K nearest neighbors plus all distances.
    return idx[:K], dists
```

In [15]:

```
# Query vector handed to knn_search as x. It has 10 entries, presumably one
# frequency per index term; its length must match DTM's column count.
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
x
```

Out[15]:

In [16]:

```
# Find the k=5 nearest neighbors of x using cosine distance (measure=1),
# which knn_search computes as 1 - cosine similarity.
neigh_idx, distances = knn_search(x, DTM, 5, 1)
```

In [17]:

```
# Row positions in DTM of the 5 nearest neighbors, nearest first.
neigh_idx
```

Out[17]:

In [18]:

```
# Label each distance with the matching row label of DT.
# Note: this rebinds `distances` from the array returned by knn_search to a Series.
distances = pd.Series(distances, index=DT.index)
distances
```

Out[18]:

In [19]:

```
# Show the query vector, then the rows of DT at the neighbor positions
# returned by the cosine search above.
print("Query:", x)
print("\nNeighbors:")
DT.iloc[neigh_idx]
```

Out[19]:

In [20]:

```
# Find the k=5 nearest neighbors of x using Euclidean distance (measure=0).
# This rebinds neigh_idx and distances, replacing the cosine results.
neigh_idx, distances = knn_search(x, DTM, 5, 0)
```

In [21]:

```
# Row positions in DTM of the 5 nearest neighbors by Euclidean distance, nearest first.
print(neigh_idx)
```

In [22]:

```
# Label each Euclidean distance with the matching row label of DT.
# Note: this rebinds `distances` from the array returned by knn_search to a Series.
distances = pd.Series(distances, index=DT.index)
distances
```

Out[22]:

In [23]:

```
# Show the query vector, then the rows of DT at the neighbor positions
# returned by the Euclidean search above.
print("Query:", x)
print("\nNeighbors:")
DT.iloc[neigh_idx]
```

Out[23]: