import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Load the raw table; header=None tells pandas not to treat the first row as column names
Data = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/term-doc-mat.csv", header=None)
Data
# Let's remove the column containing the terms
# TD will be our term x document matrix
TD = Data.iloc[:,1:]
TD
# Reindex the columns to start from 0.
# The column count is taken from the data itself so this keeps working
# if the CSV has a different number of documents.
TD.columns = range(TD.shape[1])
TD
# The list of our index terms (first column of the raw table)
terms = Data.iloc[:,0]
terms
TD.shape
# Rows are terms, columns are documents
numTerms = TD.shape[0]
NDocs = TD.shape[1]
print(numTerms)
print(NDocs)
# Sum each term's entries across all documents
termFreqs = TD.sum(axis=1)
termFreqs
# Plot the per-term totals from largest to smallest
plt.plot(sorted(termFreqs, reverse=True))
plt.show()
# Document frequency (df) of a term t: the number of docs containing t divided by the total number of docs.
# Step one is the raw count of documents in which each term has a non-zero entry,
# kept as a single-column frame (one row per term).
DF = (TD != 0).sum(axis=1).to_frame()
DF
# A float matrix shaped like TD in which every entry equals NDocs
NMatrix = np.full(TD.shape, NDocs, dtype=float)
np.set_printoptions(precision=2, suppress=True, linewidth=120)
print(NMatrix)
# Turn every entry into an IDF value: log2 of (NDocs / document count of the term).
# IDF depends only on the term, so the single DF column is broadcast across
# all document columns and every column of IDF comes out identical.
IDF = np.log2(NMatrix / DF.values)
print(IDF)
# TFxIDF weight for each term-document entry (element-wise product)
TD_tfidf = TD * IDF
pd.set_option("display.precision", 2)
TD_tfidf
def knn_search(x, D, K, measure):
    """Find the K nearest neighbors of an instance x among the instances in D.

    Parameters
    ----------
    x : array-like, shape (n_features,)
        The query instance.
    D : array-like, shape (n_instances, n_features)
        Candidate instances, one per row.
    K : int
        Number of neighbors to return.
    measure : int
        0 for Euclidean distance, 1 for cosine distance (1 - cosine similarity).

    Returns
    -------
    tuple
        ``(idx, dists)`` where ``idx`` holds the row indexes of the K closest
        instances in ascending order of distance, and ``dists`` holds the
        distance from x to every row of D (in the original row order).

    Raises
    ------
    ValueError
        If ``measure`` is neither 0 nor 1.
    """
    D = np.asarray(D, dtype=float)
    x = np.asarray(x, dtype=float)

    if measure == 0:
        # Euclidean distance from x to every row of D
        dists = np.sqrt(((D - x) ** 2).sum(axis=1))
    elif measure == 1:
        # Norm of every row of D in one vectorized call, plus the norm of x
        D_norm = np.linalg.norm(D, axis=1)
        x_norm = np.linalg.norm(x)
        denom = D_norm * x_norm
        # Cosine is undefined when either vector has zero length; treat those
        # rows as having similarity 0 instead of producing NaN and a warning.
        sims = np.zeros(len(D))
        defined = denom > 0
        sims[defined] = np.dot(D[defined], x) / denom[defined]
        # The distance measure is one minus the cosine similarity
        dists = 1 - sims
    else:
        raise ValueError(
            "measure must be 0 (Euclidean) or 1 (cosine), got %r" % (measure,)
        )

    # A stable sort keeps tied instances in their original row order
    idx = np.argsort(dists, kind="stable")
    # Return the indexes of the K nearest neighbors and all distances
    return idx[:K], dists
# Raw term counts of the query document, one entry per index term
x = np.array([3, 22, 0, 17, 9, 6, 1, 12, 0, 22])
# Weight each query term by the IDF value computed earlier for that term.
# Every column of the IDF matrix is the same, so its first column serves as the IDF vector;
# the product below is coordinate-wise.
x_tfidf = x * IDF[:, 0]
print(x_tfidf)
# knn_search expects a document x term matrix as an np array, so transpose the TD_tfidf matrix first
DT_tfidf = TD_tfidf.T
DT_array = np.array(DT_tfidf)
# Find the K=5 nearest neighbors, with one minus the cosine similarity as the distance (measure=1)
neigh_idx, distances = knn_search(x_tfidf, DT_array, K=5, measure=1)
# Attach the document index to the distances so they can be inspected in sorted order
distances = pd.Series(distances, index=DT_tfidf.index)
distances.sort_values()
print("Query:", x)
print("\nNeighbors:")
DT_tfidf.iloc[neigh_idx]
# Attach a category label to every document: three categories, five consecutive documents each
cat_labels = pd.Series(
    np.array(["Databases"] * 5 + ["Regression"] * 5 + ["Information Retrieval"] * 5),
    index=DT_tfidf.index,
)
DT_tfidf["Category"] = cat_labels
DT_tfidf
def knn_classify(x, D, K, labels, measure):
    """Predict the label of x by majority vote among its K nearest neighbors in D.

    Parameters
    ----------
    x : array-like
        The instance to classify.
    D : array-like
        Candidate instances, one per row; passed through to ``knn_search``.
    K : int
        Number of neighbors that vote.
    labels : array-like
        Label of each row of D, in row order. A list, NumPy array or pandas
        Series is accepted.
    measure : int
        Distance measure selector passed through to ``knn_search``.

    Returns
    -------
    The most common label among the K nearest neighbors. The vote counts are
    also printed.
    """
    from collections import Counter

    neigh_idx, _ = knn_search(x, D, K, measure)
    # neigh_idx holds row positions, so look labels up by position. Indexing a
    # pandas Series directly would match on its index labels instead, which is
    # only correct when the index happens to be 0..n-1.
    neigh_labels = np.asarray(labels)[neigh_idx].tolist()
    count = Counter(neigh_labels)
    print("Labels for top ", K, "neighbors: ", count)
    return count.most_common(1)[0][0]
# Show the raw query counts, then classify its TFxIDF-weighted form by a
# vote among the 5 nearest documents using measure=1 (cosine-based distance)
print("Instance to classify:\n", x)
print(
    "Predicted Category for the new instance: ",
    knn_classify(x_tfidf, DT_array, K=5, labels=cat_labels, measure=1),
)