import numpy as np
import pylab as pl
import pandas as pd
from sklearn.cluster import KMeans
Data = pd.read_csv("http://facweb.cs.depaul.edu/mobasher/classes/csc478/data/term-doc-mat.csv", header=None)
Data
# Let's remove the column containing the terms
# TD will be our term x document matrix
TD = Data.iloc[:,1:]
TD
# Reindex the columns to start from 0
TD.columns= range(15)
TD
# The list of our index terms
terms = Data.iloc[:,0]
terms
DT = TD.T
DT
numTerms=TD.shape[0]
NDocs = TD.shape[1]
# Note: document frequency (df) for a term t is the number of docs in which t appears
# First, let's find these doc counts for each term
DF = pd.DataFrame([(TD!=0).sum(1)]).T
DF
# Create a matrix with all entries = NDocs
NMatrix=np.ones(np.shape(TD), dtype=float)*NDocs
np.set_printoptions(precision=2,suppress=True,linewidth=120)
print(NMatrix)
# Convert each entry into IDF values
# IDF is the log (base 2) of the total number of documents divided by the document frequency
# Note that IDF is only a function of the term, so all columns will be identical.
IDF = np.log2(np.divide(NMatrix, np.array(DF)))
np.set_printoptions(precision=2,suppress=True)
print(IDF)
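# Optional sanity check (not part of the original workflow): since IDF depends only
# on the term, every column of the IDF matrix should be identical to the first one.
print(np.allclose(IDF, IDF[:, :1]))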
# Finally compute the TFxIDF values for each document-term entry
TD_tfidf = TD * IDF
pd.set_option("display.precision", 2)
TD_tfidf
import kMeans   # local module providing kMeans(), distCosine(), and randCent()
# First transpose to get the doc by term matrix and then call kMeans to cluster the docs
DT_tfidf = TD_tfidf.T
DT_tfidf = np.array(DT_tfidf)
centroids_tfidf, clusters_tfidf = kMeans.kMeans(DT_tfidf, 3, kMeans.distCosine, kMeans.randCent)
pd.options.display.float_format='{:,.2f}'.format
centroids = pd.DataFrame(centroids_tfidf, columns=terms)
centroids
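# For comparison (optional): the sklearn KMeans imported at the top can cluster the
# same tf-idf document vectors. Note it uses Euclidean rather than cosine distance,
# so its cluster assignments may differ from those produced by kMeans.kMeans.
km = KMeans(n_clusters=3, n_init=10, random_state=0)
print(km.fit_predict(DT_tfidf))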
# For comparison, cluster the documents using the raw term counts (no tf-idf weighting)
DT = np.array(DT)
centroids, clusters = kMeans.kMeans(DT, 3, kMeans.distCosine, kMeans.randCent)
centroids = pd.DataFrame(centroids, columns=terms)
centroids
# Let's look at cluster assignments for each of the instances in the data.
print(clusters)
doc_clusters = pd.DataFrame(clusters, columns=["Cluster", "MinDistance**2"])
doc_clusters
from sklearn import decomposition
pca = decomposition.PCA(n_components=5)
DTtrans = pca.fit(DT).transform(DT)
np.set_printoptions(precision=2,suppress=True)
print(DTtrans)
print(pca.explained_variance_ratio_)
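# Optional: cumulative variance captured by the first k principal components.
print(np.cumsum(pca.explained_variance_ratio_))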
centroids_pca, clusters_pca = kMeans.kMeans(DTtrans, 3, kMeans.distCosine, kMeans.randCent)
print(clusters_pca)
# Now repeat PCA manually: center the data, compute the covariance matrix,
# and find its eigenvalues and eigenvectors
meanVals = np.mean(DT, axis=0)
meanRemoved = DT - meanVals #remove mean
covMat = np.cov(meanRemoved, rowvar=0)
np.set_printoptions(precision=2,suppress=True,linewidth=100)
print(covMat)
import numpy.linalg as la
eigVals,eigVects = la.eig(np.mat(covMat))
print(eigVals)
print(eigVects)
eigValInd = np.argsort(eigVals) # sort eigenvalues (argsort goes smallest to largest)
eigValInd = eigValInd[::-1] #reverse
sortedEigVals = eigVals[eigValInd]
print(sortedEigVals)
total = sum(sortedEigVals)
varPercentage = sortedEigVals/total*100
print(varPercentage)
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.figure()
ax = fig.add_subplot(111)
ax.plot(range(1, 11), varPercentage[:10], marker='^')
plt.xlabel('Principal Component Number')
plt.ylabel('Percentage of Variance')
plt.show()
topNfeat = 5
topEigValInd = eigValInd[:topNfeat] #cut off unwanted dimensions
reducedEigVects = eigVects[:,topEigValInd] #reorganize eig vects largest to smallest
reducedDT = np.dot(meanRemoved, reducedEigVects) #transform data into new dimensions
print(reducedDT)
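# Optional cross-check: the manual projection should match sklearn's PCA output from
# above up to the (arbitrary) sign of each component, assuming no repeated eigenvalues.
print(np.allclose(np.abs(reducedDT), np.abs(DTtrans)))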
# Compute the full SVD of the term-document matrix: TD = U . Sigma . Vt
u, s, vt = np.linalg.svd(TD, full_matrices=False)
# Singular vector signs are arbitrary; flip u and vt together (their product is unchanged)
u = -u
print(u)
vt = -vt
print(vt)
print(s)
print(np.diag(s))
# If we use all the dimensions of U.Sigma.Vt, we will get back to original matrix.
originalTD = np.dot(u, np.dot(np.diag(s), vt))
print(originalTD)
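# Optional check: the full-rank reconstruction should match the original term-document
# matrix up to floating point error.
print(np.allclose(originalTD, np.array(TD, dtype=float)))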
# But, the goal of SVD is to use a smaller number of dimensions each of which
# represent a latent variable capturing some combinations of features associated
# with the data (e.g., general themes in the documents).
numDimensions = 3
u_ld = u[:, :numDimensions]
sigma = np.diag(s)[:numDimensions, :numDimensions]
vt_ld = vt[:numDimensions, :]
lowRankTD = np.dot(u_ld, np.dot(sigma, vt_ld))
# The U.Sigma.Vt product in the lower dimensional space gives an approximation of the original matrix
np.set_printoptions(precision=2,suppress=True,linewidth=120)
print(lowRankTD)
print(vt_ld)
# Map a new query (a vector of weights over the original index terms) into the
# lower dimensional latent space: q_k = inv(Sigma_k) . U_k^T . q
queryVector = np.array([0,0,1,5,4,0,6,0,0,2])
lowDimQuery = np.dot(la.inv(sigma), np.dot(u_ld.T, queryVector))
print(lowDimQuery)
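# Optional check on the folding formula: mapping an existing document (e.g., column 0
# of TD) into the latent space with inv(Sigma_k).U_k^T should reproduce its column in vt_ld.
foldedDoc0 = np.dot(la.inv(sigma), np.dot(u_ld.T, np.array(TD)[:, 0]))
print(foldedDoc0)
print(vt_ld[:, 0])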
# Compute cosine similarity between the query and the docs in the lower dimensional space
# First compute the normalized versions of the query and the docs by dividing each vector by its norm
qNorm = lowDimQuery / la.norm(lowDimQuery)
docNorm = np.array([vt_ld[:,i]/la.norm(vt_ld[:,i]) for i in range(len(vt_ld[0]))])
print(docNorm)
# Cosine is now the dot product since the vectors are already normalized
sims = np.dot(qNorm, docNorm.T)
# return indices of the docs in descending order of similarity to the query
simInds = sims.argsort()[::-1]
for i in simInds:
    print("Cosine similarity between Document %d and the query is: %.4f" % (i, sims[i]))
centroids_svd, clusters_svd = kMeans.kMeans(vt_ld.T, 3, kMeans.distCosine, kMeans.randCent)
print(clusters_svd)