In [ ]:
import numpy
import skimage
from matplotlib import pyplot
import pandas
from collections import Counter
from scipy.sparse import lil_matrix
import scipy.sparse.linalg
from sklearn.cluster import KMeans
In [ ]:
## a rank-1 "image": the outer product of a row pattern and a column pattern
rows = numpy.array([1,0,0,1,0])
columns = numpy.array([0,1,1,0,0])
pyplot.imshow(numpy.outer(rows,columns), cmap="gray")
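In [ ]:
## Added sanity check (not in the original notebook): the outer product of two
## vectors always has rank 1, which is why the SVD below can rebuild an image
## as a sum of rank-1 outer products.
numpy.linalg.matrix_rank(numpy.outer(rows, columns))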
In [ ]:
image = skimage.data.camera()
In [ ]:
type(image)
In [ ]:
pyplot.imshow(image, cmap="gray")
In [ ]:
image.shape
In [ ]:
U, S, Vt = numpy.linalg.svd(image)
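In [ ]:
## Quick check (added sketch): multiplying the factors back together,
## U @ diag(S) @ Vt, reconstructs the original image up to floating-point error.
print(U.shape, S.shape, Vt.shape)
numpy.allclose(image, U @ numpy.diag(S) @ Vt)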
In [ ]:
def approx(low, high):
    ## sum the rank-1 outer-product terms for singular components low..high-1
    output = numpy.zeros(image.shape)
    for i in range(low, high):
        output += numpy.outer(U[:,i], S[i] * Vt[i,:])
    return output
In [ ]:
pyplot.imshow(approx(0,1), cmap="gray")
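In [ ]:
## Added for illustration: with more components the approximation sharpens quickly;
## the relative Frobenius error measures how much of the image is still missing.
rank_50 = approx(0, 50)
pyplot.imshow(rank_50, cmap="gray")
print("relative error:", numpy.linalg.norm(image - rank_50) / numpy.linalg.norm(image))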
In [ ]:
## the first 100 left singular vectors (columns of U)
pyplot.imshow(U[:,:100], cmap="gray")
In [ ]:
## the first 100 right singular vectors (rows of Vt), transposed to match U's orientation
pyplot.imshow(Vt[:100,:].T, cmap="gray")
In [ ]:
pyplot.plot(S)
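In [ ]:
## Added sketch: cumulative fraction of the squared Frobenius norm ("energy")
## captured by the first k singular values, showing how quickly the low-rank
## approximation converges.
energy = numpy.cumsum(S ** 2) / numpy.sum(S ** 2)
pyplot.plot(energy)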
In [ ]:
### "Latent Semantic Analysis".
### Do the same thing, but the matrix is the count of words (columns) in 
### documents (rows).

doc_counters = []            ## per-document word counts
corpus_counts = Counter()    ## total count of each word across the corpus

doc_text = []
print("reading")

document_frequency = Counter()   ## number of documents each word appears in

with open("datascience10k.txt", encoding="utf-8") as reader:
    for line in reader:
        fields = line.rstrip().split("\t")
        if len(fields) == 3:
            tag = fields[1]
            tokens = fields[2].lower().split()
            
            doc_counter = Counter(tokens)
            corpus_counts.update(doc_counter)
            document_frequency.update(doc_counter.keys())
            
            doc_counters.append(doc_counter)
            
            doc_text.append(fields[2])
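In [ ]:
## Added sanity check: how many documents were read and which tokens dominate
## the corpus.
print(len(doc_counters), "documents,", len(corpus_counts), "distinct tokens")
corpus_counts.most_common(10)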
In [ ]:
doc_text[25]
In [ ]:
num_docs = len(doc_counters)

## construct a vocabulary list in descending order of corpus count,
## keeping only words that occur more than 5 times
vocabulary = [ w for w, c in corpus_counts.most_common() if c > 5 ]
reverse_vocab = { word:i for (i, word) in enumerate(vocabulary) }
vocab_size = len(vocabulary)

#idf_weights = { word:-numpy.log( document_frequency[word] / num_docs ) for word in vocabulary }
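In [ ]:
## Optional sketch of the IDF weighting hinted at by the commented-out line above:
## words that appear in fewer documents get larger weights. Not used in the cells
## below, but weighted counts could replace the raw counts when filling the
## sparse matrix.
idf_weights = { word: -numpy.log(document_frequency[word] / num_docs) for word in vocabulary }
sorted(idf_weights.items(), key=lambda kv: kv[1])[:10]   ## lowest-IDF (most common) words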
In [ ]:
print("constructing sparse matrix")
doc_word_counts = lil_matrix((num_docs, vocab_size))

for doc_id, doc_counter in enumerate(doc_counters):
    ## keep only the words that made it into the vocabulary
    words = [word for word in doc_counter if word in reverse_vocab]
    word_ids = [reverse_vocab[word] for word in words]
    counts = [doc_counter[word] for word in words]
    
    #weighted_counts = [idf_weights[word] * doc_counter[word] for word in words]
    
    doc_word_counts[doc_id,word_ids] = counts

doc_word_counts = doc_word_counts.tocsr()
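In [ ]:
## Added check: text count matrices are typically very sparse, which is why the
## lil/csr formats are used instead of a dense array.
print(doc_word_counts.shape, "with", doc_word_counts.nnz, "nonzero entries")
print("density:", doc_word_counts.nnz / (num_docs * vocab_size))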
In [ ]:
## Helper functions
def rank_words(x):
    ## pair each score with its vocabulary word, highest score first
    return sorted(zip(x, vocabulary), reverse=True)

def rank_docs(x):
    ## pair each score with its document text, highest score first
    return sorted(zip(x, doc_text), reverse=True)

def l2_norm(matrix):
    ## scale each row of the matrix to unit Euclidean (L2) length
    row_norms = numpy.sqrt(numpy.sum(matrix ** 2, axis = 1))
    return matrix / row_norms[:, numpy.newaxis]
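In [ ]:
## Added note: after l2_norm every row has unit length, so a dot product between
## two rows is exactly their cosine similarity (used for the document and word
## comparisons below). A tiny check on a made-up 2x2 example:
example = l2_norm(numpy.array([[3.0, 4.0], [1.0, 0.0]]))
print(example)
print(numpy.sum(example ** 2, axis=1))   ## squared row norms should all be 1.0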
In [ ]:
print("running SVD")

n_dimensions = 100
doc_vectors, singular_values, word_vectors = \
  scipy.sparse.linalg.svds(doc_word_counts, n_dimensions)

doc_vectors[ doc_vectors.sum(axis=1) == 0 ,:] = numpy.ones(n_dimensions)
doc_vectors = l2_norm(doc_vectors)
word_vectors = l2_norm(word_vectors.T)
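In [ ]:
## Added note: unlike numpy.linalg.svd, scipy.sparse.linalg.svds returns singular
## values in ascending order, so the last columns of doc_vectors and word_vectors
## correspond to the largest singular values (hence the [:,-2] indexing below).
singular_values[:5], singular_values[-5:]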
In [ ]:
pyplot.plot(singular_values)
In [ ]:
similar_docs = rank_docs(doc_vectors.dot(doc_vectors[25,:]))
similar_docs[:5]
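In [ ]:
## Added convenience sketch (hypothetical helper, not in the original): look up
## the n most similar documents for any query document id, using the same
## dot-product scoring as the cell above.
def most_similar_docs(doc_id, n=5):
    scores = doc_vectors.dot(doc_vectors[doc_id, :])
    return rank_docs(scores)[:n]

most_similar_docs(25)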
In [ ]:
## column -2 is the second-largest SVD dimension (svds orders singular values ascending)
sorted_words = rank_words(word_vectors[:,-2])
print(sorted_words[:20])
print(sorted_words[-20:])
In [ ]:
pyplot.imshow(word_vectors[:100,:].T)
In [ ]:
## cluster the SVD vectors of the 500 most frequent words into 25 groups
word_clusters = KMeans(n_clusters=25)
word_clusters.fit(word_vectors[:500,:])
In [ ]:
short_vocab = vocabulary[:500]

for cluster in range(25):
    #print(numpy.sum(word_clusters.labels_ == cluster))   ## cluster size
    cluster_words = [short_vocab[i] for i in range(500) if word_clusters.labels_[i] == cluster]
    print(cluster, " ".join(cluster_words))
    print()
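In [ ]:
## Added check: cluster sizes, to see whether KMeans split the 500 most frequent
## words fairly evenly or collapsed most of them into a few large clusters.
Counter(word_clusters.labels_)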