import scipy
import numpy as np
import pandas as pd
from sklearn import feature_extraction, datasets
import numba
# Download the 20 newsgroups corpus and vectorize each document into a
# bag-of-words count vector; words appearing in fewer than 5 documents
# are dropped (min_df=5).
data = datasets.fetch_20newsgroups()
texts = data.data
vectorizer = feature_extraction.text.CountVectorizer(min_df=5)
vectorized_texts = vectorizer.fit_transform(texts)
# bare expression — notebook display of (n_documents, vocabulary_size)
vectorized_texts.shape
(11314, 25941)
Since Numba cannot operate on scipy sparse matrices, we need to encode the documents as dense index arrays instead.
y_indices, x_indices, __ = scipy.sparse.find(vectorized_texts)
def get_text_vector_indices(text_vectors):
    """
    Convert a sparse count matrix (documents x vocabulary) into a dense
    index matrix.

    doc_word_indices[i, j] = w
    means that the j-th word present in document *i* has position *w*
    in the dictionary; unused slots are filled with -1, and every row
    ends with at least one -1 sentinel.

    Fixes vs. the previous version:
    - scipy.sparse.find returns (row, col, value) = (document, word),
      but the old code indexed them the other way round and therefore
      listed documents-per-word instead of words-per-document.
    - the per-document loop rescanned all nonzeros for every document
      (O(n_documents * nnz)); this version fills the matrix with one
      vectorized assignment.
    """
    n_documents = text_vectors.shape[0]
    doc_indices, word_indices, __ = scipy.sparse.find(text_vectors)
    # number of distinct words per document
    doc_lengths = np.bincount(doc_indices, minlength=n_documents)
    max_doc_length = int(doc_lengths.max()) if n_documents else 0
    doc_word_indices = -np.ones((n_documents, max_doc_length + 1), dtype='int32')
    # sort entries by (document, word) so each row is filled in
    # ascending dictionary order
    order = np.lexsort((word_indices, doc_indices))
    sorted_docs = doc_indices[order]
    sorted_words = word_indices[order]
    # position of each entry within its own document's row
    offsets = np.concatenate((np.zeros(1, dtype=np.int64), np.cumsum(doc_lengths)))
    positions = np.arange(len(sorted_docs)) - offsets[sorted_docs]
    doc_word_indices[sorted_docs, positions] = sorted_words
    return doc_word_indices
%%time
text_vector_indices = get_text_vector_indices(vectorized_texts)
CPU times: user 7.06 s, sys: 23.5 ms, total: 7.08 s Wall time: 7.08 s
@numba.jit(nopython=True)
def count_words_in_topics(document_topics, text_vector_indices, topic_word_counts):
    """
    Accumulate per-topic word counts (n^w_z in the paper's notation).

    For every document, add 1 to topic_word_counts[topic, word] for each
    word index in its row; -1 entries mark padding and terminate the row.
    Returns the (mutated) topic_word_counts array.
    """
    n_texts = text_vector_indices.shape[0]
    for doc_idx in range(n_texts):
        topic = document_topics[doc_idx]
        row = text_vector_indices[doc_idx]
        for word_idx in row:
            if word_idx < 0:
                break
            topic_word_counts[topic, word_idx] += 1
    return topic_word_counts
def make_topics(text_vector_indices, num_topics):
    """
    Randomly assign every document to one of *num_topics* topics and
    build the GSDMM count statistics for that assignment.

    Returns (topic_document_counts, topic_sizes, topic_word_counts,
    document_topics) — m_z, n_z, n^w_z and the topic labels z.
    """
    num_texts = text_vector_indices.shape[0]
    num_words = text_vector_indices.max()
    # words per document (rows are padded with -1)
    text_lengths = np.array((text_vector_indices >= 0).sum(axis=1))
    document_topics = np.random.randint(0, num_topics, size=num_texts)
    # m_z: number of documents assigned to each topic
    topic_document_counts = np.bincount(document_topics, minlength=num_topics).astype(float)
    # n_z: total number of words assigned to each topic
    topic_sizes = np.bincount(document_topics, weights=text_lengths, minlength=num_topics)
    # n^w_z: per-topic word occurrence counts
    topic_word_counts = count_words_in_topics(
        document_topics, text_vector_indices, np.zeros((num_topics, num_words + 1)))
    return topic_document_counts, topic_sizes, topic_word_counts, document_topics
text_vector_indices
array([[ 10, 59, 109, ..., -1, -1, -1], [ 13, 17, 68, ..., -1, -1, -1], [2064, 3665, 4191, ..., -1, -1, -1], ..., [ 70, 127, 760, ..., -1, -1, -1], [1908, 2998, 3867, ..., -1, -1, -1], [1830, 4516, 4881, ..., -1, -1, -1]], dtype=int32)
%%time
num_topics = 4
topic_document_counts, topic_sizes, topic_word_counts, document_topics = make_topics(text_vector_indices, num_topics)
CPU times: user 397 ms, sys: 20 ms, total: 417 ms Wall time: 417 ms
# NOTE(review): dead/broken draft — immediately shadowed by the
# redefinition of `sampling_distribution` below.  `num_texts`,
# `num_words` and `i` are not defined in this scope, so calling this
# version would fail; `doc_idx` and `epsilon` are unused.  Kept only as
# notebook history; candidate for removal.
@numba.jit(nopython=True)
def sampling_distribution(doc_idx, topic_document_counts, topic_sizes, topic_word_counts, alpha, beta, epsilon=1e-10):
    num_topics = topic_sizes.shape[0]
    log_topic_document_counts = np.log(topic_document_counts + alpha)
    log_D = np.log(num_texts - 1 + num_topics * alpha)
    log_topic_word_counts = np.log(topic_word_counts + beta).sum(axis=0)
    log_topic_sizes = np.log(topic_sizes + num_words * beta + i - 1)
    return np.exp(log_topic_document_counts - log_D + log_topic_word_counts - log_topic_sizes)
@numba.jit(nopython=True)
def sampling_distribution(doc, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts):
    """
    Sampling distribution over topics for document *doc* — formula (3)
    of the GSDMM paper (Yin & Wang, KDD 2014).

    doc is one row of the dense index matrix: word indices padded with -1.
    NOTE(review): reads the module-level global `text_vector_indices`
    only for the corpus size D; numba binds the global at compile time,
    so recompilation is needed if the corpus changes.
    """
    D = text_vector_indices.shape[0]
    K, V = topic_word_counts.shape
    m_z, n_z, n_z_w = topic_document_counts, topic_sizes, topic_word_counts
    log_p = np.zeros((K,))
    # p_k = N1*N2 / (D1*D2), computed in log space as lN1 - lD1 + lN2 - lD2:
    #   lN1 = log(m_z[k] + alpha)
    #   lD1 = log(D - 1 + K*alpha)                        (same for every k)
    #   lN2 = sum_w log(n_z_w[k, w] + beta)               (w over doc's words)
    #   lD2 = sum_{j=1..|doc|} log(n_z[k] + V*beta + j - 1)
    lD1 = np.log(D - 1 + K * alpha)
    for label in range(K):
        lN1 = np.log(m_z[label] + alpha)
        lN2 = 0.0
        lD2 = 0.0
        doc_size = 0
        for word in doc:
            if word < 0:  # -1 marks end-of-document padding
                break
            lN2 += np.log(n_z_w[label, word] + beta)
            doc_size += 1
        for j in range(1, doc_size + 1):
            lD2 += np.log(n_z[label] + V * beta + j - 1)
        log_p[label] = lN1 - lD1 + lN2 - lD2
    # Shift by the max before exponentiating (standard log-sum-exp trick).
    # FIX: the original computed `log_p - log_p.max() / 2` — a precedence
    # slip.  Any constant shift cancels in the normalization below, so the
    # result is mathematically unchanged, but subtracting the full max is
    # the numerically safe choice.
    log_p = log_p - log_p.max()
    p = np.exp(log_p)
    # normalize the probability vector
    pnorm = p.sum()
    pnorm = pnorm if pnorm > 0 else 1
    return p / pnorm
topic_document_counts.shape
(4,)
topic_sizes.shape
(4,)
topic_word_counts.shape
(4, 11314)
%%timeit
sampling_distribution(text_vector_indices[0], alpha, beta, topic_document_counts, topic_sizes, topic_word_counts)
num_topics = 100
V = text_vector_indices.max()
K = num_topics
D = len(text_vector_indices)
alpha = 0.1
beta = 0.1
topic_document_counts, topic_sizes, topic_word_counts, document_topics = make_topics(text_vector_indices, num_topics)
pd.Series(document_topics).value_counts()
43 141 45 138 64 133 2 133 23 130 ... 59 94 71 91 26 90 14 89 36 87 Length: 100, dtype: int64
@numba.jit(nopython=True)
def update_topic(doc, topic, topic_sizes, topic_document_counts, topic_word_counts, update_int):
    """
    Add (update_int = +1) or remove (update_int = -1) one document's
    contribution to a topic's statistics.

    FIX: topic_sizes is n_z, the number of *words* in the topic (it is
    initialized in make_topics from the document lengths), so it must
    change by +/- the document's length, not by +/-1 as before.
    topic_document_counts (m_z) changes by +/-1 per document.
    """
    topic_document_counts[topic] += update_int
    doc_length = 0
    for w in doc:
        if w < 0:  # -1 marks end-of-document padding
            break
        topic_word_counts[topic][w] += update_int
        doc_length += 1
    topic_sizes[topic] += update_int * doc_length
def step(text_vector_indices, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts, document_topics):
    """
    One Gibbs-sampling sweep over all documents: remove each document
    from its current topic, sample a new topic from the conditional
    distribution, and add the document to the sampled topic.  Mutates
    the count arrays and document_topics in place.
    """
    D, _ = text_vector_indices.shape
    for d in range(D):
        doc = text_vector_indices[d]
        # remove the document's counts from its current topic
        previous_cluster = document_topics[d]
        update_topic(doc, previous_cluster, topic_sizes, topic_document_counts, topic_word_counts, -1)
        # sample a new topic
        p = sampling_distribution(doc, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts)
        new_cluster = np.argmax(np.random.multinomial(1, p))
        document_topics[d] = new_cluster
        # FIX: counts must be added back to the *sampled* topic — the
        # original re-added previous_cluster, leaving every statistic
        # inconsistent with document_topics after the first sweep.
        update_topic(doc, new_cluster, topic_sizes, topic_document_counts, topic_word_counts, 1)
%%time
text_vector_indices = get_text_vector_indices(vectorized_texts)
topic_document_counts, topic_sizes, topic_word_counts, document_topics = make_topics(text_vector_indices, num_topics)
step(text_vector_indices, alpha, beta, topic_document_counts, topic_sizes, topic_word_counts, document_topics)
CPU times: user 11.7 s, sys: 63.7 ms, total: 11.7 s Wall time: 11.7 s
topic_document_counts
array([ 93., 100., 128., 116., 117., 108., 115., 103., 106., 105., 119., 133., 121., 120., 119., 111., 121., 91., 124., 125., 118., 103., 117., 129., 119., 121., 112., 109., 119., 115., 123., 96., 134., 121., 118., 121., 125., 107., 118., 106., 122., 126., 104., 104., 115., 115., 103., 117., 101., 124., 115., 103., 93., 110., 115., 113., 100., 125., 106., 102., 121., 102., 97., 118., 123., 104., 99., 100., 119., 113., 113., 119., 105., 126., 124., 116., 111., 117., 151., 126., 112., 130., 110., 120., 96., 124., 107., 104., 104., 99., 111., 89., 106., 124., 108., 104., 121., 110., 111., 101.])
import gsdmm
mgp = gsdmm.MovieGroupProcess(K=8, alpha=0.1, beta=0.1, n_iters=1)
%%time
y = mgp.fit(texts, vocab_size=V)
In stage 0: transferred 9905 clusters with 8 clusters populated CPU times: user 12min 53s, sys: 54.3 ms, total: 12min 53s Wall time: 12min 53s
(16 * 60 + 53) / 8
126.625