In-class exercise implementing parameters estimation via Expectation-Maximization.
$z_{dn} \sim \text{Categorical}(\theta_d)$
$x_{dn}|z_{dn} \sim \text{Categorical}(\beta_{z_{dn}})$
Suppose we have some data; we'll work with 20newgroups, which you will see again in homeworks!
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'soc.religion.christian']
D = fetch_20newsgroups(subset='train', shuffle=True,
categories=categories,
remove=('headers', 'footers', 'quotes'))
D = D["data"]
print(D[0][:500])
Library of Congress to Host Dead Sea Scroll Symposium April 21-22 To: National and Assignment desks, Daybook Editor Contact: John Sullivan, 202-707-9216, or Lucy Suddreth, 202-707-9191 both of the Library of Congress WASHINGTON, April 19 -- A symposium on the Dead Sea Scrolls will be held at the Library of Congress on Wednesday, April 21, and Thursday, April 22. The two-day program, cosponsored by the library and Baltimore Hebrew University, with additional support from the Pr
# how many documents?
n_docs = len(D)
print(n_docs)
1079
# how many words?
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features=5000) # limit to 5k features for now
X = vect.fit_transform(D)
X.shape
(1079, 5000)
v = X.shape[1]
n_topics = 3 # hyperparameter
import numpy as np
# Initialize variables: theta, beta.
# what are our shapes?
theta = np.zeros((n_docs, n_topics)) # topic probabilities per document
beta = np.zeros((v, n_topics)) # word probabilities per topic
phi = np.zeros((n_docs, v, n_topics)) # word-topic assignment probabilities
Aside: How should we initialize these variables?? What do we want? Could use random
values, but what are the potential issues?
Idea: Use a Dirichlet
distribution. Why? Because it will provide row-vectors that sum to 1, and we can control how peaky these are easily. We'll return to this in a bit; for now, just think of it as a convienent way to initialize row vectors that sum to 1.
def init_matrix_rows(m, scale=1.0):
cols = m.shape[1]
for row_idx in range(m.shape[0]):
r = np.random.dirichlet(np.ones(cols)/scale, size=1)
m[row_idx,:] = r
init_matrix_rows(theta, scale=1)
theta[0,:]
array([0.27429133, 0.21086857, 0.51484011])
init_matrix_rows(beta)
# we have an extra dimension for phi
for i in range(n_docs):
init_matrix_rows(phi[i,:,:], scale=.01)
phi[0,0,:]
array([0.37974323, 0.36414193, 0.25611484])
X = X.todense() # convert sparse matrix to dense
X.shape
(1079, 5000)
Pull above together into an initialization method
def init_params(s=0.1):
theta = np.zeros((n_docs, n_topics)) # topic probabilities per document
beta = np.zeros((v, n_topics)) # word probabilities per topic
phi = np.zeros((n_docs, v, n_topics)) # word-topic assignment probabilities
init_matrix_rows(theta, scale=s)
init_matrix_rows(beta, scale=s)
for i in range(n_docs):
init_matrix_rows(phi[i,:,:], scale=s)
$\phi_{dnk}=\frac{\theta_{dk} \beta_{kv}}{\sum_l \theta_{dl} \beta_{lv}}$
# E-Step: Update word-topic assignments
def e_step():
# update \phi (word-topic params), given current parameters
for d in range(n_docs):
for w in range(v):
# \phi is distribution over topics for
# word w in doc d.
z = np.dot(theta[d,:], beta[w,:])
# the top here is an element-wise mult
phi[d,w,:] = (theta[d,:] * beta[w,:]) / z
print("e-step done.")
$\beta_{kv} = \frac{N_{kv}}{\sum_w N_{kw}}$ Where $N_{kv} = \sum_{d,n} \phi_{dnk} x_{nv}$
$\theta_k = \frac{N_k}{\sum_l N_l}$ Where $N_l = \sum_{d,n} \phi_{dnk}$
# M-Step: Update parameters
def m_step():
for k in range(n_topics):
# \beta: word-topic probabilities
# total expectation of words for this topic
N_kz = np.sum(phi[:,:,k])
for w in range(v):
# p(w|t) = [expected count of w in k] / [expected count of k over all words]
beta[w,k] = np.sum(phi[:,w,k]) / N_kz
for d in range(n_docs):
# \theta: document-topic probabilities
# total mass over topics in this doc
z_k = np.sum(phi[d,:,:])
theta[d, k] = np.sum(phi[d,:,k]) / z_k
print("m-step done")
def fit(iters=2):
init_params(s=0.1)
for iter in range(iters):
print(iter)
e_step()
m_step()
fit(iters=10)
0 e-step done. m-step done 1 e-step done. m-step done 2 e-step done. m-step done 3 e-step done. m-step done 4 e-step done. m-step done 5 e-step done. m-step done 6 e-step done. m-step done 7 e-step done. m-step done 8 e-step done. m-step done 9 e-step done. m-step done
e_step()
e-step done.
m_step()
m-step done
theta[3,:]
array([0.32411854, 0.3643046 , 0.31157687])
theta[10,:]
array([0.24457848, 0.30548981, 0.44993172])
Let's inspect the topics. We're particularly interested in $\beta$ parameters, which give topic-word probabilities.
def get_top_words_for_topic(k, m=10):
top_word_indices = np.argsort(beta[:,k])[-m:]
# now retrieve the actual tokens from our vectorizer
top_words = [vect.get_feature_names()[idx] for idx in top_word_indices]
return top_words
np.argsort(beta[1,:])[-20:]
array([2, 1, 0])
np.sort(beta[1,:])[:20]
array([0.00017543, 0.00020041, 0.00022405])
get_top_words_for_topic(0,m=20)
['herod', 'church', 'ezra', 'reasoning', 'perform', 'apocrypha', 'former', 'unlimited', 'open', 'division', 'aimed', 'orthodox', 'some', 'appropriate', 'chooses', 'offended', 'simply', 'cast', 'baltimore', 'british']