from __future__ import division
import graphlab as gl
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab
pyLDAvis.enable_notebook()
# Load the preprocessed Hacker News stories and fit a 100-topic model on the bags of words.
stories_sf = gl.load_sframe("hn_processed.sframe")
bows = stories_sf['bow']
topic_model = gl.topic_model.create(bows, num_topics=100, num_iterations=200)
pyLDAvis.graphlab.prepare(topic_model, bows)
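As an aside, if you want to share the interactive view outside of the notebook, pyLDAvis can also write it out as a standalone HTML page (the filename here is just an example):
vis = pyLDAvis.graphlab.prepare(topic_model, bows)
pyLDAvis.save_html(vis, 'hn_topics_100.html')  # example filename, pick your own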
You can fit more topics, and the topics become more fine-grained. They become difficult to visualize in the intertopic distance map, though.
topic_model150 = gl.topic_model.create(bows, num_topics=150, num_iterations=200)
pyLDAvis.graphlab.prepare(topic_model150, bows)
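If you want a quick, purely textual sense of how the 150-topic fit differs from the 100-topic one, you can print the top words per topic for each model. This is a rough sketch, assuming get_topics() returns an SFrame of (topic, word, score) rows:
# Sketch: print the top words for every topic so the two fits can be eyeballed side by side.
def print_top_words(model, num_words=5):
    topics = model.get_topics(num_words=num_words)  # SFrame with 'topic', 'word', 'score' columns
    for topic_id in sorted(topics['topic'].unique()):
        words = topics[topics['topic'] == topic_id]['word']
        print('topic %d: %s' % (topic_id, ', '.join(words)))

print_top_words(topic_model)     # 100 broader topics
print_top_words(topic_model150)  # 150 finer-grained topics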
import re
# Tokens carry part-of-speech suffixes (e.g. 'code/NOUN'); strip them for display.
pos_re = re.compile(r'/(NOUN|ADJ|VERB|ADV)')
def extract_dists(model, sf=stories_sf):
    # Pull the raw distributions out of the model in the format pyLDAvis expects.
    data = pyLDAvis.graphlab._extract_data(model, sf['bow'])
    # Clean the vocab for display: drop the POS suffixes and replace underscores with spaces.
    vocab = data['vocab'] = [pos_re.sub('', t).replace('_', ' ') for t in data['vocab']]
    vis_data = pyLDAvis.prepare(**data)
    # pyLDAvis reorders topics by size; relabel our matrices to match its 1-based topic ids.
    vis_topic_order = vis_data.topic_order
    new_order = np.array(vis_topic_order) - 1
    topic_ids = range(1, len(new_order) + 1)
    data['topic_term_dists'] = pd.DataFrame(data['topic_term_dists'].T, index=vocab)[new_order]
    data['topic_term_dists'].columns = topic_ids
    data['doc_topic_dists'] = pd.DataFrame(data['doc_topic_dists'], index=sf['title'])[new_order]
    data['doc_topic_dists'].columns = topic_ids
    data['vis'] = vis_data
    return data
model_data = extract_dists(topic_model)
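As a quick sanity check on the reshaped data (purely illustrative):
model_data['topic_term_dists'].shape  # (vocabulary size, number of topics)
model_data['doc_topic_dists'].shape   # (number of stories, number of topics)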
def topics_for(doc_name, doc_dist=model_data['doc_topic_dists']):
    # Topic distribution for a single story title, most probable topics first.
    return doc_dist.ix[doc_name].order(ascending=False)

def _sort_cols(df, cols):
    # For each requested column, return its index labels sorted by descending probability.
    res = df[cols].apply(lambda probs: probs.order(ascending=False).index)
    return res.reset_index(drop=True)
def top_topic_terms(topic_ids, topic_term_dists=model_data['topic_term_dists']):
    # Top terms for each of the given topics, best first.
    return _sort_cols(topic_term_dists, topic_ids)

def top_docs(topic_ids, doc_topic_dists=model_data['doc_topic_dists']):
    # Top story titles for each of the given topics, best first.
    return _sort_cols(doc_topic_dists, topic_ids)

def top_term_topics(term, topic_term_dists=model_data['topic_term_dists']):
    # Topics in which a given term is most probable.
    return topic_term_dists.T[term].order(ascending=False)

def all_top_terms(topic_term_dists=model_data['topic_term_dists']):
    return top_topic_terms(topic_term_dists.columns)

def topic_docs(topic_id, doc_topic_dists=model_data['doc_topic_dists']):
    # Story titles ordered by how much they load on a single topic.
    return doc_topic_dists[topic_id].order(ascending=False)
# The ten topics where the term 'code' is most probable.
code_topics = top_term_topics('code')[0:10]
code_topics
# Keep only the topics where 'code' carries more than 1% of the probability mass.
code_topics = code_topics[code_topics > 0.01]
len(code_topics)
top_docs(code_topics.index).head(5)
# Tighten the cutoff to 2% and look at the top stories again.
code_topics = code_topics[code_topics > 0.02]
top_docs(code_topics.index).head(5)
# Rank every story by how much probability mass it puts on the code-related topics.
docs_ordered_by_code = model_data['doc_topic_dists'][code_topics.index].sum(axis=1).order(ascending=False)
def fit_focused_model(ordered_docs, num_topics, num_iters=100, threshold=0.1):
    # Keep only the stories that load on the selected topics above the threshold.
    subset = set(ordered_docs[ordered_docs > threshold].index)
    print('Keeping %.2f%% of the corpus...' % (100 * (len(subset) / len(ordered_docs))))
    # I should have kept the doc index around, oh well..
    stories_subset = stories_sf[stories_sf['title'].apply(lambda t: t in subset)]
    bows = stories_subset['bow']
    print('Fitting model...')
    tm = gl.topic_model.create(bows, num_topics, num_iterations=num_iters)
    print('Creating vis data...')
    data = extract_dists(tm, stories_subset)
    data['model'] = tm
    return data
code_model = fit_focused_model(docs_ordered_by_code, 40, num_iters=500, threshold=0.25)
code_model['vis']
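Since fit_focused_model returns the same structure as extract_dists, the earlier helpers also work on the focused model if you point them at its DataFrames, for example:
# Top terms for every topic of the focused code model.
top_topic_terms(code_model['topic_term_dists'].columns,
                topic_term_dists=code_model['topic_term_dists']).head(5)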
top_topics = topics_for('Game written by 14 year old passes Angry Birds as the top free iphone app').head(4)
top_topics
Without LDAvis, you would then look at the top words for those topics, something like this:
top_topic_terms(top_topics.index)[0:5]
To look at all of the topics, you are reduced to scanning a wall of words or tables:
all_top_terms().head(5)