#!/usr/bin/env python
# coding: utf-8

# In[2]:

from __future__ import division

import graphlab as gl
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab

pyLDAvis.enable_notebook()


# In[4]:

stories_sf = gl.load_sframe("hn_processed.sframe")
bows = stories_sf['bow']


# In[34]:

topic_model = gl.topic_model.create(bows, num_topics=100, num_iterations=200)


# In[8]:

pyLDAvis.graphlab.prepare(topic_model, bows)


# You can fit more topics and the topics become more fine-grained. They become difficult to visualize in the intertopic map, though.

# In[16]:

topic_model150 = gl.topic_model.create(bows, num_topics=150, num_iterations=200)
pyLDAvis.graphlab.prepare(topic_model150, bows)


# In[17]:

import re

pos_re = re.compile(r'/(NOUN|ADJ|VERB|ADV)')

def extract_dists(model, sf=stories_sf):
    # Pull the fitted distributions out of the GraphLab model and strip the
    # part-of-speech tags from the vocabulary terms.
    data = pyLDAvis.graphlab._extract_data(model, sf['bow'])
    vocab = data['vocab'] = [pos_re.sub('', t).replace('_', ' ') for t in data['vocab']]
    vis_data = pyLDAvis.prepare(**data)
    # pyLDAvis reorders topics by size; reindex the distributions to match its ordering.
    vis_topic_order = vis_data.topic_order
    new_order = np.array(vis_topic_order) - 1
    topic_ids = range(1, len(new_order) + 1)
    data['topic_term_dists'] = pd.DataFrame(data['topic_term_dists'].T, index=vocab)[new_order]
    data['topic_term_dists'].columns = topic_ids
    data['doc_topic_dists'] = pd.DataFrame(data['doc_topic_dists'], index=sf['title'])[new_order]
    data['doc_topic_dists'].columns = topic_ids
    if vis_data:
        data['vis'] = vis_data
    return data


# In[19]:

model_data = extract_dists(topic_model)

def topics_for(doc_name, doc_dist=model_data['doc_topic_dists']):
    return doc_dist.ix[doc_name].order(ascending=False)

def _sort_cols(df, cols):
    res = df[cols].apply(lambda probs: probs.order(ascending=False).index)
    return res.reset_index(drop=True)

def top_topic_terms(topic_ids, topic_term_dists=model_data['topic_term_dists']):
    return _sort_cols(topic_term_dists, topic_ids)

def top_docs(topic_ids, doc_topic_dists=model_data['doc_topic_dists']):
    return _sort_cols(doc_topic_dists, topic_ids)

def top_term_topics(term, topic_term_dists=model_data['topic_term_dists']):
    df = topic_term_dists.T[term].order(ascending=False)
    return df

def all_top_terms(topic_term_dists=model_data['topic_term_dists']):
    return top_topic_terms(topic_term_dists.columns)

def topic_docs(topic_id, doc_topic_dists=model_data['doc_topic_dists']):
    return doc_topic_dists[topic_id].order(ascending=False)


# ## Building a focused model around 'code'

# In[20]:

code_topics = top_term_topics('code')[0:10]
code_topics


# In[21]:

code_topics = code_topics[code_topics > 0.01]


# In[22]:

len(code_topics)


# In[23]:

top_docs(code_topics.index).head(5)


# In[24]:

code_topics = code_topics[code_topics > 0.02]


# In[25]:

top_docs(code_topics.index).head(5)


# In[26]:

docs_ordred_by_code = model_data['doc_topic_dists'][code_topics.index].sum(axis=1).order(ascending=False)
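
# Before committing to a threshold for the focused model, it can help to see how the subset
# size varies with it. This cell is an illustrative sketch added for this write-up; the
# candidate threshold values are arbitrary and were not part of the original run.

# In[ ]:

for t in (0.1, 0.25, 0.5):
    n_kept = (docs_ordred_by_code > t).sum()
    print('threshold %.2f keeps %d of %d stories' % (t, n_kept, len(docs_ordred_by_code)))
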

# In[27]:

def fit_focused_model(ordered_docs, num_topics, num_iters=100, threshold=0.1):
    # Keep only the stories whose combined probability across the selected topics
    # exceeds the threshold, then fit a fresh topic model on that subset.
    subset = set(ordered_docs[ordered_docs > threshold].index)
    print('Keeping %.2f%% of the corpus...' % (100 * (len(subset) / len(ordered_docs))))
    # I should have kept the doc index around, oh well..
    stories_subset = stories_sf[stories_sf['title'].apply(lambda t: t in subset)]
    bows = stories_subset['bow']
    print('Fitting model...')
    tm = gl.topic_model.create(bows, num_topics, num_iterations=num_iters)
    print('Creating vis data...')
    data = extract_dists(tm, stories_subset)
    data['model'] = tm
    return data


# In[206]:

code_model = fit_focused_model(docs_ordred_by_code, 40, num_iters=500, threshold=0.25)


# In[207]:

code_model['vis']


# ## Example of looking at a document

# In[29]:

top_topics = topics_for('Game written by 14 year old passes Angry Birds as the top free iphone app').head(4)
top_topics


# Without LDAvis you would then look at the top words for those topics, something like this:

# In[31]:

top_topic_terms(top_topics.index)[0:5]


# To look at all of the topics you are reduced to looking at a wall of words or tables:

# In[33]:

all_top_terms().head(5)
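
# These tables are exactly the "wall of words" problem pyLDAvis is meant to solve. As a rough
# sketch (the filename here is just an example), the prepared visualisation can also be written
# out as a standalone HTML page for sharing outside the notebook:

# In[ ]:

pyLDAvis.save_html(model_data['vis'], 'hn_topics.html')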