In [2]:
from __future__ import division

import graphlab as gl
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab

# Enable pyLDAvis notebook integration before any visualisation is prepared
# (presumably required for the prepared views below to render inline - confirm).
pyLDAvis.enable_notebook()
In [4]:
# Load the preprocessed stories SFrame from disk; later cells read its 'bow' and 'title' columns.
stories_sf = gl.load_sframe("hn_processed.sframe")
# The 'bow' column is the training input for the topic models below
# (presumably one bag-of-words per story - confirm against the preprocessing step).
bows = stories_sf['bow']
In [34]:
# Fit the baseline topic model on the full corpus: 100 topics, 200 iterations.
# NOTE(review): no random seed is visible here, so topic numbering may differ between runs, and
# this cell's execution count (34) is higher than the cells below it - the outputs shown further
# down may come from an earlier fit. Confirm with Restart & Run All.
topic_model = gl.topic_model.create(bows, num_topics=100, num_iterations=200)
In [8]:
# Prepare the LDAvis view of the 100-topic model; as the last expression it becomes the cell output.
pyLDAvis.graphlab.prepare(topic_model, bows)
Out[8]:

You can fit more topics, and the topics become more fine-grained. They become difficult to visualize in the intertopic map, though.

In [16]:
# Refit on the same corpus with 150 topics (same iteration count) to compare granularity.
topic_model150 = gl.topic_model.create(bows, num_topics=150, num_iterations=200)
# Prepare the LDAvis view of the 150-topic model as the cell output.
pyLDAvis.graphlab.prepare(topic_model150, bows)
Out[16]:
In [17]:
import re
# Matches a '/NOUN', '/ADJ', '/VERB' or '/ADV' marker inside a vocabulary term
# (presumably part-of-speech tags appended during preprocessing - confirm).
# extract_dists uses it to strip those markers from terms before display.
pos_re = re.compile(r'/(NOUN|ADJ|VERB|ADV)')

def extract_dists(model, sf=stories_sf):
    """Collect a topic model's distributions as labelled DataFrames plus LDAvis data.

    Args:
        model: fitted topic model, handed to ``pyLDAvis.graphlab._extract_data``.
        sf: frame the model was fitted on. Its ``'bow'`` column is passed to
            pyLDAvis and its ``'title'`` column labels the documents.
            Defaults to the full ``stories_sf``.

    Returns:
        The dict produced by ``_extract_data`` with these entries replaced or added:
        ``'vocab'`` (cleaned terms), ``'topic_term_dists'`` (DataFrame, terms x topics),
        ``'doc_topic_dists'`` (DataFrame, titles x topics) and ``'vis'`` (the prepared
        pyLDAvis data). Topic columns are renumbered from 1 following
        ``vis.topic_order``, so column ids line up with the visualisation.
    """
    data = pyLDAvis.graphlab._extract_data(model, sf['bow'])
    # Strip the tag markers and turn underscores into spaces so terms read naturally.
    vocab = data['vocab'] = [pos_re.sub('', t).replace('_', ' ') for t in data['vocab']]
    vis_data = pyLDAvis.prepare(**data)

    # Shift the vis topic ids down by one to get column positions in the model's
    # matrices (presumably the vis ids are 1-based - confirm). A plain list is used
    # instead of a numpy array because numpy is never imported in this notebook.
    new_order = [topic - 1 for topic in vis_data.topic_order]
    topic_ids = list(range(1, len(new_order) + 1))

    # Reorder the topic columns to the vis order, then relabel them 1..K.
    topic_term = pd.DataFrame(data['topic_term_dists'].T, index=vocab)[new_order]
    topic_term.columns = topic_ids
    data['topic_term_dists'] = topic_term

    doc_topic = pd.DataFrame(data['doc_topic_dists'], index=sf['title'])[new_order]
    doc_topic.columns = topic_ids
    data['doc_topic_dists'] = doc_topic

    # vis_data was already dereferenced above, so it is always present here.
    data['vis'] = vis_data
    return data
In [19]:
# Distributions and vis data for the 100-topic model. The helper functions below bind these
# frames as default arguments at definition time, so this line must run before they are defined.
model_data = extract_dists(topic_model)

def topics_for(doc_name, doc_dist=model_data['doc_topic_dists']):
    """Return one document's topic probabilities, most probable topic first.

    Args:
        doc_name: row label of the document (the story title in ``model_data``).
        doc_dist: documents x topics DataFrame; defaults to the main model's.

    Returns:
        Series indexed by topic id, sorted by descending probability.
    """
    # `.loc` and `sort_values` replace `.ix` and `Series.order`, both removed from pandas.
    return doc_dist.loc[doc_name].sort_values(ascending=False)

def _sort_cols(df, cols):
    res = df[cols].apply(lambda probs: probs.order(ascending=False).index)
    return res.reset_index(drop=True)

def top_topic_terms(topic_ids, topic_term_dists=model_data['topic_term_dists']):
    """Rank the vocabulary terms within each of the requested topics.

    Args:
        topic_ids: topic column labels to include.
        topic_term_dists: terms x topics DataFrame; defaults to the main model's.

    Returns:
        DataFrame with one column per topic, terms listed from most to least probable.
    """
    ranked_terms = _sort_cols(df=topic_term_dists, cols=topic_ids)
    return ranked_terms

def top_docs(topic_ids, doc_topic_dists=model_data['doc_topic_dists']):
    """Rank the documents within each of the requested topics.

    Args:
        topic_ids: topic column labels to include.
        doc_topic_dists: documents x topics DataFrame; defaults to the main model's.

    Returns:
        DataFrame with one column per topic, document labels listed from the
        highest to the lowest probability for that topic.
    """
    ranked_docs = _sort_cols(df=doc_topic_dists, cols=topic_ids)
    return ranked_docs

def top_term_topics(term, topic_term_dists=model_data['topic_term_dists']):
    """Return the probability of ``term`` under every topic, highest first.

    Args:
        term: vocabulary term (a row label of ``topic_term_dists``).
        topic_term_dists: terms x topics DataFrame; defaults to the main model's.

    Returns:
        Series indexed by topic id, sorted by descending probability. The topic
        ids are kept as the index so callers can select topics via ``.index``.
    """
    # `sort_values` replaces `Series.order`, which pandas removed.
    return topic_term_dists.T[term].sort_values(ascending=False)

def all_top_terms(topic_term_dists=model_data['topic_term_dists']):
    """Rank the vocabulary terms within every topic of ``topic_term_dists``.

    Args:
        topic_term_dists: terms x topics DataFrame; defaults to the main model's.

    Returns:
        DataFrame with one column per topic, terms listed from most to least probable.
    """
    # Forward the frame itself, not just its column labels: previously a caller-supplied
    # frame only contributed its columns and the ranking used the default model's data.
    return top_topic_terms(topic_term_dists.columns, topic_term_dists)

def topic_docs(topic_id, doc_topic_dists=model_data['doc_topic_dists']):
    """Return every document's probability for one topic, highest first.

    Args:
        topic_id: topic column label.
        doc_topic_dists: documents x topics DataFrame; defaults to the main model's.

    Returns:
        Series indexed by document label, sorted by descending probability.
    """
    # `sort_values` replaces `Series.order`, which pandas removed.
    return doc_topic_dists[topic_id].sort_values(ascending=False)

Building a focused model around 'code'

In [20]:
# The ten topics that assign the highest probability to the term 'code'.
code_topics = top_term_topics('code')[0:10]
code_topics
Out[20]:
59    0.075427
16    0.053946
5     0.040998
2     0.024466
51    0.024436
6     0.021635
48    0.020752
68    0.019684
12    0.009131
61    0.004977
Name: code, dtype: float64
In [21]:
# Keep only topics where 'code' has a probability above 0.01.
code_topics = code_topics[code_topics > 0.01]
In [22]:
# Number of topics left after the 0.01 cut-off.
len(code_topics)
Out[22]:
8
In [23]:
# Labels of the five highest-ranked documents for each remaining 'code' topic.
top_docs(code_topics.index).head(5)
Out[23]:
59 16 5 2 51 6 48 68
0 Static Analysis isnt Development Testing A Concept Design for C++ Elucidating all about Code Analysis in Visual C++ The Javascript Garden Clarifying the Roles of the .gemspec and Gemfile Working with Design Patterns in JAVA ASF comment on JSR#336 (Java 7) review ballot A Hacker\u2019s Guide to Git
1 Test Driven Development Really Works - The Hil... It Is Not Called The "STL", Mmkay? Collection of Examples of 64-bit Errors in Rea... JavaScript Garden Clarifying the Roles of the .gemspec and Gemfile Working with interfaces in JAVA Why Open Source misses the point of Free Software Git Workflows & tutorials by Atlassian
2 My love affair with code reviews Interesting Standard Libraries to Study (Ltu) Improving and Fixing C Code Javascript Garden Running Pure Django Projects on Google App Engine Dependency Injection - An Introductory Tutoria... Qt to ship standard in Ubuntu 11.10 Cheat git
3 Thoughts on Developer Testing A Brief, Incomplete, and Mostly Wrong History ... The Art of Picking Intel Registers (2003) Understanding Python Decorators Beginning Ember.js on Rails: Part 1 Tinyweb Does Mono JSRs for Java 7 and Java 8 Approved Develop Faster: Set Up Your Git Fork and Merge...
4 The Problems with Unit Testing Frameworks Moving from Java to Scala - One year later... A Collection of Examples of 64-bit Errors in R... Understanding Python Decorators Backbone vs Ember Using MEF to expose interfaces in your Silverl... Control Points and Steering Mechanisms in Open... Git: Merging the right way
In [24]:
# Tighten the cut-off: keep only topics where 'code' has a probability above 0.02.
code_topics = code_topics[code_topics > 0.02]
In [25]:
# Same top-document table, now for the stricter topic selection.
top_docs(code_topics.index).head(5)
Out[25]:
59 16 5 2 51 6 48
0 Static Analysis isnt Development Testing A Concept Design for C++ Elucidating all about Code Analysis in Visual C++ The Javascript Garden Clarifying the Roles of the .gemspec and Gemfile Working with Design Patterns in JAVA ASF comment on JSR#336 (Java 7) review ballot
1 Test Driven Development Really Works - The Hil... It Is Not Called The "STL", Mmkay? Collection of Examples of 64-bit Errors in Rea... JavaScript Garden Clarifying the Roles of the .gemspec and Gemfile Working with interfaces in JAVA Why Open Source misses the point of Free Software
2 My love affair with code reviews Interesting Standard Libraries to Study (Ltu) Improving and Fixing C Code Javascript Garden Running Pure Django Projects on Google App Engine Dependency Injection - An Introductory Tutoria... Qt to ship standard in Ubuntu 11.10
3 Thoughts on Developer Testing A Brief, Incomplete, and Mostly Wrong History ... The Art of Picking Intel Registers (2003) Understanding Python Decorators Beginning Ember.js on Rails: Part 1 Tinyweb Does Mono JSRs for Java 7 and Java 8 Approved
4 The Problems with Unit Testing Frameworks Moving from Java to Scala - One year later... A Collection of Examples of 64-bit Errors in R... Understanding Python Decorators Backbone vs Ember Using MEF to expose interfaces in your Silverl... Control Points and Steering Mechanisms in Open...
In [26]:
# For each document, add up its probabilities over the selected 'code' topics, then rank
# documents from the highest to the lowest total. `sort_values` replaces `Series.order`,
# which pandas removed. The variable keeps its original spelling because a later cell uses it.
docs_ordred_by_code = model_data['doc_topic_dists'][code_topics.index].sum(axis=1).sort_values(ascending=False)
In [27]:
def fit_focused_model(ordered_docs, num_topics, num_iters=100, threshold=0.1):
    """Fit a topic model on only the documents that score above ``threshold``.

    Args:
        ordered_docs: Series of per-document scores indexed by story title.
        num_topics: number of topics for the focused model.
        num_iters: number of iterations for the fit.
        threshold: a document is kept when its score is strictly greater than this.

    Returns:
        The dict built by ``extract_dists`` for the focused model, with the
        fitted model itself added under ``'model'``.
    """
    above_threshold = ordered_docs > threshold
    kept_titles = set(ordered_docs[above_threshold].index)
    # True division here relies on the notebook's `from __future__ import division`.
    kept_pct = 100 * (len(kept_titles) / len(ordered_docs))
    print('Keeping %.2f%% of the corpus...' % kept_pct)

    # No document id was carried through, so stories are matched back by title.
    def is_kept(title):
        return title in kept_titles

    focused_stories = stories_sf[stories_sf['title'].apply(is_kept)]
    focused_bows = focused_stories['bow']

    print('Fitting model...')
    focused_tm = gl.topic_model.create(focused_bows, num_topics, num_iterations=num_iters)

    print('Creating vis data...')
    result = extract_dists(focused_tm, focused_stories)
    result['model'] = focused_tm
    return result
In [206]:
# Fit a 40-topic model (500 iterations) on the documents whose summed 'code'-topic
# probability is above 0.25.
code_model = fit_focused_model(docs_ordred_by_code, 40, num_iters=500, threshold=0.25)
Keeping 1.55% of the corpus...
Fitting model...
PROGRESS: Learning a topic model
PROGRESS:        Number of documents      4051
PROGRESS:            Vocabulary size     23879
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 1.00s         | 4.22476e+06    | 0               |
PROGRESS: | 20        | 1.93s         | 3.79484e+06    | 0               |
PROGRESS: | 30        | 2.85s         | 4.20434e+06    | 0               |
PROGRESS: | 40        | 3.71s         | 4.29245e+06    | 0               |
PROGRESS: | 50        | 4.61s         | 4.1157e+06     | 0               |
PROGRESS: | 60        | 5.50s         | 3.64592e+06    | 0               |
PROGRESS: | 70        | 6.40s         | 3.58882e+06    | 0               |
PROGRESS: | 80        | 7.32s         | 3.33084e+06    | 0               |
PROGRESS: | 90        | 8.27s         | 4.14121e+06    | 0               |
PROGRESS: | 100       | 9.18s         | 4.06099e+06    | 0               |
PROGRESS: | 110       | 10.08s        | 4.37966e+06    | 0               |
PROGRESS: | 120       | 10.96s        | 3.95687e+06    | 0               |
PROGRESS: | 130       | 11.88s        | 4.38291e+06    | 0               |
PROGRESS: | 140       | 12.73s        | 4.06117e+06    | 0               |
PROGRESS: | 150       | 13.59s        | 4.11654e+06    | 0               |
PROGRESS: | 160       | 14.50s        | 3.57834e+06    | 0               |
PROGRESS: | 170       | 15.41s        | 3.93633e+06    | 0               |
PROGRESS: | 180       | 16.30s        | 4.14158e+06    | 0               |
PROGRESS: | 190       | 17.18s        | 4.14951e+06    | 0               |
PROGRESS: | 200       | 18.08s        | 4.19639e+06    | 0               |
PROGRESS: | 210       | 18.94s        | 4.28476e+06    | 0               |
PROGRESS: | 220       | 19.82s        | 3.92746e+06    | 0               |
PROGRESS: | 230       | 20.67s        | 4.25762e+06    | 0               |
PROGRESS: | 240       | 21.58s        | 4.15865e+06    | 0               |
PROGRESS: | 250       | 22.46s        | 4.28757e+06    | 0               |
PROGRESS: | 260       | 23.42s        | 3.50532e+06    | 0               |
PROGRESS: | 270       | 24.34s        | 4.14928e+06    | 0               |
PROGRESS: | 280       | 25.21s        | 4.27971e+06    | 0               |
PROGRESS: | 290       | 26.09s        | 3.83545e+06    | 0               |
PROGRESS: | 300       | 26.97s        | 4.07273e+06    | 0               |
PROGRESS: | 310       | 27.89s        | 4.24095e+06    | 0               |
PROGRESS: | 320       | 28.78s        | 4.0067e+06     | 0               |
PROGRESS: | 330       | 29.67s        | 4.07831e+06    | 0               |
PROGRESS: | 340       | 30.58s        | 4.29804e+06    | 0               |
PROGRESS: | 350       | 31.52s        | 3.49479e+06    | 0               |
PROGRESS: | 360       | 32.45s        | 4.16546e+06    | 0               |
PROGRESS: | 370       | 33.39s        | 2.94196e+06    | 0               |
PROGRESS: | 380       | 34.29s        | 3.39923e+06    | 0               |
PROGRESS: | 390       | 35.34s        | 3.587e+06      | 0               |
PROGRESS: | 400       | 36.25s        | 4.28361e+06    | 0               |
PROGRESS: | 410       | 37.20s        | 4.29764e+06    | 0               |
PROGRESS: | 420       | 38.10s        | 4.22593e+06    | 0               |
PROGRESS: | 430       | 39.02s        | 3.66926e+06    | 0               |
PROGRESS: | 440       | 39.91s        | 4.34361e+06    | 0               |
PROGRESS: | 450       | 40.80s        | 4.39185e+06    | 0               |
PROGRESS: | 460       | 41.79s        | 3.65108e+06    | 0               |
PROGRESS: | 470       | 42.76s        | 3.85248e+06    | 0               |
PROGRESS: | 480       | 43.71s        | 3.8152e+06     | 0               |
PROGRESS: | 490       | 44.69s        | 2.877e+06      | 0               |
PROGRESS: | 500       | 45.72s        | 3.40368e+06    | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+
Creating vis data...
In [207]:
# Display the prepared LDAvis data that extract_dists stored under 'vis' for the focused model.
code_model['vis']
Out[207]:

Example of looking at a document

In [29]:
# The four most probable topics for a single story, looked up by its title.
top_topics = topics_for('Game written by 14 year old passes Angry Birds as the top free iphone app').head(4)
top_topics
Out[29]:
57    0.141026
38    0.132479
65    0.072650
70    0.072650
Name: Game written by 14 year old passes Angry Birds as the top free iphone app, dtype: float64

Without LDAvis you would then look at the top words for those topics, something like this:

In [31]:
# The five highest-probability terms for each of that story's top topics.
top_topic_terms(top_topics.index)[0:5]
Out[31]:
57 38 65 70
0 app game time movie
1 apps player day film
2 developer video game work show
3 application gaming hour story
4 user developer week episode

To look at all of the topics, you are reduced to looking at a wall of words or tables:

In [33]:
# First five rows of the ranked-terms table across every topic of the main model.
all_top_terms().head(5)
Out[33]:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 ... 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
0 problem function world government code class number scientist research system market browser country file wikileaks language time startup data android health law energy idea company ... experience google car design business video seo event service blog email music service photo question atau home february shoe feature india ask hn ask hn ask hn ask hn
1 system type life money bit method point science study process company javascript china package information code man company database device drug state water time market ... developer search vehicle designer company content brick marketing conference phone post message artist customer image answer anda service january fashion laptop travel error show hn website startup
2 model object man economy data object algorithm researcher brain application sale firefox united state directory government programming year ago entrepreneur table phone patient court power goal stock ... skill user driver website money youtube website presentation call site mail sound company picture time dalam real estate site style battery hotel der comment domain idea
3 question code book tax bug code problem cell researcher user product element world command document java day founder query google treatment obama material problem share ... company chrome system web design client medium company talk communication wordpress email address audio business camera topic itu house online sale phone match request url hacker news site advice
4 point var god bank program data result animal paper data revenue html government version assange programmer dog investor system apple doctor lawyer plant life investor ... engineer product model logo idea flash marketing day technology reader service band solution wallpaper community untuk property tip quality video sport mit post domain name web app

5 rows × 100 columns