In [2]:

from __future__ import division

import graphlab as gl
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab

# Switch pyLDAvis into notebook mode; presumably this is what lets prepared
# visualisations render inline as cell output -- confirm against pyLDAvis docs.
pyLDAvis.enable_notebook()

In [4]:

# Load the 'hn_processed.sframe' SFrame and keep a handle on its 'bow'
# column, which is what the topic models below are trained on.
stories_sf = gl.load_sframe("hn_processed.sframe")
bows = stories_sf['bow']

In [34]:

# Fit the baseline model over the 'bow' column: 100 topics, 200 iterations.
topic_model = gl.topic_model.create(bows, num_topics=100, num_iterations=200)

In [8]:

# Prepare the pyLDAvis data for the 100-topic model; being the last
# expression in the cell, the result is what the notebook displays.
pyLDAvis.graphlab.prepare(topic_model, bows)

Out[8]:

You can fit more topics, and the topics become more fine-grained. They become difficult to visualize in the intertopic map, though.

In [16]:

# Same corpus and iteration count as the baseline, but with 150 topics,
# followed by the pyLDAvis view of that larger model.
topic_model150 = gl.topic_model.create(bows, num_topics=150, num_iterations=200)
pyLDAvis.graphlab.prepare(topic_model150, bows)

Out[16]:

In [17]:

import re
# Matches a "/NOUN", "/ADJ", "/VERB" or "/ADV" tag inside a vocabulary term
# (these look like part-of-speech suffixes added during preprocessing).
pos_re = re.compile(r'/(NOUN|ADJ|VERB|ADV)')

def extract_dists(model, sf=stories_sf):
    """Build labelled topic/term and document/topic tables for a fitted model.

    Parameters
    ----------
    model : fitted topic model
        Passed straight to ``pyLDAvis.graphlab._extract_data``.
    sf : SFrame, optional
        Frame providing the ``'bow'`` column the model is evaluated on and the
        ``'title'`` column used to label documents. Defaults to the
        ``stories_sf`` that existed when this function was defined.

    Returns
    -------
    dict
        The dict produced by ``_extract_data`` with these entries changed or
        added:

        * ``'vocab'`` -- terms with the tag suffix removed and underscores
          replaced by spaces.
        * ``'topic_term_dists'`` -- DataFrame indexed by term, one column per
          topic.
        * ``'doc_topic_dists'`` -- DataFrame indexed by ``sf['title']``, one
          column per topic.
        * ``'vis'`` -- the object returned by ``pyLDAvis.prepare``.

        In both DataFrames the columns follow ``vis_data.topic_order`` and are
        relabelled ``1..K`` so they line up with the topic numbers of the
        prepared visualisation.
    """
    data = pyLDAvis.graphlab._extract_data(model, sf['bow'])
    # Clean the vocabulary *before* preparing the visualisation so that both
    # the visualisation and the tables below use the same readable terms.
    vocab = data['vocab'] = [pos_re.sub('', t).replace('_', ' ') for t in data['vocab']]
    vis_data = pyLDAvis.prepare(**data)

    # topic_order is treated as 1-based here, hence the shift to 0-based
    # column positions. A plain list keeps this cell free of a numpy
    # dependency (the notebook never imports numpy).
    new_order = [topic - 1 for topic in vis_data.topic_order]
    topic_ids = list(range(1, len(new_order) + 1))

    term_dists = pd.DataFrame(data['topic_term_dists'].T, index=vocab)[new_order]
    term_dists.columns = topic_ids
    doc_dists = pd.DataFrame(data['doc_topic_dists'], index=sf['title'])[new_order]
    doc_dists.columns = topic_ids

    data['topic_term_dists'] = term_dists
    data['doc_topic_dists'] = doc_dists
    # vis_data has already been dereferenced above, so it is always present
    # at this point and can be stored unconditionally.
    data['vis'] = vis_data
    return data

In [19]:

# Labelled term/document tables for the 100-topic model. The helper functions
# defined below bind these tables as default arguments at definition time.
model_data = extract_dists(topic_model)

def topics_for(doc_name, doc_dist=model_data['doc_topic_dists']):
    """Return the topic weights of one document, heaviest topic first.

    Parameters
    ----------
    doc_name : str
        Row label (story title) to look up in ``doc_dist``.
    doc_dist : pandas.DataFrame, optional
        Documents x topics table; defaults to the table in ``model_data``.

    Returns
    -------
    pandas.Series
        Topic id -> weight, sorted in descending order.

    Raises
    ------
    KeyError
        If ``doc_name`` is not a row label of ``doc_dist``.

    Notes
    -----
    Uses ``.loc`` / ``sort_values`` (pandas >= 0.17) because ``.ix`` and
    ``Series.order`` were removed from pandas.
    """
    rows = doc_dist.loc[doc_name]
    if isinstance(rows, pd.DataFrame):
        # Titles are not unique in this corpus, so a label can match several
        # rows; fall back to the first match so the result is still a Series.
        rows = rows.iloc[0]
    return rows.sort_values(ascending=False)

def _sort_cols(df, cols):
    res = df[cols].apply(lambda probs: probs.order(ascending=False).index)
    return res.reset_index(drop=True)

def top_topic_terms(topic_ids, topic_term_dists=model_data['topic_term_dists']):
    """Rank the terms of each requested topic, most probable term first.

    ``topic_ids`` are column labels of ``topic_term_dists`` (by default the
    terms x topics table in ``model_data``). The result has one column per
    topic id and a ``0..n-1`` index.
    """
    ranked_terms = _sort_cols(topic_term_dists, topic_ids)
    return ranked_terms

def top_docs(topic_ids, doc_topic_dists=model_data['doc_topic_dists']):
    """Rank the documents of each requested topic, heaviest document first.

    ``topic_ids`` are column labels of ``doc_topic_dists`` (by default the
    documents x topics table in ``model_data``). The result has one column per
    topic id and a ``0..n-1`` index.
    """
    ranked_docs = _sort_cols(doc_topic_dists, topic_ids)
    return ranked_docs

def top_term_topics(term, topic_term_dists=model_data['topic_term_dists']):
    """Return the probability of ``term`` in every topic, largest first.

    Parameters
    ----------
    term : str
        Row label (vocabulary term) of ``topic_term_dists``.
    topic_term_dists : pandas.DataFrame, optional
        Terms x topics table; defaults to the table in ``model_data``.

    Returns
    -------
    pandas.Series
        Topic id -> probability of ``term``, sorted in descending order. The
        topic ids are kept as the index so they can be used for selection.

    Raises
    ------
    KeyError
        If ``term`` is not a row label of ``topic_term_dists``.

    Notes
    -----
    Selects the row directly instead of transposing the whole table, and uses
    ``Series.sort_values`` (pandas >= 0.17) because ``Series.order`` was
    removed from pandas.
    """
    return topic_term_dists.loc[term].sort_values(ascending=False)

def all_top_terms(topic_term_dists=model_data['topic_term_dists']):
    """Rank the terms of every topic in ``topic_term_dists``.

    Parameters
    ----------
    topic_term_dists : pandas.DataFrame, optional
        Terms x topics table; defaults to the table in ``model_data``.

    Returns
    -------
    pandas.DataFrame
        One column per topic, terms listed from most to least probable.
    """
    # Forward the table explicitly: otherwise top_topic_terms would rank terms
    # from its own default table even when a different one is passed in here.
    return top_topic_terms(topic_term_dists.columns, topic_term_dists)

def topic_docs(topic_id, doc_topic_dists=model_data['doc_topic_dists']):
    """Return every document's weight for one topic, heaviest first.

    Parameters
    ----------
    topic_id : column label
        Topic column of ``doc_topic_dists`` to read.
    doc_topic_dists : pandas.DataFrame, optional
        Documents x topics table; defaults to the table in ``model_data``.

    Returns
    -------
    pandas.Series
        Document label -> weight, sorted in descending order.

    Notes
    -----
    Uses ``Series.sort_values`` (pandas >= 0.17); the former ``Series.order``
    was removed from pandas.
    """
    return doc_topic_dists[topic_id].sort_values(ascending=False)

Building a focused model around 'code'¶

In [20]:

# The ten topics in which the term 'code' has the highest probability.
code_topics = top_term_topics('code')[0:10]
code_topics

Out[20]:

59    0.075427
16    0.053946
5     0.040998
2     0.024466
51    0.024436
6     0.021635
48    0.020752
68    0.019684
12    0.009131
61    0.004977
Name: code, dtype: float64

In [21]:

# Keep only the topics where 'code' has probability above 0.01.
code_topics = code_topics[code_topics > 0.01]

In [22]:

# Number of topics that passed the 0.01 cut-off.
len(code_topics)

Out[22]:

In [23]:

# Five heaviest stories for each of the remaining 'code' topics.
top_docs(code_topics.index).head(5)

Out[23]:

	59	16	5	2	51	6	48	68
0	Static Analysis isnt Development Testing	A Concept Design for C++	Elucidating all about Code Analysis in Visual C++	The Javascript Garden	Clarifying the Roles of the .gemspec and Gemfile	Working with Design Patterns in JAVA	ASF comment on JSR#336 (Java 7) review ballot	A Hacker\u2019s Guide to Git
1	Test Driven Development Really Works - The Hil...	It Is Not Called The "STL", Mmkay?	Collection of Examples of 64-bit Errors in Rea...	JavaScript Garden	Clarifying the Roles of the .gemspec and Gemfile	Working with interfaces in JAVA	Why Open Source misses the point of Free Software	Git Workflows & tutorials by Atlassian
2	My love affair with code reviews	Interesting Standard Libraries to Study (Ltu)	Improving and Fixing C Code	Javascript Garden	Running Pure Django Projects on Google App Engine	Dependency Injection - An Introductory Tutoria...	Qt to ship standard in Ubuntu 11.10	Cheat git
3	Thoughts on Developer Testing	A Brief, Incomplete, and Mostly Wrong History ...	The Art of Picking Intel Registers (2003)	Understanding Python Decorators	Beginning Ember.js on Rails: Part 1	Tinyweb Does Mono	JSRs for Java 7 and Java 8 Approved	Develop Faster: Set Up Your Git Fork and Merge...
4	The Problems with Unit Testing Frameworks	Moving from Java to Scala - One year later...	A Collection of Examples of 64-bit Errors in R...	Understanding Python Decorators	Backbone vs Ember	Using MEF to expose interfaces in your Silverl...	Control Points and Steering Mechanisms in Open...	Git: Merging the right way

In [24]:

# Tighten the cut-off to 0.02 (applied on top of the earlier 0.01 filter).
code_topics = code_topics[code_topics > 0.02]

In [25]:

# Five heaviest stories for each topic that passed the 0.02 cut-off.
top_docs(code_topics.index).head(5)

Out[25]:

	59	16	5	2	51	6	48
0	Static Analysis isnt Development Testing	A Concept Design for C++	Elucidating all about Code Analysis in Visual C++	The Javascript Garden	Clarifying the Roles of the .gemspec and Gemfile	Working with Design Patterns in JAVA	ASF comment on JSR#336 (Java 7) review ballot
1	Test Driven Development Really Works - The Hil...	It Is Not Called The "STL", Mmkay?	Collection of Examples of 64-bit Errors in Rea...	JavaScript Garden	Clarifying the Roles of the .gemspec and Gemfile	Working with interfaces in JAVA	Why Open Source misses the point of Free Software
2	My love affair with code reviews	Interesting Standard Libraries to Study (Ltu)	Improving and Fixing C Code	Javascript Garden	Running Pure Django Projects on Google App Engine	Dependency Injection - An Introductory Tutoria...	Qt to ship standard in Ubuntu 11.10
3	Thoughts on Developer Testing	A Brief, Incomplete, and Mostly Wrong History ...	The Art of Picking Intel Registers (2003)	Understanding Python Decorators	Beginning Ember.js on Rails: Part 1	Tinyweb Does Mono	JSRs for Java 7 and Java 8 Approved
4	The Problems with Unit Testing Frameworks	Moving from Java to Scala - One year later...	A Collection of Examples of 64-bit Errors in R...	Understanding Python Decorators	Backbone vs Ember	Using MEF to expose interfaces in your Silverl...	Control Points and Steering Mechanisms in Open...

In [26]:

# Score every story by its total weight across the selected 'code' topics and
# rank the stories from highest to lowest score. sort_values replaces the
# removed Series.order (requires pandas >= 0.17).
docs_ordred_by_code = model_data['doc_topic_dists'][code_topics.index].sum(axis=1).sort_values(ascending=False)

In [27]:

def fit_focused_model(ordered_docs, num_topics, num_iters=100, threshold=0.1):
    """Fit a topic model on the stories whose score exceeds ``threshold``.

    Parameters
    ----------
    ordered_docs : pandas.Series
        Per-story scores indexed by story title.
    num_topics : int
        Number of topics for the focused model.
    num_iters : int, optional
        Number of training iterations (default 100).
    threshold : float, optional
        Stories with a score strictly greater than this are kept (default 0.1).

    Returns
    -------
    dict
        The output of ``extract_dists`` for the focused model, plus the fitted
        model itself under the ``'model'`` key.

    Notes
    -----
    Stories are selected from the global ``stories_sf`` by title. ``subset``
    holds unique titles, so stories that share a title are counted once in the
    printed percentage and are all kept together by the title filter.
    """
    subset = set(ordered_docs[ordered_docs > threshold].index)
    total_docs = len(ordered_docs)
    # The float literal keeps this a true division even without
    # `from __future__ import division`; the guard avoids a ZeroDivisionError
    # when no scores are supplied.
    kept_pct = 100.0 * len(subset) / total_docs if total_docs else 0.0
    print('Keeping %.2f%% of the corpus...' % kept_pct)
    # I should have kept the doc index around, oh well..
    stories_subset = stories_sf[stories_sf['title'].apply(lambda t: t in subset)]
    subset_bows = stories_subset['bow']
    print('Fitting model...')
    tm = gl.topic_model.create(subset_bows, num_topics=num_topics, num_iterations=num_iters)
    print('Creating vis data...')
    data = extract_dists(tm, stories_subset)
    data['model'] = tm
    return data

In [206]:

# Fit a 40-topic model (500 iterations) on the stories whose summed weight
# across the selected 'code' topics is above 0.25.
code_model = fit_focused_model(docs_ordred_by_code, 40, num_iters=500, threshold=0.25)

Keeping 1.55% of the corpus...
Fitting model...
PROGRESS: Learning a topic model
PROGRESS:        Number of documents      4051
PROGRESS:            Vocabulary size     23879
PROGRESS:    Running collapsed Gibbs sampling
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | Iteration | Elapsed Time  | Tokens/Second  | Est. Perplexity |
PROGRESS: +-----------+---------------+----------------+-----------------+
PROGRESS: | 10        | 1.00s         | 4.22476e+06    | 0               |
PROGRESS: | 20        | 1.93s         | 3.79484e+06    | 0               |
PROGRESS: | 30        | 2.85s         | 4.20434e+06    | 0               |
PROGRESS: | 40        | 3.71s         | 4.29245e+06    | 0               |
PROGRESS: | 50        | 4.61s         | 4.1157e+06     | 0               |
PROGRESS: | 60        | 5.50s         | 3.64592e+06    | 0               |
PROGRESS: | 70        | 6.40s         | 3.58882e+06    | 0               |
PROGRESS: | 80        | 7.32s         | 3.33084e+06    | 0               |
PROGRESS: | 90        | 8.27s         | 4.14121e+06    | 0               |
PROGRESS: | 100       | 9.18s         | 4.06099e+06    | 0               |
PROGRESS: | 110       | 10.08s        | 4.37966e+06    | 0               |
PROGRESS: | 120       | 10.96s        | 3.95687e+06    | 0               |
PROGRESS: | 130       | 11.88s        | 4.38291e+06    | 0               |
PROGRESS: | 140       | 12.73s        | 4.06117e+06    | 0               |
PROGRESS: | 150       | 13.59s        | 4.11654e+06    | 0               |
PROGRESS: | 160       | 14.50s        | 3.57834e+06    | 0               |
PROGRESS: | 170       | 15.41s        | 3.93633e+06    | 0               |
PROGRESS: | 180       | 16.30s        | 4.14158e+06    | 0               |
PROGRESS: | 190       | 17.18s        | 4.14951e+06    | 0               |
PROGRESS: | 200       | 18.08s        | 4.19639e+06    | 0               |
PROGRESS: | 210       | 18.94s        | 4.28476e+06    | 0               |
PROGRESS: | 220       | 19.82s        | 3.92746e+06    | 0               |
PROGRESS: | 230       | 20.67s        | 4.25762e+06    | 0               |
PROGRESS: | 240       | 21.58s        | 4.15865e+06    | 0               |
PROGRESS: | 250       | 22.46s        | 4.28757e+06    | 0               |
PROGRESS: | 260       | 23.42s        | 3.50532e+06    | 0               |
PROGRESS: | 270       | 24.34s        | 4.14928e+06    | 0               |
PROGRESS: | 280       | 25.21s        | 4.27971e+06    | 0               |
PROGRESS: | 290       | 26.09s        | 3.83545e+06    | 0               |
PROGRESS: | 300       | 26.97s        | 4.07273e+06    | 0               |
PROGRESS: | 310       | 27.89s        | 4.24095e+06    | 0               |
PROGRESS: | 320       | 28.78s        | 4.0067e+06     | 0               |
PROGRESS: | 330       | 29.67s        | 4.07831e+06    | 0               |
PROGRESS: | 340       | 30.58s        | 4.29804e+06    | 0               |
PROGRESS: | 350       | 31.52s        | 3.49479e+06    | 0               |
PROGRESS: | 360       | 32.45s        | 4.16546e+06    | 0               |
PROGRESS: | 370       | 33.39s        | 2.94196e+06    | 0               |
PROGRESS: | 380       | 34.29s        | 3.39923e+06    | 0               |
PROGRESS: | 390       | 35.34s        | 3.587e+06      | 0               |
PROGRESS: | 400       | 36.25s        | 4.28361e+06    | 0               |
PROGRESS: | 410       | 37.20s        | 4.29764e+06    | 0               |
PROGRESS: | 420       | 38.10s        | 4.22593e+06    | 0               |
PROGRESS: | 430       | 39.02s        | 3.66926e+06    | 0               |
PROGRESS: | 440       | 39.91s        | 4.34361e+06    | 0               |
PROGRESS: | 450       | 40.80s        | 4.39185e+06    | 0               |
PROGRESS: | 460       | 41.79s        | 3.65108e+06    | 0               |
PROGRESS: | 470       | 42.76s        | 3.85248e+06    | 0               |
PROGRESS: | 480       | 43.71s        | 3.8152e+06     | 0               |
PROGRESS: | 490       | 44.69s        | 2.877e+06      | 0               |
PROGRESS: | 500       | 45.72s        | 3.40368e+06    | 0               |
PROGRESS: +-----------+---------------+----------------+-----------------+
Creating vis data...

In [207]:

# Display the pyLDAvis data that extract_dists prepared for the focused model.
code_model['vis']

Out[207]:

Example of looking at a document¶

In [29]:

# The four heaviest topics of a single story, looked up by its title.
top_topics = topics_for('Game written by 14 year old passes Angry Birds as the top free iphone app').head(4)
top_topics

Out[29]:

57    0.141026
38    0.132479
65    0.072650
70    0.072650
Name: Game written by 14 year old passes Angry Birds as the top free iphone app, dtype: float64

Without LDAvis you would then look at the top words for those topics, something like this:

In [31]:

# Five most probable terms for each of that story's top topics.
top_topic_terms(top_topics.index)[0:5]

Out[31]:

	57	38	65	70
0	app	game	time	movie
1	apps	player	day	film
2	developer	video game	work	show
3	application	gaming	hour	story
4	user	developer	week	episode

To look at all of the topics, you are reduced to looking at a wall of words or tables:

In [33]:

# Five most probable terms for every topic of the 100-topic model.
all_top_terms().head(5)

Out[33]:

	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16	17	18	19	20	21	22	23	24	25	...	76	77	78	79	80	81	82	83	84	85	86	87	88	89	90	91	92	93	94	95	96	97	98	99	100
0	problem	function	world	government	code	class	number	scientist	research	system	market	browser	country	file	wikileaks	language	time	startup	data	android	health	law	energy	idea	company	...	experience	google	car	design	business	video	seo	event	service	blog	email	music	service	photo	question	atau	home	february	shoe	feature	india	ask hn	ask hn	ask hn	ask hn
1	system	type	life	money	bit	method	point	science	study	process	company	javascript	china	package	information	code	man	company	database	device	drug	state	water	time	market	...	developer	search	vehicle	designer	company	content	brick marketing	conference	phone	post	message	artist	customer	image	answer	anda	service	january	fashion	laptop	travel	error	show hn	website	startup
2	model	object	man	economy	data	object	algorithm	researcher	brain	application	sale	firefox	united state	directory	government	programming	year ago	entrepreneur	table	phone	patient	court	power	goal	stock	...	skill	user	driver	website	money	youtube	website	presentation	call	site	mail	sound	company	picture	time	dalam	real estate	site	style	battery	hotel	der	comment	domain	idea
3	question	code	book	tax	bug	code	problem	cell	researcher	user	product	element	world	command	document	java	day	founder	query	google	treatment	obama	material	problem	share	...	company	chrome	system	web design	client	medium	company	talk	communication	wordpress	email address	audio	business	camera	topic	itu	house	online	sale	phone	match	request url	hacker news	site	advice
4	point	var	god	bank	program	data	result	animal	paper	data	revenue	html	government	version	assange	programmer	dog	investor	system	apple	doctor	lawyer	plant	life	investor	...	engineer	product	model	logo	idea	flash	marketing	day	technology	reader	service	band	solution	wallpaper	community	untuk	property	tip	quality	video	sport	mit	post	domain name	web app

5 rows × 100 columns