from __future__ import division
import graphlab as gl
import pandas as pd
import pyLDAvis
import pyLDAvis.graphlab
# Switch on pyLDAvis's notebook display support before any visualization is built.
pyLDAvis.enable_notebook()
# Load the pre-processed SFrame from disk; its 'bow' column (presumably
# bag-of-words counts per story -- confirm against the preprocessing step)
# is the input the topic models below are fit on.
stories_sf = gl.load_sframe("hn_processed.sframe")
bows = stories_sf['bow']
# Fit a 100-topic model with 200 iterations.
topic_model = gl.topic_model.create(bows, num_topics=100, num_iterations=200)
# Build the pyLDAvis data for the fitted model; as the last expression of a
# notebook cell the result is what gets displayed.
pyLDAvis.graphlab.prepare(topic_model, bows)
You can fit more topics, and the topics become more fine-grained. They become difficult to visualize in the intertopic map, though.
# Same corpus and iteration count as the first model, but with 150 topics.
topic_model150 = gl.topic_model.create(bows, num_topics=150, num_iterations=200)
# Build the pyLDAvis data for the 150-topic model.
pyLDAvis.graphlab.prepare(topic_model150, bows)
import re
# Matches a "/NOUN", "/ADJ", "/VERB" or "/ADV" suffix (these look like
# part-of-speech tags); extract_dists uses it to strip them from vocabulary terms.
pos_re = re.compile(r'/(NOUN|ADJ|VERB|ADV)')
def extract_dists(model, sf=stories_sf):
    """Return the model's distributions as labelled DataFrames in LDAvis topic order.

    Args:
        model: fitted GraphLab topic model.
        sf: SFrame the model was fit on. Its 'bow' column is passed to
            pyLDAvis and its 'title' column labels the document rows.
            Defaults to the full ``stories_sf`` (bound at definition time).

    Returns:
        The dict produced by ``pyLDAvis.graphlab._extract_data`` with these
        entries replaced or added:

        * ``'vocab'``: terms with the "/NOUN"-style suffix removed and
          underscores turned into spaces.
        * ``'topic_term_dists'``: DataFrame, one row per term and one column
          per topic.
        * ``'doc_topic_dists'``: DataFrame, one row per document title and
          one column per topic.
        * ``'vis'``: the prepared pyLDAvis data.

        Topic columns are renumbered ``1..n`` following
        ``vis_data.topic_order``, so the ids here line up with the ids shown
        in the visualization.
    """
    # The file's import block never imports numpy, so import it locally
    # instead of relying on a pylab-style interactive namespace.
    import numpy as np

    data = pyLDAvis.graphlab._extract_data(model, sf['bow'])
    # Clean the vocabulary in place so pyLDAvis and the DataFrames below
    # share the same human-readable term labels.
    vocab = data['vocab'] = [pos_re.sub('', t).replace('_', ' ') for t in data['vocab']]
    vis_data = pyLDAvis.prepare(**data)

    # Subtract 1 to turn the reported topic order into 0-based positions that
    # select the matching columns of the raw distributions.
    new_order = np.array(vis_data.topic_order) - 1
    # A concrete list behaves the same on Python 2 and 3 when used as labels.
    topic_ids = list(range(1, len(new_order) + 1))

    data['topic_term_dists'] = pd.DataFrame(data['topic_term_dists'].T, index=vocab)[new_order]
    data['topic_term_dists'].columns = topic_ids
    data['doc_topic_dists'] = pd.DataFrame(data['doc_topic_dists'], index=sf['title'])[new_order]
    data['doc_topic_dists'].columns = topic_ids

    if vis_data:
        data['vis'] = vis_data
    return data
# Labelled distributions for the 100-topic model; the helper functions below
# bind these DataFrames as their default arguments.
model_data = extract_dists(topic_model)
def topics_for(doc_name, doc_dist=model_data['doc_topic_dists']):
    """Return the topic weights of one document, highest first.

    Args:
        doc_name: row label of the document in ``doc_dist``.
        doc_dist: DataFrame with one row per document and one column per
            topic. Defaults to the full model's distributions.

    Returns:
        The selected row sorted in descending order, indexed by topic id.
    """
    # .loc / sort_values replace .ix / Series.order, both of which have been
    # removed from pandas.
    return doc_dist.loc[doc_name].sort_values(ascending=False)
def _sort_cols(df, cols):
res = df[cols].apply(lambda probs: probs.order(ascending=False).index)
return res.reset_index(drop=True)
def top_topic_terms(topic_ids, topic_term_dists=model_data['topic_term_dists']):
    """Rank the terms of each requested topic.

    Args:
        topic_ids: column labels of ``topic_term_dists`` to rank.
        topic_term_dists: DataFrame with one row per term and one column per
            topic. Defaults to the full model's distributions.

    Returns:
        Whatever ``_sort_cols`` produces for those columns: one column per
        topic listing the term labels in ranked order.
    """
    ranked_terms = _sort_cols(topic_term_dists, topic_ids)
    return ranked_terms
def top_docs(topic_ids, doc_topic_dists=model_data['doc_topic_dists']):
    """Rank the documents of each requested topic.

    Args:
        topic_ids: column labels of ``doc_topic_dists`` to rank.
        doc_topic_dists: DataFrame with one row per document and one column
            per topic. Defaults to the full model's distributions.

    Returns:
        Whatever ``_sort_cols`` produces for those columns: one column per
        topic listing the document labels in ranked order.
    """
    ranked_docs = _sort_cols(doc_topic_dists, topic_ids)
    return ranked_docs
def top_term_topics(term, topic_term_dists=model_data['topic_term_dists']):
    """Return the topics in which ``term`` carries the most weight.

    Args:
        term: row label (vocabulary term) of ``topic_term_dists``.
        topic_term_dists: DataFrame with one row per term and one column per
            topic. Defaults to the full model's distributions.

    Returns:
        The term's per-topic values sorted in descending order, indexed by
        topic id.
    """
    term_weights = topic_term_dists.T[term]
    # Series.order was removed from pandas; sort_values is its replacement.
    return term_weights.sort_values(ascending=False)
def all_top_terms(topic_term_dists=model_data['topic_term_dists']):
    """Rank the terms of every topic in ``topic_term_dists``.

    Args:
        topic_term_dists: DataFrame with one row per term and one column per
            topic. Defaults to the full model's distributions.

    Returns:
        The result of ``top_topic_terms`` for all columns of
        ``topic_term_dists``.
    """
    # Forward the frame itself as well as its columns. Previously only the
    # columns were passed, so a caller-supplied frame was ranked against
    # top_topic_terms' default (the global model) instead.
    return top_topic_terms(topic_term_dists.columns, topic_term_dists)
def topic_docs(topic_id, doc_topic_dists=model_data['doc_topic_dists']):
    """Return every document's weight for one topic, highest first.

    Args:
        topic_id: column label of the topic in ``doc_topic_dists``.
        doc_topic_dists: DataFrame with one row per document and one column
            per topic. Defaults to the full model's distributions.

    Returns:
        The selected column sorted in descending order, indexed by document.
    """
    # Series.order was removed from pandas; sort_values is its replacement.
    return doc_topic_dists[topic_id].sort_values(ascending=False)
# First ten entries of the descending topic ranking for the term 'code'.
code_topics = top_term_topics('code')[0:10]
code_topics
59 0.075427 16 0.053946 5 0.040998 2 0.024466 51 0.024436 6 0.021635 48 0.020752 68 0.019684 12 0.009131 61 0.004977 Name: code, dtype: float64
# Keep only the topics whose value for 'code' exceeds 0.01, then count them.
code_topics = code_topics[code_topics > 0.01]
len(code_topics)
8
# First five rows of the per-topic document ranking for the retained topics.
top_docs(code_topics.index).head(5)
59 | 16 | 5 | 2 | 51 | 6 | 48 | 68 | |
---|---|---|---|---|---|---|---|---|
0 | Static Analysis isnt Development Testing | A Concept Design for C++ | Elucidating all about Code Analysis in Visual C++ | The Javascript Garden | Clarifying the Roles of the .gemspec and Gemfile | Working with Design Patterns in JAVA | ASF comment on JSR#336 (Java 7) review ballot | A Hacker\u2019s Guide to Git |
1 | Test Driven Development Really Works - The Hil... | It Is Not Called The "STL", Mmkay? | Collection of Examples of 64-bit Errors in Rea... | JavaScript Garden | Clarifying the Roles of the .gemspec and Gemfile | Working with interfaces in JAVA | Why Open Source misses the point of Free Software | Git Workflows & tutorials by Atlassian |
2 | My love affair with code reviews | Interesting Standard Libraries to Study (Ltu) | Improving and Fixing C Code | Javascript Garden | Running Pure Django Projects on Google App Engine | Dependency Injection - An Introductory Tutoria... | Qt to ship standard in Ubuntu 11.10 | Cheat git |
3 | Thoughts on Developer Testing | A Brief, Incomplete, and Mostly Wrong History ... | The Art of Picking Intel Registers (2003) | Understanding Python Decorators | Beginning Ember.js on Rails: Part 1 | Tinyweb Does Mono | JSRs for Java 7 and Java 8 Approved | Develop Faster: Set Up Your Git Fork and Merge... |
4 | The Problems with Unit Testing Frameworks | Moving from Java to Scala - One year later... | A Collection of Examples of 64-bit Errors in R... | Understanding Python Decorators | Backbone vs Ember | Using MEF to expose interfaces in your Silverl... | Control Points and Steering Mechanisms in Open... | Git: Merging the right way |
# Raise the cut-off to 0.02 and look at the document ranking again.
code_topics = code_topics[code_topics > 0.02]
top_docs(code_topics.index).head(5)
59 | 16 | 5 | 2 | 51 | 6 | 48 | |
---|---|---|---|---|---|---|---|
0 | Static Analysis isnt Development Testing | A Concept Design for C++ | Elucidating all about Code Analysis in Visual C++ | The Javascript Garden | Clarifying the Roles of the .gemspec and Gemfile | Working with Design Patterns in JAVA | ASF comment on JSR#336 (Java 7) review ballot |
1 | Test Driven Development Really Works - The Hil... | It Is Not Called The "STL", Mmkay? | Collection of Examples of 64-bit Errors in Rea... | JavaScript Garden | Clarifying the Roles of the .gemspec and Gemfile | Working with interfaces in JAVA | Why Open Source misses the point of Free Software |
2 | My love affair with code reviews | Interesting Standard Libraries to Study (Ltu) | Improving and Fixing C Code | Javascript Garden | Running Pure Django Projects on Google App Engine | Dependency Injection - An Introductory Tutoria... | Qt to ship standard in Ubuntu 11.10 |
3 | Thoughts on Developer Testing | A Brief, Incomplete, and Mostly Wrong History ... | The Art of Picking Intel Registers (2003) | Understanding Python Decorators | Beginning Ember.js on Rails: Part 1 | Tinyweb Does Mono | JSRs for Java 7 and Java 8 Approved |
4 | The Problems with Unit Testing Frameworks | Moving from Java to Scala - One year later... | A Collection of Examples of 64-bit Errors in R... | Understanding Python Decorators | Backbone vs Ember | Using MEF to expose interfaces in your Silverl... | Control Points and Steering Mechanisms in Open... |
# Sum each document's values over the selected 'code' topics and sort the
# totals in descending order. sort_values replaces Series.order, which was
# removed from pandas.
docs_ordred_by_code = model_data['doc_topic_dists'][code_topics.index].sum(axis=1).sort_values(ascending=False)
def fit_focused_model(ordered_docs, num_topics, num_iters=100, threshold=0.1):
    """Fit a topic model on only the documents that score above ``threshold``.

    Args:
        ordered_docs: Series of per-document scores whose index holds the
            document titles.
        num_topics: number of topics for the new model.
        num_iters: number of training iterations passed to GraphLab.
        threshold: documents with a score strictly greater than this are kept.

    Returns:
        The dict returned by ``extract_dists`` for the new model and the
        filtered SFrame, with the fitted model added under ``'model'``.

    Progress messages are printed along the way. The reported percentage is
    the number of distinct kept titles divided by ``len(ordered_docs)``.
    """
    above_cutoff = ordered_docs[ordered_docs > threshold]
    kept_titles = set(above_cutoff.index)
    kept_ratio = len(kept_titles) / len(ordered_docs)
    print('Keeping %.2f%% of the corpus...' % (100 * kept_ratio))

    # No document id survives in ordered_docs, so rows of the SFrame are
    # matched back by title.
    title_is_kept = stories_sf['title'].apply(lambda title: title in kept_titles)
    focused_stories = stories_sf[title_is_kept]

    print('Fitting model...')
    focused_model = gl.topic_model.create(
        focused_stories['bow'], num_topics, num_iterations=num_iters)

    print('Creating vis data...')
    result = extract_dists(focused_model, focused_stories)
    result['model'] = focused_model
    return result
# Fit a 40-topic model (500 iterations) on the documents whose summed
# 'code'-topic score exceeds 0.25.
code_model = fit_focused_model(docs_ordred_by_code, 40, num_iters=500, threshold=0.25)
Keeping 1.55% of the corpus... Fitting model... PROGRESS: Learning a topic model PROGRESS: Number of documents 4051 PROGRESS: Vocabulary size 23879 PROGRESS: Running collapsed Gibbs sampling PROGRESS: +-----------+---------------+----------------+-----------------+ PROGRESS: | Iteration | Elapsed Time | Tokens/Second | Est. Perplexity | PROGRESS: +-----------+---------------+----------------+-----------------+ PROGRESS: | 10 | 1.00s | 4.22476e+06 | 0 | PROGRESS: | 20 | 1.93s | 3.79484e+06 | 0 | PROGRESS: | 30 | 2.85s | 4.20434e+06 | 0 | PROGRESS: | 40 | 3.71s | 4.29245e+06 | 0 | PROGRESS: | 50 | 4.61s | 4.1157e+06 | 0 | PROGRESS: | 60 | 5.50s | 3.64592e+06 | 0 | PROGRESS: | 70 | 6.40s | 3.58882e+06 | 0 | PROGRESS: | 80 | 7.32s | 3.33084e+06 | 0 | PROGRESS: | 90 | 8.27s | 4.14121e+06 | 0 | PROGRESS: | 100 | 9.18s | 4.06099e+06 | 0 | PROGRESS: | 110 | 10.08s | 4.37966e+06 | 0 | PROGRESS: | 120 | 10.96s | 3.95687e+06 | 0 | PROGRESS: | 130 | 11.88s | 4.38291e+06 | 0 | PROGRESS: | 140 | 12.73s | 4.06117e+06 | 0 | PROGRESS: | 150 | 13.59s | 4.11654e+06 | 0 | PROGRESS: | 160 | 14.50s | 3.57834e+06 | 0 | PROGRESS: | 170 | 15.41s | 3.93633e+06 | 0 | PROGRESS: | 180 | 16.30s | 4.14158e+06 | 0 | PROGRESS: | 190 | 17.18s | 4.14951e+06 | 0 | PROGRESS: | 200 | 18.08s | 4.19639e+06 | 0 | PROGRESS: | 210 | 18.94s | 4.28476e+06 | 0 | PROGRESS: | 220 | 19.82s | 3.92746e+06 | 0 | PROGRESS: | 230 | 20.67s | 4.25762e+06 | 0 | PROGRESS: | 240 | 21.58s | 4.15865e+06 | 0 | PROGRESS: | 250 | 22.46s | 4.28757e+06 | 0 | PROGRESS: | 260 | 23.42s | 3.50532e+06 | 0 | PROGRESS: | 270 | 24.34s | 4.14928e+06 | 0 | PROGRESS: | 280 | 25.21s | 4.27971e+06 | 0 | PROGRESS: | 290 | 26.09s | 3.83545e+06 | 0 | PROGRESS: | 300 | 26.97s | 4.07273e+06 | 0 | PROGRESS: | 310 | 27.89s | 4.24095e+06 | 0 | PROGRESS: | 320 | 28.78s | 4.0067e+06 | 0 | PROGRESS: | 330 | 29.67s | 4.07831e+06 | 0 | PROGRESS: | 340 | 30.58s | 4.29804e+06 | 0 | PROGRESS: | 350 | 31.52s | 3.49479e+06 | 0 | PROGRESS: | 360 | 32.45s | 
4.16546e+06 | 0 | PROGRESS: | 370 | 33.39s | 2.94196e+06 | 0 | PROGRESS: | 380 | 34.29s | 3.39923e+06 | 0 | PROGRESS: | 390 | 35.34s | 3.587e+06 | 0 | PROGRESS: | 400 | 36.25s | 4.28361e+06 | 0 | PROGRESS: | 410 | 37.20s | 4.29764e+06 | 0 | PROGRESS: | 420 | 38.10s | 4.22593e+06 | 0 | PROGRESS: | 430 | 39.02s | 3.66926e+06 | 0 | PROGRESS: | 440 | 39.91s | 4.34361e+06 | 0 | PROGRESS: | 450 | 40.80s | 4.39185e+06 | 0 | PROGRESS: | 460 | 41.79s | 3.65108e+06 | 0 | PROGRESS: | 470 | 42.76s | 3.85248e+06 | 0 | PROGRESS: | 480 | 43.71s | 3.8152e+06 | 0 | PROGRESS: | 490 | 44.69s | 2.877e+06 | 0 | PROGRESS: | 500 | 45.72s | 3.40368e+06 | 0 | PROGRESS: +-----------+---------------+----------------+-----------------+ Creating vis data...
# Show the prepared pyLDAvis data that extract_dists stored under 'vis'.
code_model['vis']
# Four highest-ranked topics of a single story, looked up by its title.
top_topics = topics_for('Game written by 14 year old passes Angry Birds as the top free iphone app').head(4)
top_topics
57 0.141026 38 0.132479 65 0.072650 70 0.072650 Name: Game written by 14 year old passes Angry Birds as the top free iphone app, dtype: float64
Without LDAvis, you would then look at the top words for those topics, something like this:
# First five rows of the term ranking for the story's top topics.
top_topic_terms(top_topics.index)[0:5]
57 | 38 | 65 | 70 | |
---|---|---|---|---|
0 | app | game | time | movie |
1 | apps | player | day | film |
2 | developer | video game | work | show |
3 | application | gaming | hour | story |
4 | user | developer | week | episode |
To look at all of the topics, you are reduced to looking at a wall of words or tables:
# First five rows of the term ranking across every topic of the full model.
all_top_terms().head(5)
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | ... | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | problem | function | world | government | code | class | number | scientist | research | system | market | browser | country | file | wikileaks | language | time | startup | data | android | health | law | energy | idea | company | ... | experience | car | design | business | video | seo | event | service | blog | music | service | photo | question | atau | home | february | shoe | feature | india | ask hn | ask hn | ask hn | ask hn | ||
1 | system | type | life | money | bit | method | point | science | study | process | company | javascript | china | package | information | code | man | company | database | device | drug | state | water | time | market | ... | developer | search | vehicle | designer | company | content | brick marketing | conference | phone | post | message | artist | customer | image | answer | anda | service | january | fashion | laptop | travel | error | show hn | website | startup |
2 | model | object | man | economy | data | object | algorithm | researcher | brain | application | sale | firefox | united state | directory | government | programming | year ago | entrepreneur | table | phone | patient | court | power | goal | stock | ... | skill | user | driver | website | money | youtube | website | presentation | call | site | sound | company | picture | time | dalam | real estate | site | style | battery | hotel | der | comment | domain | idea | |
3 | question | code | book | tax | bug | code | problem | cell | researcher | user | product | element | world | command | document | java | day | founder | query | treatment | obama | material | problem | share | ... | company | chrome | system | web design | client | medium | company | talk | communication | wordpress | email address | audio | business | camera | topic | itu | house | online | sale | phone | match | request url | hacker news | site | advice | |
4 | point | var | god | bank | program | data | result | animal | paper | data | revenue | html | government | version | assange | programmer | dog | investor | system | apple | doctor | lawyer | plant | life | investor | ... | engineer | product | model | logo | idea | flash | marketing | day | technology | reader | service | band | solution | wallpaper | community | untuk | property | tip | quality | video | sport | mit | post | domain name | web app |
5 rows × 100 columns