#!/usr/bin/env python
# coding: utf-8

# # Topic Modeling of Twitter Followers
#
# This Python 2 notebook is a companion to the blog post [Segmentation of Twitter Timelines via Topic Modeling](http://alexperrier.github.io/jekyll/update/2015/09/16/segmentation_twitter_timelines_lda_vs_lsa.html), where we explore a corpus of Twitter timelines composed of the followers of the [@alexip](https://twitter.com/alexip) account and compare the results obtained through Latent Semantic Analysis (LSA) vs Latent Dirichlet Allocation (LDA). Below are the results of LDA on a set of 245 timelines.
#
# Some of the best topics are:
#
# * [T1 Software Development](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=1&lambda=0.57&term=)
# * [T2 Data Science](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=2&lambda=0.57&term=)
# * [T3 Conference in London](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=3&lambda=0.57&term=) (open to interpretation)
# * [T4 Fantasy Football](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=4&lambda=0.57&term=) (mixed with international events)
# * [T6 RSS feeds](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=6&lambda=0.57&term=)
# * [T8 PMP and Project Management](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=8&lambda=0.5&term=)
# * [T19 Martha's Vineyard](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=19&lambda=0.57&term=)
# * [T31 Fenway](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/twitter/LDAvis_V2.ipynb#topic=31&lambda=0.57&term=)
# * [T33 Addiction and drugs](http://nbviewer.ipython.org/github/alexperrier/datatalks/blob/master/notebooks/twitter/LDAvis_V2.ipynb#topic=33&lambda=0.57&term=)
#
# etc.

# In[6]:

from gensim import corpora, models
import pyLDAvis.gensim

# Load the precomputed bag-of-words corpus, the dictionary, and the trained LDA model
corpus = corpora.MmCorpus('data/alexip_followers_v3.mm')
dictionary = corpora.Dictionary.load('data/alexip_followers_v3.dict')
lda = models.LdaModel.load('data/alexip_followers_v3_t40_p200_a001.lda')

# Prepare and render the interactive LDAvis visualization
followers_data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(followers_data)

# For best results, set the $\lambda$ parameter between 0.5 and 0.6. Lowering $\lambda$ increases the relative importance of words that are discriminant for a given topic.
#
# We use the amazing [LDAvis](https://pypi.python.org/pypi/pyLDAvis) package for this visualization. LDA was carried out with the [Gensim](https://radimrehurek.com/gensim/) package. The data is available as a [3 MB gzipped JSON file](https://github.com/alexperrier/datatalks/raw/master/twitter/data/alexip_followers.json.gz).